tokens: 48999/50000 21/513 files (page 6/21)
This is page 6 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .all-contributorsrc
├── .cursorignore
├── .devcontainer
│   ├── devcontainer.json
│   ├── post-install.sh
│   └── README.md
├── .dockerignore
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── ci-lume.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-pylume.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       └── test-validation-script.yml
├── .gitignore
├── .vscode
│   ├── docs.code-workspace
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   ├── py.code-workspace
│   └── settings.json
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── composite-agents.md
│   ├── cua-hackathon.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .gitignore
│   ├── .prettierrc
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   └── meta.json
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── computer-sdk
│   │       │   ├── cloud-vm-management.mdx
│   │       │   ├── commands.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── meta.json
│   │       │   └── sandboxed-python.mdx
│   │       ├── index.mdx
│   │       ├── libraries
│   │       │   ├── agent
│   │       │   │   └── index.mdx
│   │       │   ├── computer
│   │       │   │   └── index.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── core
│   │       │   │   └── index.mdx
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   └── som
│   │       │       ├── configuration.mdx
│   │       │       └── index.mdx
│   │       ├── meta.json
│   │       ├── quickstart-cli.mdx
│   │       ├── quickstart-devs.mdx
│   │       └── telemetry.mdx
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   └── llms.txt
│   │   │       └── route.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── iou.tsx
│   │   │   └── mermaid.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   └── mdx-components.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── cloud_api_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── .prettierrc
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── gemini.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   └── uitars.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── types.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   └── test_connection.py
│   │   ├── core
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── mcp-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── CONCURRENT_SESSIONS.md
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── server.py
│   │   │   │   └── session_manager.py
│   │   │   ├── pdm.lock
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── scripts
│   │   │       ├── install_mcp_server.sh
│   │   │       └── start_mcp_server.sh
│   │   ├── pylume
│   │   │   ├── __init__.py
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── pylume
│   │   │   │   ├── __init__.py
│   │   │   │   ├── client.py
│   │   │   │   ├── exceptions.py
│   │   │   │   ├── lume
│   │   │   │   ├── models.py
│   │   │   │   ├── pylume.py
│   │   │   │   └── server.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   └── som
│   │       ├── .bumpversion.cfg
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           └── test_omniparser.py
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── biome.json
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Dockerfile
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── pylume_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── pdm.lock
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── samples
│   └── community
│       ├── global-online
│       │   └── README.md
│       └── hack-the-north
│           └── README.md
├── scripts
│   ├── build-uv.sh
│   ├── build.ps1
│   ├── build.sh
│   ├── cleanup.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   └── run-docker-dev.sh
└── tests
    ├── pytest.ini
    ├── shell_cmd.py
    ├── test_files.py
    ├── test_mcp_server_session_management.py
    ├── test_mcp_server_streaming.py
    ├── test_shell_bash.py
    ├── test_telemetry.py
    ├── test_venv.py
    └── test_watchdog.py
```

# Files

--------------------------------------------------------------------------------
/scripts/build.ps1:
--------------------------------------------------------------------------------

```
  1 | # PowerShell Build Script for CUA
  2 | # Exit on error
  3 | $ErrorActionPreference = "Stop"
  4 | 
  5 | # Colors for output
  6 | $RED = "Red"
  7 | $GREEN = "Green"
  8 | $BLUE = "Blue"
  9 | 
 10 | # Function to print step information
 11 | function Print-Step {
 12 |     param([string]$Message)
 13 |     Write-Host "==> $Message" -ForegroundColor $BLUE
 14 | }
 15 | 
 16 | # Function to print success message
 17 | function Print-Success {
 18 |     param([string]$Message)
 19 |     Write-Host "==> Success: $Message" -ForegroundColor $GREEN
 20 | }
 21 | 
 22 | # Function to print error message
 23 | function Print-Error {
 24 |     param([string]$Message)
 25 |     Write-Host "==> Error: $Message" -ForegroundColor $RED
 26 | }
 27 | 
 28 | # Get the script's directory and project root
 29 | $SCRIPT_DIR = Split-Path -Parent $MyInvocation.MyCommand.Path
 30 | $PROJECT_ROOT = Split-Path -Parent $SCRIPT_DIR
 31 | 
 32 | # Change to project root
 33 | Set-Location $PROJECT_ROOT
 34 | 
 35 | # Load environment variables from .env.local
 36 | if (Test-Path ".env.local") {
 37 |     Print-Step "Loading environment variables from .env.local..."
 38 |     Get-Content ".env.local" | ForEach-Object {
 39 |         if ($_ -match "^([^#][^=]*?)=(.*)$") {
 40 |             [Environment]::SetEnvironmentVariable($matches[1], $matches[2], "Process")
 41 |         }
 42 |     }
 43 |     Print-Success "Environment variables loaded"
 44 | } else {
 45 |     Print-Error ".env.local file not found"
 46 |     exit 1
 47 | }
 48 | 
 49 | # Check if conda is available
 50 | try {
 51 |     conda --version | Out-Null
 52 |     Print-Success "Conda is available"
 53 | } catch {
 54 |     Print-Error "Conda is not available. Please install Anaconda or Miniconda first."
 55 |     exit 1
 56 | }
 57 | 
 58 | # Create or update conda environment
 59 | Print-Step "Creating/updating conda environment 'cua' with Python 3.12..."
 60 | try {
 61 |     # Check if environment exists
 62 |     $envExists = conda env list | Select-String "^cua\s"
 63 |     if ($envExists) {
 64 |         Print-Step "Environment 'cua' already exists. Updating..."
 65 |         conda env update -n cua -f environment.yml --prune
 66 |     } else {
 67 |         Print-Step "Creating new environment 'cua'..."
 68 |         conda create -n cua python=3.12 -y
 69 |     }
 70 |     Print-Success "Conda environment 'cua' ready"
 71 | } catch {
 72 |     Print-Error "Failed to create/update conda environment"
 73 |     exit 1
 74 | }
 75 | 
 76 | # Activate conda environment
 77 | Print-Step "Activating conda environment 'cua'..."
 78 | try {
 79 |     conda activate cua
 80 |     Print-Success "Environment activated"
 81 | } catch {
 82 |     Print-Error "Failed to activate conda environment 'cua'"
 83 |     Print-Step "Please run: conda activate cua"
 84 |     Print-Step "Then re-run this script"
 85 |     exit 1
 86 | }
 87 | 
 88 | # Clean up caches and build artifacts
 89 | Print-Step "Cleaning up caches and build artifacts..."
 90 | Get-ChildItem -Path . -Recurse -Directory -Name "__pycache__" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force }
 91 | Get-ChildItem -Path . -Recurse -Directory -Name ".pytest_cache" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force }
 92 | Get-ChildItem -Path . -Recurse -Directory -Name "dist" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force }
 93 | Get-ChildItem -Path . -Recurse -Directory -Name "*.egg-info" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force }
 94 | 
 95 | # Function to install a package and its dependencies
 96 | function Install-Package {
 97 |     param(
 98 |         [string]$PackageDir,
 99 |         [string]$PackageName,
100 |         [string]$Extras = ""
101 |     )
102 |     
103 |     Print-Step "Installing $PackageName..."
104 |     Set-Location $PackageDir
105 |     
106 |     if (Test-Path "pyproject.toml") {
107 |         if ($Extras) {
108 |             pip install -e ".[$Extras]"
109 |         } else {
110 |             pip install -e .
111 |         }
112 |     } else {
113 |         Print-Error "No pyproject.toml found in $PackageDir"
114 |         Set-Location $PROJECT_ROOT
115 |         return $false
116 |     }
117 |     
118 |     Set-Location $PROJECT_ROOT
119 |     return $true
120 | }
121 | 
122 | # Install packages in order of dependency
123 | Print-Step "Installing packages in development mode..."
124 | 
125 | # Install core first (base package with telemetry support)
126 | if (-not (Install-Package "libs/python/core" "core")) { exit 1 }
127 | 
128 | # Install pylume (base dependency)
129 | if (-not (Install-Package "libs/python/pylume" "pylume")) { exit 1 }
130 | 
131 | # Install computer with all its dependencies and extras
132 | if (-not (Install-Package "libs/python/computer" "computer" "all")) { exit 1 }
133 | 
134 | # Install omniparser
135 | if (-not (Install-Package "libs/python/som" "som")) { exit 1 }
136 | 
137 | # Install agent with all its dependencies and extras
138 | if (-not (Install-Package "libs/python/agent" "agent" "all")) { exit 1 }
139 | 
140 | # Install computer-server
141 | if (-not (Install-Package "libs/python/computer-server" "computer-server")) { exit 1 }
142 | 
143 | # Install mcp-server
144 | if (-not (Install-Package "libs/python/mcp-server" "mcp-server")) { exit 1 }
145 | 
146 | # Install development tools from root project
147 | Print-Step "Installing development dependencies..."
148 | pip install -e ".[dev,test,docs]"
149 | 
150 | # Create a .env file for VS Code to use the virtual environment
151 | Print-Step "Creating .env file for VS Code..."
152 | $pythonPath = "$PROJECT_ROOT/libs/python/core;$PROJECT_ROOT/libs/python/computer;$PROJECT_ROOT/libs/python/agent;$PROJECT_ROOT/libs/python/som;$PROJECT_ROOT/libs/python/pylume;$PROJECT_ROOT/libs/python/computer-server;$PROJECT_ROOT/libs/python/mcp-server"
153 | "PYTHONPATH=$pythonPath" | Out-File -FilePath ".env" -Encoding UTF8
154 | 
155 | Print-Success "All packages installed successfully!"
156 | Print-Step "Your conda environment 'cua' is ready. To activate it:"
157 | Write-Host "  conda activate cua" -ForegroundColor Yellow
158 | 
```

--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/integrations/hud.mdx:
--------------------------------------------------------------------------------

```markdown
  1 | ---
  2 | title: HUD Evals
  3 | description: Use ComputerAgent with HUD for benchmarking and evaluation
  4 | ---
  5 | 
  6 | <Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.</Callout>
  7 | 
  8 | The HUD integration allows an agent to be benchmarked with the [HUD framework](https://www.hud.so/): the agent controls a computer inside HUD, where tests are run to evaluate the success of each task.
  9 | 
 10 | ## Installation
 11 | 
 12 | First, install the required package:
 13 | 
 14 | ```bash
 15 | pip install "cua-agent[hud]"
 16 | ## or install hud-python directly
 17 | # pip install hud-python==0.4.12
 18 | ```
 19 | 
 20 | ## Environment Variables
 21 | 
 22 | Before running any evaluations, you’ll need to set up your environment variables for HUD and your model providers:
 23 | 
 24 | ```bash
 25 | # HUD access
 26 | export HUD_API_KEY="your_hud_api_key"
 27 | 
 28 | # Model provider keys (at least one required)
 29 | export OPENAI_API_KEY="your_openai_key"
 30 | export ANTHROPIC_API_KEY="your_anthropic_key"
 31 | ```
 32 | 
 33 | ## Running a Single Task
 34 | 
 35 | You can run a single task from a HUD dataset for quick verification.
 36 | 
 37 | ### Example
 38 | 
 39 | ```python
 40 | from agent.integrations.hud import run_single_task
 41 | 
 42 | await run_single_task(
 43 |     dataset="hud-evals/OSWorld-Verified",   # or another HUD dataset
 44 |     model="openai/computer-use-preview+openai/gpt-5-nano",  # any supported model string
 45 |     task_id=155,  # e.g., reopen last closed tab
 46 | )
 47 | ```
 48 | 
 49 | ### Parameters
 50 | 
 51 | - `task_id` (`int`): Default: `0`
 52 |   Index of the task to run from the dataset.
 53 | 
 54 | ## Running a Full Dataset
 55 | 
 56 | To benchmark your agent at scale, you can run an entire dataset (or a subset) in parallel.
 57 | 
 58 | ### Example
 59 | 
 60 | ```python
 61 | from agent.integrations.hud import run_full_dataset
 62 | 
 63 | results = await run_full_dataset(
 64 |     dataset="hud-evals/OSWorld-Verified",   # can also pass a Dataset or list[dict]
 65 |     model="openai/computer-use-preview",
 66 |     split="train[:3]",           # try a few tasks to start
 67 |     max_concurrent=20,            # tune to your infra
 68 |     max_steps=50                  # safety cap per task
 69 | )
 70 | ```
 71 | 
 72 | ### Parameters
 73 | 
 74 | - `job_name` (`str` | `None`):
 75 |   Optional human-readable name for the evaluation job (shows up in HUD UI).
 76 | - `max_concurrent` (`int`): Default: `30`
 77 |   Number of tasks to run in parallel. Scale this based on your infra.
 78 | - `max_steps` (`int`): Default: `50`
 79 |   Safety cap on steps per task to prevent infinite loops.
 80 | - `split` (`str`): Default: `"train"`
 81 |   Dataset split or subset to run. Uses the [Hugging Face split format](https://huggingface.co/docs/datasets/v1.11.0/splits.html), e.g., `"train[:10]"` for the first 10 tasks.
 82 | 
 83 | ## Additional Parameters
 84 | 
 85 | Both single-task and full-dataset runs share a common set of configuration options. These let you fine-tune how the evaluation runs.
 86 | 
 87 | - `dataset` (`str` | `Dataset` | `list[dict]`): **Required**
 88 |   HUD dataset name (e.g. `"hud-evals/OSWorld-Verified"`), a loaded `Dataset`, or a list of tasks.
 89 | - `model` (`str`): Default: `"computer-use-preview"`
 90 |   Model string, e.g. `"openai/computer-use-preview+openai/gpt-5-nano"`. Supports composition with `+` (planning + grounding).
 91 | - `allowed_tools` (`list[str]`): Default: `["openai_computer"]`
 92 |   Restrict which tools the agent may use.
 93 | - `tools` (`list[Any]`):
 94 |   Extra tool configs to inject.
 95 | - `custom_loop` (`Callable`):
 96 |   Optional custom agent loop function. If provided, overrides automatic loop selection.
 97 | - `only_n_most_recent_images` (`int`): Default: `5` for full dataset, `None` for single task.
 98 |   Retain only the last N screenshots in memory.
 99 | - `callbacks` (`list[Any]`):
100 |   Hook functions for logging, telemetry, or side effects.
101 | - `verbosity` (`int`):
102 |   Logging level. Set `2` for debugging every call/action.
103 | - `trajectory_dir` (`str` | `dict`):
104 |   Save local copies of trajectories for replay/analysis.
105 | - `max_retries` (`int`): Default: `3`
106 |   Number of retries for failed model/tool calls.
107 | - `screenshot_delay` (`float` | `int`): Default: `0.5`
108 |   Delay (seconds) between screenshots to avoid race conditions.
109 | - `use_prompt_caching` (`bool`): Default: `False`
110 |   Cache repeated prompts to reduce API calls.
111 | - `max_trajectory_budget` (`float` | `dict`):
112 |   Limit on trajectory size/budget (e.g., tokens, steps).
113 | - `telemetry_enabled` (`bool`): Default: `True`
114 |   Whether to send telemetry/traces to HUD.
115 | - `**kwargs` (`any`):
116 |   Any additional keyword arguments are passed through to the agent loop or model provider.
117 | 
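An illustrative sketch combining several of the options above in a single `run_full_dataset` call (all values are arbitrary placeholders):

```python
from agent.integrations.hud import run_full_dataset

results = await run_full_dataset(
    dataset="hud-evals/OSWorld-Verified",
    model="openai/computer-use-preview+openai/gpt-5-nano",
    job_name="osworld-smoke-test",     # label shown in the HUD UI
    split="train[:10]",                # first 10 tasks only
    max_concurrent=10,
    max_steps=50,
    only_n_most_recent_images=5,       # bound screenshot memory
    verbosity=2,                       # log every call/action
    trajectory_dir="./trajectories",   # keep local copies for replay
    use_prompt_caching=True,
    max_retries=3,
)
```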
118 | ## Available Benchmarks
119 | 
120 | HUD provides multiple benchmark datasets for realistic evaluation.
121 | 
122 | 1. **[OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified)** – Benchmark on 369+ real-world desktop tasks across Chrome, LibreOffice, GIMP, VS Code, etc.
123 |    *Best for*: evaluating full computer-use agents in realistic environments.
124 |    *Verified variant*: fixes 300+ issues from earlier versions for reliability.
125 | 
126 | **Coming soon:** SheetBench (spreadsheet automation) and other specialized HUD datasets.
127 | 
128 | See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments.
129 | 
130 | ## Tips
131 | 
132 | * **Debugging:** set `verbosity=2` to see every model call and tool action.
133 | * **Performance:** lower `screenshot_delay` for faster runs; raise it if you see race conditions.
134 | * **Safety:** always set `max_steps` (defaults to 50) to prevent runaway loops.
135 | * **Custom tools:** pass extra `tools=[...]` into the agent config if you need tools beyond `openai_computer`.
```

--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/message-format.mdx:
--------------------------------------------------------------------------------

```markdown
  1 | ---
  2 | title: Message Format
  3 | ---
  4 | 
  5 | This page documents the Python message and response schema used by the Agent SDK.
  6 | It mirrors the structure shown in Chat History and provides precise type definitions you can target in your own code.
  7 | 
  8 | All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module.
  9 | 
 10 | ## Response
 11 | 
 12 | The agent yields response chunks as an async generator of objects with `output` and `usage`.
 13 | 
 14 | ```python
 15 | from typing import List, TypedDict
 16 | 
 17 | class Usage(TypedDict, total=False):
 18 |     prompt_tokens: int
 19 |     completion_tokens: int
 20 |     total_tokens: int
 21 |     response_cost: float  # USD cost if available
 22 | 
 23 | class AgentResponse(TypedDict):
 24 |     output: List["AgentMessage"]
 25 |     usage: Usage
 26 | ```
 27 | 
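As a minimal consumption sketch (assuming `agent.run(...)` is the async generator described above; the entry-point name is illustrative):

```python
total_tokens = 0

async for chunk in agent.run("Open the settings app"):  # assumed entry point
    for message in chunk["output"]:
        # Print any assistant text as it arrives
        if message.get("type") == "message" and message.get("role") == "assistant":
            for item in message["content"]:
                if item["type"] == "output_text":
                    print(item["text"])
    total_tokens += chunk["usage"].get("total_tokens", 0)

print(f"total tokens: {total_tokens}")
```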
 28 | ## Messages
 29 | 
 30 | Agent messages represent the state of the conversation and the agent's actions.
 31 | 
 32 | ```python
 33 | from typing import List, Literal, Optional, TypedDict, Union
 34 | 
 35 | # Union of all message variants
 36 | AgentMessage = Union[
 37 |     "UserMessage",
 38 |     "AssistantMessage",
 39 |     "ReasoningMessage",
 40 |     "ComputerCallMessage",
 41 |     "ComputerCallOutputMessage",
 42 |     "FunctionCallMessage",
 43 |     "FunctionCallOutputMessage",
 44 | ]
 45 | 
 46 | # Input message (role: user/system/developer)
 47 | class UserMessage(TypedDict, total=False):
 48 |     type: Literal["message"]  # optional for user input
 49 |     role: Literal["user", "system", "developer"]
 50 |     content: Union[str, List["InputContent"]]
 51 | 
 52 | # Output message (assistant text)
 53 | class AssistantMessage(TypedDict):
 54 |     type: Literal["message"]
 55 |     role: Literal["assistant"]
 56 |     content: List["OutputContent"]
 57 | 
 58 | # Output reasoning/thinking message
 59 | class ReasoningMessage(TypedDict):
 60 |     type: Literal["reasoning"]
 61 |     summary: List["SummaryContent"]
 62 | 
 63 | # Output computer action call (agent intends to act)
 64 | class ComputerCallMessage(TypedDict):
 65 |     type: Literal["computer_call"]
 66 |     call_id: str
 67 |     status: Literal["completed", "failed", "pending"]
 68 |     action: "ComputerAction"
 69 | 
 70 | # Output computer action result (always a screenshot)
 71 | class ComputerCallOutputMessage(TypedDict):
 72 |     type: Literal["computer_call_output"]
 73 |     call_id: str
 74 |     output: "ComputerResultContent"
 75 | 
 76 | # Output function call (agent calls a Python tool)
 77 | class FunctionCallMessage(TypedDict):
 78 |     type: Literal["function_call"]
 79 |     call_id: str
 80 |     status: Literal["completed", "failed", "pending"]
 81 |     name: str
 82 |     arguments: str  # JSON-serialized kwargs
 83 | 
 84 | # Output function call result (text)
 85 | class FunctionCallOutputMessage(TypedDict):
 86 |     type: Literal["function_call_output"]
 87 |     call_id: str
 88 |     output: str
 89 | ```
 90 | 
 91 | ## Message Content
 92 | 
 93 | These content items appear inside `content` arrays for the message types above.
 94 | 
 95 | ```python
 96 | # Input content kinds
 97 | class InputContent(TypedDict):
 98 |     type: Literal["input_image", "input_text"]
 99 |     text: Optional[str]
100 |     image_url: Optional[str]  # e.g., data URL
101 | 
102 | # Assistant output content
103 | class OutputContent(TypedDict):
104 |     type: Literal["output_text"]
105 |     text: str
106 | 
107 | # Reasoning/summary output content
108 | class SummaryContent(TypedDict):
109 |     type: Literal["summary_text"]
110 |     text: str
111 | 
112 | # Computer call outputs (screenshots)
113 | class ComputerResultContent(TypedDict):
114 |     type: Literal["computer_screenshot", "input_image"]
115 |     image_url: str  # data URL (e.g., "data:image/png;base64,....")
116 | ```
117 | 
118 | ## Actions
119 | 
120 | Computer actions represent concrete operations the agent will perform on the computer.
121 | 
122 | Two broad families exist depending on the provider: OpenAI-style and Anthropic-style.
123 | 
124 | ```python
125 | # Union of all supported computer actions
126 | ComputerAction = Union[
127 |     "ClickAction",
128 |     "DoubleClickAction",
129 |     "DragAction",
130 |     "KeyPressAction",
131 |     "MoveAction",
132 |     "ScreenshotAction",
133 |     "ScrollAction",
134 |     "TypeAction",
135 |     "WaitAction",
136 |     # Anthropic variants
137 |     "LeftMouseDownAction",
138 |     "LeftMouseUpAction",
139 | ]
140 | 
141 | # OpenAI Computer Actions
142 | class ClickAction(TypedDict):
143 |     type: Literal["click"]
144 |     button: Literal["left", "right", "wheel", "back", "forward"]
145 |     x: int
146 |     y: int
147 | 
148 | class DoubleClickAction(TypedDict, total=False):
149 |     type: Literal["double_click"]
150 |     button: Literal["left", "right", "wheel", "back", "forward"]
151 |     x: int
152 |     y: int
153 | 
154 | class DragAction(TypedDict, total=False):
155 |     type: Literal["drag"]
156 |     button: Literal["left", "right", "wheel", "back", "forward"]
157 |     path: List[tuple[int, int]]  # [(x1, y1), (x2, y2), ...]
158 | 
159 | class KeyPressAction(TypedDict):
160 |     type: Literal["keypress"]
161 |     keys: List[str]  # e.g., ["ctrl", "a"]
162 | 
163 | class MoveAction(TypedDict):
164 |     type: Literal["move"]
165 |     x: int
166 |     y: int
167 | 
168 | class ScreenshotAction(TypedDict):
169 |     type: Literal["screenshot"]
170 | 
171 | class ScrollAction(TypedDict):
172 |     type: Literal["scroll"]
173 |     scroll_x: int
174 |     scroll_y: int
175 |     x: int
176 |     y: int
177 | 
178 | class TypeAction(TypedDict):
179 |     type: Literal["type"]
180 |     text: str
181 | 
182 | class WaitAction(TypedDict):
183 |     type: Literal["wait"]
184 | 
185 | # Anthropic Computer Actions
186 | class LeftMouseDownAction(TypedDict):
187 |     type: Literal["left_mouse_down"]
188 |     x: int
189 |     y: int
190 | 
191 | class LeftMouseUpAction(TypedDict):
192 |     type: Literal["left_mouse_up"]
193 |     x: int
194 |     y: int
195 | ```
196 | 
197 | ## Notes
198 | 
199 | - The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility.
200 | - Computer action outputs are screenshots as data URLs. For security and storage, some serializers may redact or omit large fields in persisted metadata.
201 | - The message flow typically alternates between reasoning, actions, screenshots, and concluding assistant text. See [Chat History](./chat-history) for a step-by-step example.
202 | 
```

--------------------------------------------------------------------------------
/docs/content/docs/computer-sdk/cloud-vm-management.mdx:
--------------------------------------------------------------------------------

```markdown
  1 | ---
  2 | title: Cloud VM Management
  3 | description: Manage your Cua Cloud sandboxes (VMs) via Python SDK or HTTP API
  4 | ---
  5 | 
  6 | import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
  7 | 
  8 | 
  9 | Using the Cua Cloud API, you can manage your Cua Cloud sandboxes (VMs) with Python or HTTP (curl).
 10 | 
 11 | All examples require a CUA API key. You can obtain one from the [Dashboard](https://www.cua.ai/dashboard/keys).
 12 | 
 13 | ---
 14 | 
 15 | ## List VMs
 16 | 
 17 | <Tabs items={["Python", "curl"]}>
 18 |   <Tab value="Python">
 19 | 
 20 |   ```python
 21 |   import os
 22 |   import asyncio
 23 |   from computer.providers.cloud.provider import CloudProvider
 24 | 
 25 |   async def main():
 26 |       api_key = os.getenv("CUA_API_KEY") or "your-api-key"
 27 |       # Optional: point to a different API base
 28 |       # os.environ["CUA_API_BASE"] = "https://api.cua.ai"
 29 | 
 30 |       provider = CloudProvider(api_key=api_key, verbose=False)
 31 |       async with provider:
 32 |           vms = await provider.list_vms()
 33 |           for vm in vms:
 34 |               print({
 35 |                   "name": vm["name"],
 36 |                   "status": vm["status"],
 37 |                   "api_url": vm.get("api_url"),
 38 |                   "vnc_url": vm.get("vnc_url"),
 39 |               })
 40 | 
 41 |   if __name__ == "__main__":
 42 |       asyncio.run(main())
 43 |   ```
 44 | 
 45 |   </Tab>
 46 |   <Tab value="curl">
 47 | 
 48 |   ```bash
 49 |   curl -H "Authorization: Bearer $CUA_API_KEY" \
 50 |        "https://api.cua.ai/v1/vms"
 51 |   ```
 52 | 
 53 |   Responses:
 54 |   - 200: Array of minimal VM objects with fields `{ name, password, status }`
 55 |   - 401: Unauthorized (missing/invalid API key)
 56 | 
 57 |   ```json
 58 |   [
 59 |     {
 60 |       "name": "s-windows-x4snp46ebf",
 61 |       "password": "49b8daa3",
 62 |       "status": "running"
 63 |     }
 64 |   ]
 65 |   ```
 66 | 
 67 |   Status values:
 68 | 
 69 |   - `pending`: VM deployment in progress
 70 |   - `running`: VM is active and accessible
 71 |   - `stopped`: VM is stopped but not terminated
 72 |   - `terminated`: VM has been permanently destroyed
 73 |   - `failed`: VM deployment or operation failed
 74 | 
 75 |   </Tab>
 76 | </Tabs>
 77 | 
 78 | ---
 79 | 
 80 | ## Start a VM
 81 | Provide the VM name you want to start.
 82 | 
 83 | <Tabs items={["Python", "curl"]}>
 84 |   <Tab value="Python">
 85 | 
 86 |   ```python
 87 |   import os
 88 |   import asyncio
 89 |   from computer.providers.cloud.provider import CloudProvider
 90 | 
 91 |   async def main():
 92 |       api_key = os.getenv("CUA_API_KEY") or "your-api-key"
 93 |       name = "my-vm-name"  # e.g., "m-linux-96lcxd2c2k"
 94 | 
 95 |       provider = CloudProvider(api_key=api_key)
 96 |       async with provider:
 97 |           resp = await provider.run_vm(name)
 98 |           print(resp)  # { "name": name, "status": "starting" }
 99 | 
100 |   if __name__ == "__main__":
101 |       asyncio.run(main())
102 |   ```
103 | 
104 |   </Tab>
105 |   <Tab value="curl">
106 | 
107 |   ```bash
108 |   curl -X POST \
109 |        -H "Authorization: Bearer $CUA_API_KEY" \
110 |        "https://api.cua.ai/v1/vms/my-vm-name/start" -i
111 |   ```
112 | 
113 |   Responses:
114 |   - 204: No Content (start accepted)
115 |   - 401: Unauthorized (missing/invalid API key)
116 |   - 404: VM not found or not owned by the user
117 | 
118 |   ```text
119 |   HTTP/1.1 204 No Content
120 |   ```
121 | 
122 |   </Tab>
123 | </Tabs>
124 | 
125 | ---
126 | 
127 | ## Stop a VM
128 | Stops the VM asynchronously.
129 | 
130 | <Tabs items={["Python", "curl"]}>
131 |   <Tab value="Python">
132 | 
133 |   ```python
134 |   import os
135 |   import asyncio
136 |   from computer.providers.cloud.provider import CloudProvider
137 | 
138 |   async def main():
139 |       api_key = os.getenv("CUA_API_KEY") or "your-api-key"
140 |       name = "my-vm-name"
141 | 
142 |       provider = CloudProvider(api_key=api_key)
143 |       async with provider:
144 |           resp = await provider.stop_vm(name)
145 |           print(resp)  # { "name": name, "status": "stopping" }
146 | 
147 |   if __name__ == "__main__":
148 |       asyncio.run(main())
149 |   ```
150 | 
151 |   </Tab>
152 |   <Tab value="curl">
153 | 
154 |   ```bash
155 |   curl -X POST \
156 |        -H "Authorization: Bearer $CUA_API_KEY" \
157 |        "https://api.cua.ai/v1/vms/my-vm-name/stop"
158 |   ```
159 | 
160 |   Responses:
161 |   - 202: Accepted with `{ "status": "stopping" }`
162 |   - 401: Unauthorized (missing/invalid API key)
163 |   - 404: VM not found or not owned by the user
164 | 
165 |   ```json
166 |   { "status": "stopping" }
167 |   ```
168 | 
169 |   </Tab>
170 | </Tabs>
171 | 
172 | ---
173 | 
174 | ## Restart a VM
175 | Restarts the VM asynchronously.
176 | 
177 | <Tabs items={["Python", "curl"]}>
178 |   <Tab value="Python">
179 | 
180 |   ```python
181 |   import os
182 |   import asyncio
183 |   from computer.providers.cloud.provider import CloudProvider
184 | 
185 |   async def main():
186 |       api_key = os.getenv("CUA_API_KEY") or "your-api-key"
187 |       name = "my-vm-name"
188 | 
189 |       provider = CloudProvider(api_key=api_key)
190 |       async with provider:
191 |           resp = await provider.restart_vm(name)
192 |           print(resp)  # { "name": name, "status": "restarting" }
193 | 
194 |   if __name__ == "__main__":
195 |       asyncio.run(main())
196 |   ```
197 | 
198 |   </Tab>
199 |   <Tab value="curl">
200 | 
201 |   ```bash
202 |   curl -X POST \
203 |        -H "Authorization: Bearer $CUA_API_KEY" \
204 |        "https://api.cua.ai/v1/vms/my-vm-name/restart"
205 |   ```
206 | 
207 |   Responses:
208 |   - 202: Accepted with `{ "status": "restarting" }`
209 |   - 401: Unauthorized (missing/invalid API key)
210 |   - 404: VM not found or not owned by the user
211 | 
212 |   ```json
213 |   { "status": "restarting" }
214 |   ```
215 | 
216 |   </Tab>
217 | </Tabs>
218 | 
219 | ---
220 | 
221 | ## Query a VM by name
222 | Query the computer-server running on the VM. Useful for checking details like status or OS type.
223 | 
224 | <Tabs items={["Python", "curl"]}>
225 |   <Tab value="Python">
226 | 
227 |   ```python
228 |   import os
229 |   import asyncio
230 |   from computer.providers.cloud.provider import CloudProvider
231 | 
232 |   async def main():
233 |       api_key = os.getenv("CUA_API_KEY") or "your-api-key"
234 |       name = "my-vm-name"
235 | 
236 |       provider = CloudProvider(api_key=api_key)
237 |       async with provider:
238 |           info = await provider.get_vm(name)
239 |           print(info)
240 | 
241 |   if __name__ == "__main__":
242 |       asyncio.run(main())
243 |   ```
244 | 
245 |   </Tab>
246 |   <Tab value="curl">
247 | 
248 |   ```bash
249 |   curl "https://my-vm-name.containers.cloud.cua.ai:8443/status"
250 |   ```
251 | 
252 |   Responses:
253 |   - 200: Server available
254 | 
255 |   ```json
256 |   { "status": "ok", "os_type": "linux", "features": ["agent"] }
257 |   ```
258 | 
259 |   </Tab>
260 | </Tabs>
261 | 
```

--------------------------------------------------------------------------------
/libs/typescript/agent/src/client.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import { Peer } from "peerjs";
  2 | import type {
  3 |   AgentRequest,
  4 |   AgentResponse,
  5 |   ConnectionType,
  6 |   AgentClientOptions,
  7 | } from "./types";
  8 | 
  9 | export class AgentClient {
 10 |   private url: string;
 11 |   private connectionType: ConnectionType;
 12 |   private options: AgentClientOptions;
 13 |   private peer?: Peer;
 14 |   private connection?: any;
 15 | 
 16 |   constructor(url: string, options: AgentClientOptions = {}) {
 17 |     this.url = url;
 18 |     this.options = {
 19 |       timeout: 30000,
 20 |       retries: 3,
 21 |       ...options,
 22 |     };
 23 | 
 24 |     // Determine connection type from URL
 25 |     if (url.startsWith("http://") || url.startsWith("https://")) {
 26 |       this.connectionType = url.startsWith("https://") ? "https" : "http";
 27 |     } else if (url.startsWith("peer://")) {
 28 |       this.connectionType = "peer";
 29 |     } else {
 30 |       throw new Error(
 31 |         "Invalid URL format. Must start with http://, https://, or peer://"
 32 |       );
 33 |     }
 34 |   }
 35 | 
 36 |   // Main responses API matching the desired usage pattern
 37 |   public responses = {
 38 |     create: async (request: AgentRequest): Promise<AgentResponse> => {
 39 |       return this.sendRequest(request);
 40 |     },
 41 |   };
 42 | 
 43 |   private async sendRequest(request: AgentRequest): Promise<AgentResponse> {
 44 |     switch (this.connectionType) {
 45 |       case "http":
 46 |       case "https":
 47 |         return this.sendHttpRequest(request);
 48 |       case "peer":
 49 |         return this.sendPeerRequest(request);
 50 |       default:
 51 |         throw new Error(`Unsupported connection type: ${this.connectionType}`);
 52 |     }
 53 |   }
 54 | 
 55 |   private async sendHttpRequest(request: AgentRequest): Promise<AgentResponse> {
 56 |     const controller = new AbortController();
 57 |     const timeoutId = setTimeout(
 58 |       () => controller.abort(),
 59 |       this.options.timeout
 60 |     );
 61 | 
 62 |     try {
 63 |       const headers: Record<string, string> = {
 64 |         "Content-Type": "application/json",
 65 |       };
 66 |       if (this.options.apiKey) {
 67 |         headers["X-API-Key"] = this.options.apiKey;
 68 |       }
 69 | 
 70 |       const response = await fetch(`${this.url}/responses`, {
 71 |         method: "POST",
 72 |         headers,
 73 |         body: JSON.stringify(request),
 74 |         signal: controller.signal,
 75 |       });
 76 | 
 77 |       clearTimeout(timeoutId);
 78 | 
 79 |       if (!response.ok) {
 80 |         throw new Error(`HTTP error! status: ${response.status}`);
 81 |       }
 82 | 
 83 |       const data = await response.json();
 84 |       return data as AgentResponse;
 85 |     } catch (error) {
 86 |       clearTimeout(timeoutId);
 87 |       if (error instanceof Error) {
 88 |         throw new Error(`Failed to send HTTP request: ${error.message}`);
 89 |       }
 90 |       throw error;
 91 |     }
 92 |   }
 93 | 
 94 |   private async sendPeerRequest(request: AgentRequest): Promise<AgentResponse> {
 95 |     // Extract peer ID from peer:// URL
 96 |     const peerId = this.url.replace("peer://", "");
 97 | 
 98 |     if (!this.peer) {
 99 |       // Initialize peer connection with default options as requested
100 |       this.peer = new Peer();
101 | 
102 |       return new Promise<AgentResponse>((resolve, reject) => {
103 |         const timeout = setTimeout(() => {
104 |           reject(new Error("Peer connection timeout"));
105 |         }, this.options.timeout);
106 | 
107 |         this.peer!.on("open", () => {
108 |           // Connect to the target peer
109 |           this.connection = this.peer!.connect(peerId);
110 | 
111 |           this.connection.on("open", () => {
112 |             // Send the request
113 |             this.connection!.send(JSON.stringify(request));
114 |           });
115 | 
116 |           this.connection.on("data", (data: any) => {
117 |             clearTimeout(timeout);
118 |             try {
119 |               const response =
120 |                 typeof data === "string" ? JSON.parse(data) : data;
121 |               resolve(response as AgentResponse);
122 |             } catch (error) {
123 |               reject(new Error("Failed to parse peer response"));
124 |             }
125 |           });
126 | 
127 |           this.connection.on("error", (error: any) => {
128 |             clearTimeout(timeout);
129 |             reject(new Error(`Peer connection error: ${error}`));
130 |           });
131 |         });
132 | 
133 |         this.peer!.on("error", (error: any) => {
134 |           clearTimeout(timeout);
135 |           reject(new Error(`Peer error: ${error}`));
136 |         });
137 |       });
138 |     } else {
139 |       // Reuse existing connection
140 |       return new Promise<AgentResponse>((resolve, reject) => {
141 |         const timeout = setTimeout(() => {
142 |           reject(new Error("Peer request timeout"));
143 |         }, this.options.timeout);
144 | 
145 |         if (this.connection && this.connection.open) {
146 |           this.connection.send(JSON.stringify(request));
147 | 
148 |           const handleData = (data: any) => {
149 |             clearTimeout(timeout);
150 |             this.connection!.off("data", handleData);
151 |             try {
152 |               const response =
153 |                 typeof data === "string" ? JSON.parse(data) : data;
154 |               resolve(response as AgentResponse);
155 |             } catch (error) {
156 |               reject(new Error("Failed to parse peer response"));
157 |             }
158 |           };
159 | 
160 |           this.connection.on("data", handleData);
161 |         } else {
162 |           clearTimeout(timeout);
163 |           reject(new Error("Peer connection not available"));
164 |         }
165 |       });
166 |     }
167 |   }
168 | 
169 |   // Health check method
170 |   async health(): Promise<{ status: string }> {
171 |     if (this.connectionType === "peer") {
172 |       return { status: this.peer?.open ? "connected" : "disconnected" };
173 |     }
174 | 
175 |     try {
176 |       const response = await fetch(`${this.url}/health`);
177 |       if (response.ok) {
178 |         return { status: "healthy" };
179 |       }
180 |       return { status: "unhealthy" };
181 |     } catch {
182 |       return { status: "unreachable" };
183 |     }
184 |   }
185 | 
186 |   // Clean up resources
187 |   async disconnect(): Promise<void> {
188 |     if (this.connection) {
189 |       this.connection.close();
190 |       this.connection = undefined;
191 |     }
192 |     if (this.peer) {
193 |       this.peer.destroy();
194 |       this.peer = undefined;
195 |     }
196 |   }
197 | }
198 | 
```

--------------------------------------------------------------------------------
/scripts/build-uv.sh:
--------------------------------------------------------------------------------

```bash
  1 | #!/bin/bash
  2 | 
  3 | # Exit on error
  4 | set -e
  5 | 
  6 | # Colors for output
  7 | RED='\033[0;31m'
  8 | GREEN='\033[0;32m'
  9 | BLUE='\033[0;34m'
 10 | YELLOW='\033[1;33m'
 11 | NC='\033[0m' # No Color
 12 | 
 13 | # Function to print step information
 14 | print_step() {
 15 |     echo -e "${BLUE}==> $1${NC}"
 16 | }
 17 | 
 18 | # Function to print success message
 19 | print_success() {
 20 |     echo -e "${GREEN}==> Success: $1${NC}"
 21 | }
 22 | 
 23 | # Function to print error message
 24 | print_error() {
 25 |     echo -e "${RED}==> Error: $1${NC}" >&2
 26 | }
 27 | 
 28 | # Function to print warning message
 29 | print_warning() {
 30 |     echo -e "${YELLOW}==> Warning: $1${NC}"
 31 | }
 32 | 
 33 | # Function to check if UV is installed
 34 | check_uv() {
 35 |     if command -v uv &> /dev/null; then
 36 |         print_success "UV is already installed"
 37 |         uv --version
 38 |         return 0
 39 |     else
 40 |         return 1
 41 |     fi
 42 | }
 43 | 
 44 | # Function to install UV
 45 | install_uv() {
 46 |     print_step "UV not found. Installing UV..."
 47 |     
 48 |     # Detect OS
 49 |     if [[ "$OSTYPE" == "linux-gnu"* ]] || [[ "$OSTYPE" == "darwin"* ]]; then
 50 |         print_step "Installing UV for Unix-like system..."
 51 |         curl -LsSf https://astral.sh/uv/install.sh | sh
 52 |         
 53 |         # Add UV to PATH for current session
 54 |         export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
 55 |         
 56 |         # Check if installation was successful
 57 |         if command -v uv &> /dev/null; then
 58 |             print_success "UV installed successfully"
 59 |             uv --version
 60 |         else
 61 |             print_error "UV installation failed"
 62 |             print_step "Please restart your terminal and try again, or install manually:"
 63 |             echo "  curl -LsSf https://astral.sh/uv/install.sh | sh"
 64 |             exit 1
 65 |         fi
 66 |     elif [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]]; then
 67 |         print_error "For Windows, please use PowerShell and run:"
 68 |         echo "  powershell -ExecutionPolicy ByPass -c \"irm https://astral.sh/uv/install.ps1 | iex\""
 69 |         exit 1
 70 |     else
 71 |         print_error "Unsupported operating system: $OSTYPE"
 72 |         print_step "Please install UV manually from: https://docs.astral.sh/uv/getting-started/installation/"
 73 |         exit 1
 74 |     fi
 75 | }
 76 | 
 77 | # Get the script's directory
 78 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 79 | PROJECT_ROOT="$( cd "${SCRIPT_DIR}/.." && pwd )"
 80 | 
 81 | # Change to project root
 82 | cd "$PROJECT_ROOT"
 83 | 
 84 | # Check if UV is installed, install if not
 85 | if ! check_uv; then
 86 |     install_uv
 87 | fi
 88 | 
 89 | # Load environment variables from .env.local
 90 | if [ -f .env.local ]; then
 91 |     print_step "Loading environment variables from .env.local..."
 92 |     set -a
 93 |     source .env.local
 94 |     set +a
 95 |     print_success "Environment variables loaded"
 96 | else
 97 |     print_error ".env.local file not found"
 98 |     exit 1
 99 | fi
100 | 
101 | # Clean up existing environments and cache
102 | print_step "Cleaning up existing environments..."
103 | find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
104 | find . -type d -name ".pytest_cache" -exec rm -rf {} + 2>/dev/null || true
105 | find . -type d -name "dist" -exec rm -rf {} + 2>/dev/null || true
106 | find . -type d -name ".venv" -exec rm -rf {} + 2>/dev/null || true
107 | find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true
108 | print_success "Environment cleanup complete"
109 | 
110 | # Install Python 3.12 using UV
111 | print_step "Installing Python 3.12 using UV..."
112 | uv python install 3.12
113 | print_success "Python 3.12 installed"
114 | 
115 | # Create virtual environment using UV
116 | print_step "Creating virtual environment with UV..."
117 | uv venv .venv --python 3.12
118 | print_success "Virtual environment created"
119 | 
120 | # Activate virtual environment
121 | print_step "Activating virtual environment..."
122 | source .venv/bin/activate
123 | print_success "Virtual environment activated"
124 | 
125 | # Function to install a package and its dependencies using UV
126 | install_package() {
127 |     local package_dir=$1
128 |     local package_name=$2
129 |     local extras=$3
130 |     print_step "Installing ${package_name} with UV..."
131 |     cd "$package_dir"
132 |     
133 |     if [ -f "pyproject.toml" ]; then
134 |         if [ -n "$extras" ]; then
135 |             uv pip install -e ".[${extras}]"
136 |         else
137 |             uv pip install -e .
138 |         fi
139 |     else
140 |         print_error "No pyproject.toml found in ${package_dir}"
141 |         return 1
142 |     fi
143 |     
144 |     cd "$PROJECT_ROOT"
145 | }
146 | 
147 | # Install packages in order of dependency
148 | print_step "Installing packages in development mode with UV..."
149 | 
150 | # Install core first (base package with telemetry support)
151 | install_package "libs/python/core" "core"
152 | 
153 | # Install pylume (base dependency)
154 | install_package "libs/python/pylume" "pylume"
155 | 
156 | # Install computer with all its dependencies and extras
157 | install_package "libs/python/computer" "computer" "all"
158 | 
159 | # Install omniparser
160 | install_package "libs/python/som" "som"
161 | 
162 | # Install agent with all its dependencies and extras
163 | install_package "libs/python/agent" "agent" "all"
164 | 
165 | # Install computer-server
166 | install_package "libs/python/computer-server" "computer-server"
167 | 
168 | # Install mcp-server
169 | install_package "libs/python/mcp-server" "mcp-server"
170 | 
171 | # Install development tools from root project
172 | print_step "Installing development dependencies with UV..."
173 | uv pip install -e ".[dev,test,docs]"
174 | 
175 | # Create a .env file for VS Code to use the virtual environment
176 | print_step "Creating .env file for VS Code..."
177 | echo "PYTHONPATH=${PROJECT_ROOT}/libs/python/core:${PROJECT_ROOT}/libs/python/computer:${PROJECT_ROOT}/libs/python/agent:${PROJECT_ROOT}/libs/python/som:${PROJECT_ROOT}/libs/python/pylume:${PROJECT_ROOT}/libs/python/computer-server:${PROJECT_ROOT}/libs/python/mcp-server" > .env
178 | 
179 | print_success "All packages installed successfully with UV!"
180 | print_step "Your virtual environment is ready. To activate it:"
181 | echo "  source .venv/bin/activate"
182 | print_step "UV provides fast dependency resolution and installation."
183 | print_step "You can also use 'uv run' to run commands in the virtual environment without activation."
184 | 
```

--------------------------------------------------------------------------------
/libs/python/computer/computer/providers/winsandbox/setup_script.ps1:
--------------------------------------------------------------------------------

```
  1 | # Setup script for Windows Sandbox CUA Computer provider
  2 | # This script runs when the sandbox starts
  3 | 
  4 | Write-Host "Starting CUA Computer setup in Windows Sandbox..."
  5 | 
  6 | # Function to find the mapped Python installation from pywinsandbox
  7 | function Find-MappedPython {
  8 |     Write-Host "Looking for mapped Python installation from pywinsandbox..."
  9 |     
 10 |     # pywinsandbox maps the host Python installation to the sandbox
 11 |     # Look for mapped shared folders on the desktop (common pywinsandbox pattern)
 12 |     $desktopPath = "C:\Users\WDAGUtilityAccount\Desktop"
 13 |     $sharedFolders = Get-ChildItem -Path $desktopPath -Directory -ErrorAction SilentlyContinue
 14 |     
 15 |     foreach ($folder in $sharedFolders) {
 16 |         # Look for Python executables in shared folders
 17 |         $pythonPaths = @(
 18 |             "$($folder.FullName)\python.exe",
 19 |             "$($folder.FullName)\Scripts\python.exe",
 20 |             "$($folder.FullName)\bin\python.exe"
 21 |         )
 22 |         
 23 |         foreach ($pythonPath in $pythonPaths) {
 24 |             if (Test-Path $pythonPath) {
 25 |                 try {
 26 |                     $version = & $pythonPath --version 2>&1
 27 |                     if ($version -match "Python") {
 28 |                         Write-Host "Found mapped Python: $pythonPath - $version"
 29 |                         return $pythonPath
 30 |                     }
 31 |                 } catch {
 32 |                     continue
 33 |                 }
 34 |             }
 35 |         }
 36 |         
 37 |         # Also check subdirectories that might contain Python
 38 |         $subDirs = Get-ChildItem -Path $folder.FullName -Directory -ErrorAction SilentlyContinue
 39 |         foreach ($subDir in $subDirs) {
 40 |             $pythonPath = "$($subDir.FullName)\python.exe"
 41 |             if (Test-Path $pythonPath) {
 42 |                 try {
 43 |                     $version = & $pythonPath --version 2>&1
 44 |                     if ($version -match "Python") {
 45 |                         Write-Host "Found mapped Python in subdirectory: $pythonPath - $version"
 46 |                         return $pythonPath
 47 |                     }
 48 |                 } catch {
 49 |                     continue
 50 |                 }
 51 |             }
 52 |         }
 53 |     }
 54 |     
 55 |     # Fallback: try common Python commands that might be available
 56 |     $pythonCommands = @("python", "py", "python3")
 57 |     foreach ($cmd in $pythonCommands) {
 58 |         try {
 59 |             $version = & $cmd --version 2>&1
 60 |             if ($version -match "Python") {
 61 |                 Write-Host "Found Python via command '$cmd': $version"
 62 |                 return $cmd
 63 |             }
 64 |         } catch {
 65 |             continue
 66 |         }
 67 |     }
 68 |     
 69 |     throw "Could not find any Python installation (mapped or otherwise)"
 70 | }
 71 | 
 72 | try {
 73 |     # Step 1: Find the mapped Python installation
 74 |     Write-Host "Step 1: Finding mapped Python installation..."
 75 |     $pythonExe = Find-MappedPython
 76 |     Write-Host "Using Python: $pythonExe"
 77 |     
 78 |     # Verify Python works and show version
 79 |     $pythonVersion = & $pythonExe --version 2>&1
 80 |     Write-Host "Python version: $pythonVersion"
 81 | 
 82 |     # Step 2: Create a dedicated virtual environment in mapped Desktop folder (persistent)
 83 |     Write-Host "Step 2: Creating virtual environment (if needed)..."
 84 |     $cachePath = "C:\Users\WDAGUtilityAccount\Desktop\wsb_cache"
 85 |     $venvPath = "C:\Users\WDAGUtilityAccount\Desktop\wsb_cache\venv"
 86 |     if (!(Test-Path $venvPath)) {
 87 |         Write-Host "Creating venv at: $venvPath"
 88 |         & $pythonExe -m venv $venvPath
 89 |     } else {
 90 |         Write-Host "Venv already exists at: $venvPath"
 91 |     }
 92 |     # Hide the folder to keep Desktop clean
 93 |     try {
 94 |         $item = Get-Item $cachePath -ErrorAction SilentlyContinue
 95 |         if ($item) {
 96 |             if (-not ($item.Attributes -band [IO.FileAttributes]::Hidden)) {
 97 |                 $item.Attributes = $item.Attributes -bor [IO.FileAttributes]::Hidden
 98 |             }
 99 |         }
100 |     } catch { }
101 |     $venvPython = Join-Path $venvPath "Scripts\python.exe"
102 |     if (!(Test-Path $venvPython)) {
103 |         throw "Virtual environment Python not found at $venvPython"
104 |     }
105 |     Write-Host "Using venv Python: $venvPython"
106 | 
107 |     # Step 3: Install cua-computer-server into the venv
108 |     Write-Host "Step 3: Installing cua-computer-server..."
109 |     
110 |     Write-Host "Upgrading pip..."
111 |     & $venvPython -m pip install --upgrade pip --quiet
112 |     
113 |     Write-Host "Installing cua-computer-server..."
114 |     & $venvPython -m pip install cua-computer-server
115 |     
116 |     Write-Host "cua-computer-server installation completed."
117 | 
118 |     # Step 4: Start computer server in background using the venv Python
119 |     Write-Host "Step 4: Starting computer server in background..."
120 |     Write-Host "Starting computer server with: $venvPython"
121 |     
122 |     # Start the computer server in the background
123 |     $serverProcess = Start-Process -FilePath $venvPython -ArgumentList "-m", "computer_server.main" -WindowStyle Hidden -PassThru
124 |     Write-Host "Computer server started in background with PID: $($serverProcess.Id)"
125 |     
126 |     # Give it a moment to start
127 |     Start-Sleep -Seconds 3
128 |     
129 |     # Check if the process is still running
130 |     if (Get-Process -Id $serverProcess.Id -ErrorAction SilentlyContinue) {
131 |         Write-Host "Computer server is running successfully in background"
132 |     } else {
133 |         throw "Computer server failed to start or exited immediately"
134 |     }
135 | 
136 | } catch {
137 |     Write-Error "Setup failed: $_"
138 |     Write-Host "Error details: $($_.Exception.Message)"
139 |     Write-Host "Stack trace: $($_.ScriptStackTrace)"
140 |     Write-Host ""
141 |     Write-Host "Press any key to close this window..."
142 |     $null = $Host.UI.RawUI.ReadKey("NoEcho,IncludeKeyDown")
143 |     exit 1
144 | }
145 | 
146 | Write-Host ""
147 | Write-Host "Setup completed successfully!"
148 | Write-Host "Press any key to close this window..."
149 | $null = $Host.UI.RawUI.ReadKey("NoEcho,IncludeKeyDown")
150 | 
```

--------------------------------------------------------------------------------
/libs/python/som/som/ocr.py:
--------------------------------------------------------------------------------

```python
  1 | from typing import List, Dict, Any, Tuple, Union
  2 | import logging
  3 | import signal
  4 | from contextlib import contextmanager
  5 | from pathlib import Path
  6 | import easyocr
  7 | from PIL import Image
  8 | import numpy as np
  9 | import torch
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | class TimeoutException(Exception):
 15 |     pass
 16 | 
 17 | 
 18 | @contextmanager
 19 | def timeout(seconds: int):
 20 |     import threading
 21 |     
 22 |     # Check if we're in the main thread
 23 |     if threading.current_thread() is threading.main_thread():
 24 |         def timeout_handler(signum, frame):
 25 |             raise TimeoutException("OCR process timed out")
 26 | 
 27 |         original_handler = signal.signal(signal.SIGALRM, timeout_handler)
 28 |         signal.alarm(seconds)
 29 | 
 30 |         try:
 31 |             yield
 32 |         finally:
 33 |             signal.alarm(0)
 34 |             signal.signal(signal.SIGALRM, original_handler)
 35 |     else:
 36 |         # In a non-main thread, we can't use signal
 37 |         logger.warning("Timeout function called from non-main thread; signal-based timeout disabled")
 38 |         try:
 39 |             yield
 40 |         finally:
 41 |             pass
 42 | 
 43 | 
 44 | class OCRProcessor:
 45 |     """Class for handling OCR text detection."""
 46 | 
 47 |     _shared_reader = None  # Class-level shared reader instance
 48 | 
 49 |     def __init__(self):
 50 |         """Initialize the OCR processor."""
 51 |         self.reader = None
 52 |         # Determine best available device
 53 |         self.device = "cpu"
 54 |         if torch.cuda.is_available():
 55 |             self.device = "cuda"
 56 |         elif (
 57 |             hasattr(torch, "backends")
 58 |             and hasattr(torch.backends, "mps")
 59 |             and torch.backends.mps.is_available()
 60 |         ):
 61 |             self.device = "mps"
 62 |         logger.info(f"OCR processor initialized with device: {self.device}")
 63 | 
 64 |     def _ensure_reader(self):
 65 |         """Ensure EasyOCR reader is initialized.
 66 | 
 67 |         Uses a class-level cached reader to avoid reinitializing on every instance.
 68 |         """
 69 |         # First check if we already have a class-level reader
 70 |         if OCRProcessor._shared_reader is not None:
 71 |             self.reader = OCRProcessor._shared_reader
 72 |             return
 73 | 
 74 |         # Otherwise initialize a new one
 75 |         if self.reader is None:
 76 |             try:
 77 |                 logger.info("Initializing EasyOCR reader...")
 78 |                 import easyocr
 79 | 
 80 |                 # Use GPU if available
 81 |                 use_gpu = self.device in ["cuda", "mps"]
 82 |                 self.reader = easyocr.Reader(["en"], gpu=use_gpu)
 83 |                 
 84 |                 # Verify reader initialization
 85 |                 if self.reader is None:
 86 |                     raise ValueError("Failed to initialize EasyOCR reader")
 87 | 
 88 |                 # Cache the reader at class level
 89 |                 OCRProcessor._shared_reader = self.reader
 90 | 
 91 |                 logger.info(f"EasyOCR reader initialized successfully with GPU={use_gpu}")
 92 |             except Exception as e:
 93 |                 logger.error(f"Failed to initialize EasyOCR reader: {str(e)}")
 94 |                 # Set to a placeholder that will be checked
 95 |                 self.reader = None
 96 |                 raise RuntimeError(f"EasyOCR initialization failed: {str(e)}") from e
 97 | 
 98 |     def detect_text(
 99 |         self, image: Image.Image, confidence_threshold: float = 0.5, timeout_seconds: int = 5
100 |     ) -> List[Dict[str, Any]]:
101 |         """Detect text in an image using EasyOCR.
102 | 
103 |         Args:
104 |             image: PIL Image to process
105 |             confidence_threshold: Minimum confidence for text detection
106 |             timeout_seconds: Maximum time to wait for OCR
107 | 
108 |         Returns:
109 |             List of text detection dictionaries
110 |         """
111 |         try:
112 |             # Try to initialize reader, catch any exceptions
113 |             try:
114 |                 self._ensure_reader()
115 |             except Exception as e:
116 |                 logger.error(f"Failed to initialize OCR reader: {str(e)}")
117 |                 return []
118 | 
119 |             # Ensure reader was properly initialized
120 |             if self.reader is None:
121 |                 logger.error("OCR reader is None after initialization")
122 |                 return []
123 | 
124 |             # Convert PIL Image to numpy array
125 |             image_np = np.array(image)
126 | 
127 |             try:
128 |                 with timeout(timeout_seconds):
129 |                     results = self.reader.readtext(
130 |                         image_np, paragraph=False, text_threshold=confidence_threshold
131 |                     )
132 |             except TimeoutException:
133 |                 logger.warning("OCR timed out")
134 |                 return []
135 |             except Exception as e:
136 |                 logger.warning(f"OCR failed: {str(e)}")
137 |                 return []
138 | 
139 |             detections = []
140 |             img_width, img_height = image.size
141 | 
142 |             for box, text, conf in results:
143 |                 # Ensure conf is float
144 |                 conf_float = float(conf)
145 |                 if conf_float < confidence_threshold:
146 |                     continue
147 | 
148 |                 # Convert box format to [x1, y1, x2, y2]
149 |                 # Ensure box points are properly typed as float
150 |                 x1 = min(float(point[0]) for point in box) / img_width
151 |                 y1 = min(float(point[1]) for point in box) / img_height
152 |                 x2 = max(float(point[0]) for point in box) / img_width
153 |                 y2 = max(float(point[1]) for point in box) / img_height
154 | 
155 |                 detections.append(
156 |                     {
157 |                         "type": "text",
158 |                         "bbox": [x1, y1, x2, y2],
159 |                         "content": text,
160 |                         "confidence": conf_float,
161 |                         "interactivity": False,  # Text is typically non-interactive
162 |                     }
163 |                 )
164 | 
165 |             return detections
166 |         except Exception as e:
167 |             logger.error(f"Unexpected error in OCR processing: {str(e)}")
168 |             return []
169 | 
```
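
A minimal usage sketch for `OCRProcessor`; the screenshot filename below is an illustrative assumption:

```python
# Hedged usage sketch: the screenshot path is an illustrative assumption.
from PIL import Image

from som.ocr import OCRProcessor

processor = OCRProcessor()
image = Image.open("screenshot.png").convert("RGB")

# detect_text returns normalized [x1, y1, x2, y2] boxes with the recognized
# text and its confidence score.
detections = processor.detect_text(image, confidence_threshold=0.5, timeout_seconds=5)
for det in detections:
    print(det["content"], round(det["confidence"], 3), det["bbox"])
```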

--------------------------------------------------------------------------------
/.github/workflows/pypi-publish-mcp-server.yml:
--------------------------------------------------------------------------------

```yaml
  1 | name: Publish MCP Server Package
  2 | 
  3 | on:
  4 |   push:
  5 |     tags:
  6 |       - "mcp-server-v*"
  7 |   workflow_dispatch:
  8 |     inputs:
  9 |       version:
 10 |         description: "Version to publish (without v prefix)"
 11 |         required: true
 12 |         default: "0.1.0"
 13 |   workflow_call:
 14 |     inputs:
 15 |       version:
 16 |         description: "Version to publish"
 17 |         required: true
 18 |         type: string
 19 |     outputs:
 20 |       version:
 21 |         description: "The version that was published"
 22 |         value: ${{ jobs.prepare.outputs.version }}
 23 | 
 24 | # Adding permissions at workflow level
 25 | permissions:
 26 |   contents: write
 27 | 
 28 | jobs:
 29 |   prepare:
 30 |     runs-on: macos-latest
 31 |     outputs:
 32 |       version: ${{ steps.get-version.outputs.version }}
 33 |       agent_version: ${{ steps.update-deps.outputs.agent_version }}
 34 |       computer_version: ${{ steps.update-deps.outputs.computer_version }}
 35 |     steps:
 36 |       - uses: actions/checkout@v4
 37 | 
 38 |       - name: Determine version
 39 |         id: get-version
 40 |         run: |
 41 |           if [ "${{ github.event_name }}" == "push" ]; then
 42 |             # Extract version from tag (for package-specific tags)
 43 |             if [[ "${{ github.ref }}" =~ ^refs/tags/mcp-server-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
 44 |               VERSION=${BASH_REMATCH[1]}
 45 |             else
 46 |               echo "Invalid tag format for mcp-server"
 47 |               exit 1
 48 |             fi
 49 |           elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
 50 |             # Use version from workflow dispatch
 51 |             VERSION=${{ github.event.inputs.version }}
 52 |           else
 53 |             # Use version from workflow_call
 54 |             VERSION=${{ inputs.version }}
 55 |           fi
 56 |           echo "VERSION=$VERSION"
 57 |           echo "version=$VERSION" >> $GITHUB_OUTPUT
 58 | 
 59 |       - name: Set up Python
 60 |         uses: actions/setup-python@v4
 61 |         with:
 62 |           python-version: "3.11"
 63 | 
 64 |       - name: Update dependencies to latest versions
 65 |         id: update-deps
 66 |         run: |
 67 |           cd libs/python/mcp-server
 68 | 
 69 |           # Install required package for PyPI API access
 70 |           pip install requests
 71 | 
 72 |           # Create a Python script for PyPI version checking
 73 |           cat > get_latest_versions.py << 'EOF'
 74 |           import requests
 75 |           import json
 76 |           import sys
 77 | 
 78 |           def get_package_version(package_name, fallback="0.1.0"):
 79 |               try:
 80 |                   response = requests.get(f'https://pypi.org/pypi/{package_name}/json')
 81 |                   print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr)
 82 |                   
 83 |                   if response.status_code != 200:
 84 |                       print(f"API request failed for {package_name}, using fallback version", file=sys.stderr)
 85 |                       return fallback
 86 |                   
 87 |                   data = json.loads(response.text)
 88 |                   
 89 |                   if 'info' not in data:
 90 |                       print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr)
 91 |                       return fallback
 92 |                       
 93 |                   return data['info']['version']
 94 |               except Exception as e:
 95 |                   print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr)
 96 |                   return fallback
 97 | 
 98 |           # Get latest versions
 99 |           print(get_package_version('cua-agent'))
100 |           print(get_package_version('cua-computer'))
101 |           EOF
102 | 
103 |           # Execute the script to get the versions
104 |           VERSIONS=($(python get_latest_versions.py))
105 |           LATEST_AGENT=${VERSIONS[0]}
106 |           LATEST_COMPUTER=${VERSIONS[1]}
107 | 
108 |           echo "Latest cua-agent version: $LATEST_AGENT"
109 |           echo "Latest cua-computer version: $LATEST_COMPUTER"
110 | 
111 |           # Output the versions for the next job
112 |           echo "agent_version=$LATEST_AGENT" >> $GITHUB_OUTPUT
113 |           echo "computer_version=$LATEST_COMPUTER" >> $GITHUB_OUTPUT
114 | 
115 |           # Determine major version for version constraint
116 |           AGENT_MAJOR=$(echo $LATEST_AGENT | cut -d. -f1)
117 |           COMPUTER_MAJOR=$(echo $LATEST_COMPUTER | cut -d. -f1)
118 | 
119 |           NEXT_AGENT_MAJOR=$((AGENT_MAJOR + 1))
120 |           NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1))
121 | 
122 |           # Update dependencies in pyproject.toml
123 |           if [[ "$OSTYPE" == "darwin"* ]]; then
124 |             # macOS version of sed needs an empty string for -i
125 |             # Update cua-agent with all extras
126 |             sed -i '' "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml
127 |             sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
128 |           else
129 |             # Linux version
130 |             sed -i "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml
131 |             sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
132 |           fi
133 | 
134 |           # Display the updated dependencies
135 |           echo "Updated dependencies in pyproject.toml:"
136 |           grep -E "cua-agent|cua-computer" pyproject.toml
137 | 
138 |   publish:
139 |     needs: prepare
140 |     uses: ./.github/workflows/pypi-reusable-publish.yml
141 |     with:
142 |       package_name: "mcp-server"
143 |       package_dir: "libs/python/mcp-server"
144 |       version: ${{ needs.prepare.outputs.version }}
145 |       is_lume_package: false
146 |       base_package_name: "cua-mcp-server"
147 |     secrets:
148 |       PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
149 | 
150 |   set-env-variables:
151 |     needs: [prepare, publish]
152 |     runs-on: macos-latest
153 |     steps:
154 |       - name: Set environment variables for use in other jobs
155 |         run: |
156 |           echo "AGENT_VERSION=${{ needs.prepare.outputs.agent_version }}" >> $GITHUB_ENV
157 |           echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> $GITHUB_ENV
158 | 
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/gta1.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | GTA1 agent loop implementation for click prediction using litellm.acompletion
  3 | Paper: https://arxiv.org/pdf/2507.05791
  4 | Code: https://github.com/Yan98/GTA1
  5 | """
  6 | 
  7 | import asyncio
  8 | import json
  9 | import re
 10 | import base64
 11 | from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
 12 | from io import BytesIO
 13 | import uuid
 14 | from PIL import Image
 15 | import litellm
 16 | import math
 17 | 
 18 | from ..decorators import register_agent
 19 | from ..types import Messages, AgentResponse, Tools, AgentCapability
 20 | from ..loops.base import AsyncAgentConfig
 21 | 
 22 | SYSTEM_PROMPT = '''
 23 | You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.
 24 | 
 25 | Output the coordinate pair exactly:
 26 | (x,y)
 27 | '''.strip()
 28 | 
 29 | def extract_coordinates(raw_string: str) -> Tuple[float, float]:
 30 |     """Extract coordinates from model output."""
 31 |     try:
 32 |         matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
 33 |         return tuple(map(float, matches[0])) # type: ignore
 34 |     except:
 35 |         return (0.0, 0.0)
 36 | 
 37 | def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360) -> Tuple[int, int]:
 38 |     """Smart resize function similar to qwen_vl_utils."""
 39 |     # Calculate the total pixels
 40 |     total_pixels = height * width
 41 |     
 42 |     # If already within bounds, return original dimensions
 43 |     if min_pixels <= total_pixels <= max_pixels:
 44 |         # Round to nearest factor
 45 |         new_height = (height // factor) * factor
 46 |         new_width = (width // factor) * factor
 47 |         return new_height, new_width
 48 |     
 49 |     # Calculate scaling factor
 50 |     if total_pixels > max_pixels:
 51 |         scale = (max_pixels / total_pixels) ** 0.5
 52 |     else:
 53 |         scale = (min_pixels / total_pixels) ** 0.5
 54 |     
 55 |     # Apply scaling
 56 |     new_height = int(height * scale)
 57 |     new_width = int(width * scale)
 58 |     
 59 |     # Round to nearest factor
 60 |     new_height = (new_height // factor) * factor
 61 |     new_width = (new_width // factor) * factor
 62 |     
 63 |     # Ensure minimum size
 64 |     new_height = max(new_height, factor)
 65 |     new_width = max(new_width, factor)
 66 |     
 67 |     return new_height, new_width
 68 | 
 69 | @register_agent(models=r".*GTA1.*")
 70 | class GTA1Config(AsyncAgentConfig):
 71 |     """GTA1 agent configuration implementing AsyncAgentConfig protocol for click prediction."""
 72 |     
 73 |     def __init__(self):
 74 |         self.current_model = None
 75 |         self.last_screenshot_b64 = None
 76 |     
 77 | 
 78 |     async def predict_step(
 79 |         self,
 80 |         messages: List[Dict[str, Any]],
 81 |         model: str,
 82 |         tools: Optional[List[Dict[str, Any]]] = None,
 83 |         max_retries: Optional[int] = None,
 84 |         stream: bool = False,
 85 |         computer_handler=None,
 86 |         _on_api_start=None,
 87 |         _on_api_end=None,
 88 |         _on_usage=None,
 89 |         _on_screenshot=None,
 90 |         **kwargs
 91 |     ) -> Dict[str, Any]:
 92 |         raise NotImplementedError()
 93 | 
 94 |     async def predict_click(
 95 |         self,
 96 |         model: str,
 97 |         image_b64: str,
 98 |         instruction: str,
 99 |         **kwargs
100 |     ) -> Optional[Tuple[float, float]]:
101 |         """
102 |         Predict click coordinates using GTA1 model via litellm.acompletion.
103 |         
104 |         Args:
105 |             model: The GTA1 model name
106 |             image_b64: Base64 encoded image
107 |             instruction: Instruction for where to click
108 |             
109 |         Returns:
110 |             Tuple of (x, y) coordinates or None if prediction fails
111 |         """
112 |         # Decode base64 image
113 |         image_data = base64.b64decode(image_b64)
114 |         image = Image.open(BytesIO(image_data))
115 |         width, height = image.width, image.height
116 |         
117 |         # Smart resize the image (similar to qwen_vl_utils)
118 |         resized_height, resized_width = smart_resize(
119 |             height, width, 
120 |             factor=28,  # Default factor for Qwen models
121 |             min_pixels=3136,
122 |             max_pixels=4096 * 2160
123 |         )
124 |         resized_image = image.resize((resized_width, resized_height))
125 |         scale_x, scale_y = width / resized_width, height / resized_height
126 |         
127 |         # Convert resized image back to base64
128 |         buffered = BytesIO()
129 |         resized_image.save(buffered, format="PNG")
130 |         resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
131 |         
132 |         # Prepare system and user messages
133 |         system_message = {
134 |             "role": "system",
135 |             "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width)
136 |         }
137 |         
138 |         user_message = {
139 |             "role": "user",
140 |             "content": [
141 |                 {
142 |                     "type": "image_url",
143 |                     "image_url": {
144 |                         "url": f"data:image/png;base64,{resized_image_b64}"
145 |                     }
146 |                 },
147 |                 {
148 |                     "type": "text",
149 |                     "text": instruction
150 |                 }
151 |             ]
152 |         }
153 |         
154 |         # Prepare API call kwargs
155 |         api_kwargs = {
156 |             "model": model,
157 |             "messages": [system_message, user_message],
158 |             "max_tokens": 2056,
159 |             "temperature": 0.0,
160 |             **kwargs
161 |         }
162 |         
163 |         # Use liteLLM acompletion
164 |         response = await litellm.acompletion(**api_kwargs)
165 |         
166 |         # Extract response text
167 |         output_text = response.choices[0].message.content # type: ignore
168 |         
169 |         # Extract and rescale coordinates
170 |         pred_x, pred_y = extract_coordinates(output_text) # type: ignore
171 |         pred_x *= scale_x
172 |         pred_y *= scale_y
173 |         
174 |         return (math.floor(pred_x), math.floor(pred_y))
175 |     
176 |     def get_capabilities(self) -> List[AgentCapability]:
177 |         """Return the capabilities supported by this agent."""
178 |         return ["click"]
179 | 
```
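
A hedged sketch of calling this grounding loop directly; the model identifier and screenshot path are illustrative assumptions, not values defined in this repo:

```python
# Hypothetical usage of GTA1Config.predict_click; the model id and screenshot
# path are placeholders for whatever litellm-reachable GTA1 endpoint you run.
import asyncio
import base64

from agent.loops.gta1 import GTA1Config


async def main():
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode()

    loop = GTA1Config()
    coords = await loop.predict_click(
        model="hosted_vllm/HelloKKMe/GTA1-7B",  # placeholder litellm model string
        image_b64=image_b64,
        instruction="Click the Submit button",
    )
    print("Predicted click:", coords)


asyncio.run(main())
```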

--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/models/gta1.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | GTA1 model implementation for benchmarking.
  3 | """
  4 | 
  5 | from typing import Optional, Tuple
  6 | from PIL import Image
  7 | import torch
  8 | import re
  9 | import gc
 10 | from qwen_vl_utils import process_vision_info, smart_resize
 11 | from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 12 | 
 13 | from .base import ModelProtocol
 14 | 
 15 | 
 16 | class GTA1Model:
 17 |     """Ground truth GTA1 model implementation."""
 18 |     
 19 |     def __init__(self, model_path: str = "HelloKKMe/GTA1-7B"):
 20 |         self.model_path = model_path
 21 |         self.model = None
 22 |         self.processor = None
 23 |         self.max_new_tokens = 32
 24 |         
 25 |         self.system_prompt = '''
 26 | You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.
 27 | 
 28 | Output the coordinate pair exactly:
 29 | (x,y)
 30 | '''.strip()
 31 |     
 32 |     @property
 33 |     def model_name(self) -> str:
 34 |         """Return the name of the model."""
 35 |         return f"GTA1-{self.model_path.split('/')[-1]}"
 36 |     
 37 |     async def load_model(self) -> None:
 38 |         """Load the model into memory."""
 39 |         if self.model is None:
 40 |             print(f"Loading GTA1 model: {self.model_path}")
 41 |             self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 42 |                 self.model_path,
 43 |                 torch_dtype=torch.bfloat16,
 44 |                 device_map="auto"
 45 |             )
 46 |             self.processor = AutoProcessor.from_pretrained(
 47 |                 self.model_path,
 48 |                 min_pixels=3136,
 49 |                 max_pixels=4096 * 2160
 50 |             )
 51 |             print("GTA1 model loaded successfully")
 52 |     
 53 |     async def unload_model(self) -> None:
 54 |         """Unload the model from memory."""
 55 |         if self.model is not None:
 56 |             print("Unloading GTA1 model from GPU...")
 57 |             del self.model
 58 |             del self.processor
 59 |             self.model = None
 60 |             self.processor = None
 61 |             gc.collect()
 62 |             if torch.cuda.is_available():
 63 |                 torch.cuda.empty_cache()
 64 |             print("GTA1 model unloaded")
 65 |     
 66 |     def _extract_coordinates(self, raw_string: str) -> Tuple[int, int]:
 67 |         """Extract coordinates from model output."""
 68 |         try:
 69 |             matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
 70 |             return tuple(map(int, map(float, matches[0]))) # type: ignore
 71 |         except:
 72 |             return (0, 0)
 73 |     
 74 |     async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]:
 75 |         """
 76 |         Predict click coordinates for the given image and instruction.
 77 |         
 78 |         Args:
 79 |             image: PIL Image to analyze
 80 |             instruction: Text instruction describing what to click
 81 |             
 82 |         Returns:
 83 |             Tuple of (x, y) coordinates or None if prediction fails
 84 |         """
 85 |         if self.model is None or self.processor is None:
 86 |             await self.load_model()
 87 | 
 88 |         assert self.processor is not None
 89 |         assert self.model is not None
 90 |         
 91 |         try:
 92 |             width, height = image.width, image.height
 93 |             
 94 |             # Resize image according to processor requirements
 95 |             resized_height, resized_width = smart_resize(
 96 |                 image.height,
 97 |                 image.width,
 98 |                 factor=self.processor.image_processor.patch_size * self.processor.image_processor.merge_size,
 99 |                 min_pixels=self.processor.image_processor.min_pixels,
100 |                 max_pixels=self.processor.image_processor.max_pixels,
101 |             )
102 |             resized_image = image.resize((resized_width, resized_height))
103 |             scale_x, scale_y = width / resized_width, height / resized_height
104 |             
105 |             # Prepare messages
106 |             system_message = {
107 |                 "role": "system",
108 |                 "content": self.system_prompt.format(height=resized_height, width=resized_width)
109 |             }
110 |             
111 |             user_message = {
112 |                 "role": "user",
113 |                 "content": [
114 |                     {"type": "image", "image": resized_image},
115 |                     {"type": "text", "text": instruction}
116 |                 ]
117 |             }
118 |             
119 |             # Process inputs
120 |             image_inputs, video_inputs = process_vision_info([system_message, user_message]) # type: ignore
121 |             text = self.processor.apply_chat_template(
122 |                 [system_message, user_message], 
123 |                 tokenize=False, 
124 |                 add_generation_prompt=True
125 |             )
126 |             inputs = self.processor(
127 |                 text=[text], 
128 |                 images=image_inputs, 
129 |                 videos=video_inputs, 
130 |                 padding=True, 
131 |                 return_tensors="pt"
132 |             )
133 |             inputs = inputs.to(self.model.device)
134 |             
135 |             # Generate prediction
136 |             output_ids = self.model.generate(
137 |                 **inputs, 
138 |                 max_new_tokens=self.max_new_tokens, 
139 |                 do_sample=False, 
140 |                 temperature=1.0, 
141 |                 use_cache=True
142 |             )
143 |             generated_ids = [
144 |                 output_ids[len(input_ids):] 
145 |                 for input_ids, output_ids in zip(inputs.input_ids, output_ids)
146 |             ]
147 |             output_text = self.processor.batch_decode(
148 |                 generated_ids, 
149 |                 skip_special_tokens=True, 
150 |                 clean_up_tokenization_spaces=True
151 |             )[0]
152 |             
153 |             # Extract and rescale coordinates
154 |             pred_x, pred_y = self._extract_coordinates(output_text)
155 |             pred_x = int(pred_x * scale_x)
156 |             pred_y = int(pred_y * scale_y)
157 |             
158 |             return (pred_x, pred_y)
159 |             
160 |         except Exception as e:
161 |             print(f"Error in GTA1 prediction: {e}")
162 |             return None
163 | 
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/integrations/hud/__init__.py:
--------------------------------------------------------------------------------

```python
  1 | """HUD integration: dataset runners and MCP-based computer agent export.
  2 | 
  3 | This module exposes helpers to evaluate HUD-compatible datasets and exports
  4 | the MCP-compatible computer agent implementation.
  5 | 
  6 | Exports:
  7 | - run_single_task(dataset, ...)
  8 | - run_full_dataset(dataset, ...)
  9 | - MCPComputerAgent
 10 | """
 11 | import time
 12 | from typing import Any, Optional
 13 | 
 14 | from agent.computers import is_agent_computer
 15 | from datasets import load_dataset, Dataset
 16 | from hud.datasets import Task, run_dataset
 17 | from hud import trace
 18 | 
 19 | from .agent import MCPComputerAgent
 20 | 
 21 | 
 22 | # ---------------------------------------------------------------------------
 23 | # Single-task runner
 24 | # ---------------------------------------------------------------------------
 25 | 
 26 | async def run_single_task(
 27 |     dataset: str | Dataset | list[dict[str, Any]],
 28 |     *,
 29 |     task_id: int = 0,
 30 |     model: str | None = None,
 31 |     allowed_tools: list[str] | None = None,
 32 |     # === ComputerAgent kwargs ===
 33 |     tools: list[Any] | None = None,
 34 |     custom_loop: Any | None = None,
 35 |     only_n_most_recent_images: int | None = None,
 36 |     callbacks: list[Any] | None = None,
 37 |     instructions: str | None = None,
 38 |     verbosity: int | None = None,
 39 |     trajectory_dir: str | dict | None = None,
 40 |     max_retries: int | None = 3,
 41 |     screenshot_delay: float | int = 0.5,
 42 |     use_prompt_caching: bool | None = False,
 43 |     max_trajectory_budget: float | dict | None = None,
 44 |     telemetry_enabled: bool | None = True,
 45 | ) -> None:
 46 |     """Load one task from the dataset and execute it with MCPComputerAgent."""
 47 | 
 48 |     # Load dataset and pick a sample
 49 |     if isinstance(dataset, str):
 50 |         dataset = load_dataset(dataset, split="train") # type: ignore[arg-type]
 51 |     elif isinstance(dataset, list):
 52 |         pass  # already a list of task dicts
 53 |     else:
 54 |         dataset = dataset["train"]
 55 |     
 56 |     sample_task = dataset[task_id]  # type: ignore[index]
 57 |     task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]
 58 | 
 59 |     # Filter any existing Computer tools
 60 |     # The eval framework will add its own Computer tool per task
 61 |     if tools:
 62 |         tools = [
 63 |             tool 
 64 |             for tool in tools 
 65 |             if not is_agent_computer(tool)
 66 |         ]
 67 |     
 68 |     with trace(name=task_prompt):
 69 |         task = Task(**sample_task)  # type: ignore[arg-type]
 70 | 
 71 |         agent = MCPComputerAgent(
 72 |             model=model or "computer-use-preview",
 73 |             allowed_tools=allowed_tools or ["openai_computer"],
 74 |             # === ComputerAgent kwargs passthrough ===
 75 |             tools=tools,
 76 |             custom_loop=custom_loop,
 77 |             only_n_most_recent_images=only_n_most_recent_images,
 78 |             callbacks=callbacks,
 79 |             instructions=instructions,
 80 |             verbosity=verbosity,
 81 |             trajectory_dir=trajectory_dir,
 82 |             max_retries=max_retries,
 83 |             screenshot_delay=screenshot_delay,
 84 |             use_prompt_caching=use_prompt_caching,
 85 |             max_trajectory_budget=max_trajectory_budget,
 86 |             telemetry_enabled=telemetry_enabled,
 87 |         )
 88 |         print(f"Running: {task_prompt}")
 89 |         result = await agent.run(task, max_steps=10)
 90 |         print(f"✅ Reward: {getattr(result, 'reward')}")
 91 | 
 92 | 
 93 | # ---------------------------------------------------------------------------
 94 | # Full-dataset runner
 95 | # ---------------------------------------------------------------------------
 96 | 
 97 | async def run_full_dataset(
 98 |     dataset: str | Dataset | list[dict[str, Any]],
 99 |     *,
100 |     job_name: Optional[str] = None,
101 |     model: str | None = None,
102 |     allowed_tools: list[str] | None = None,
103 |     max_concurrent: int = 30,
104 |     max_steps: int = 50,
105 |     split: str = "train",
106 |     trajectory_dir: str | dict | None = None,
107 |     # === ComputerAgent kwargs ===
108 |     tools: list[Any] | None = None,
109 |     custom_loop: Any | None = None,
110 |     only_n_most_recent_images: int | None = 5,
111 |     callbacks: list[Any] | None = None,
112 |     instructions: str | None = None,
113 |     verbosity: int | None = None,
114 |     max_retries: int | None = 3,
115 |     screenshot_delay: float | int = 0.5,
116 |     use_prompt_caching: bool | None = False,
117 |     max_trajectory_budget: float | dict | None = None,
118 |     telemetry_enabled: bool | None = True,
119 | ) -> list[Any]:
120 |     """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
121 | 
122 |     # Run with our MCP-based agent class.
123 |     if isinstance(dataset, str):
124 |         dataset_name = dataset.split('/')[-1]
125 |         job_name = job_name or f"Evaluation {dataset_name}"
126 |         dataset = load_dataset(dataset, split=split) # type: ignore[arg-type]
127 |     else:
128 |         dataset_name = "custom"
129 |         job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"
130 | 
131 |     # Filter any existing Computer tools
132 |     # The eval framework will add its own Computer tool per task
133 |     if tools:
134 |         tools = [
135 |             tool 
136 |             for tool in tools 
137 |             if not is_agent_computer(tool)
138 |         ]
139 |     
140 |     # Execute evaluation
141 |     return await run_dataset(
142 |         name=job_name,
143 |         dataset=dataset,
144 |         agent_class=MCPComputerAgent,
145 |         agent_config={
146 |             "model": model,
147 |             "allowed_tools": allowed_tools,
148 |             "trajectory_dir": trajectory_dir,
149 |             # === ComputerAgent kwargs passthrough ===
150 |             "tools": tools,
151 |             "custom_loop": custom_loop,
152 |             "only_n_most_recent_images": only_n_most_recent_images,
153 |             "callbacks": callbacks,
154 |             "instructions": instructions,
155 |             "verbosity": verbosity,
156 |             "max_retries": max_retries,
157 |             "screenshot_delay": screenshot_delay,
158 |             "use_prompt_caching": use_prompt_caching,
159 |             "max_trajectory_budget": max_trajectory_budget,
160 |             "telemetry_enabled": telemetry_enabled,
161 |         },
162 |         max_concurrent=max_concurrent,
163 |         metadata={"dataset": dataset_name},
164 |         max_steps=max_steps,
165 |         auto_respond=True,
166 |     )
167 | 
168 | 
169 | __all__ = [
170 |     "run_single_task",
171 |     "run_full_dataset",
172 |     "MCPComputerAgent",
173 | ]
```
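
A hedged sketch of invoking the single-task runner; the dataset id below is a placeholder, not a real HUD dataset name:

```python
# Hypothetical invocation of run_single_task; the dataset id is a placeholder
# and the model string is the runner's default.
import asyncio

from agent.integrations.hud import run_single_task

asyncio.run(
    run_single_task(
        "my-org/my-hud-dataset",  # placeholder Hugging Face dataset id
        task_id=0,
        model="computer-use-preview",
        max_retries=3,
    )
)
```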

--------------------------------------------------------------------------------
/libs/lume/tests/VMTests.swift:
--------------------------------------------------------------------------------

```swift
  1 | import Foundation
  2 | import Testing
  3 | 
  4 | @testable import lume
  5 | 
  6 | class MockProcessRunner: ProcessRunner {
  7 |     var runCalls: [(executable: String, arguments: [String])] = []
  8 | 
  9 |     func run(executable: String, arguments: [String]) throws {
 10 |         runCalls.append((executable, arguments))
 11 |     }
 12 | }
 13 | 
 14 | private func setupVMDirectory(_ tempDir: URL) throws -> VMDirectory {
 15 |     let vmDir = VMDirectory(Path(tempDir.path))
 16 | 
 17 |     // Create disk image file
 18 |     let diskPath = vmDir.diskPath
 19 |     let diskData = Data(repeating: 0, count: 1024 * 1024)  // 1MB mock disk
 20 |     try diskData.write(to: diskPath.url)
 21 | 
 22 |     // Create nvram file
 23 |     let nvramPath = vmDir.nvramPath
 24 |     let nvramData = Data(repeating: 0, count: 1024)  // 1KB mock nvram
 25 |     try nvramData.write(to: nvramPath.url)
 26 | 
 27 |     // Create initial config file
 28 |     var config = try VMConfig(
 29 |         os: "mock-os",
 30 |         cpuCount: 1,
 31 |         memorySize: 1024,
 32 |         diskSize: 1024,
 33 |         display: "1024x768"
 34 |     )
 35 |     config.setMacAddress("00:11:22:33:44:55")
 36 |     try vmDir.saveConfig(config)
 37 | 
 38 |     // Create .initialized file to mark VM as initialized
 39 |     let initializedPath = vmDir.dir.file(".initialized")
 40 |     try Data().write(to: initializedPath.url)
 41 | 
 42 |     return vmDir
 43 | }
 44 | 
 45 | @MainActor
 46 | @Test("VM initialization and configuration")
 47 | func testVMInitialization() async throws {
 48 |     let tempDir = try createTempDirectory()
 49 |     let vmDir = try setupVMDirectory(tempDir)
 50 |     var config = try VMConfig(
 51 |         os: "mock-os",
 52 |         cpuCount: 1,
 53 |         memorySize: 1024,
 54 |         diskSize: 1024,
 55 |         display: "1024x768"
 56 |     )
 57 |     config.setMacAddress("00:11:22:33:44:55")  // Set MAC address to avoid nil
 58 |     let home = Home(fileManager: FileManager.default)
 59 |     let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)
 60 | 
 61 |     let vm = MockVM(
 62 |         vmDirContext: context,
 63 |         virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
 64 |         vncServiceFactory: { MockVNCService(vmDirectory: $0) }
 65 |     )
 66 | 
 67 |     // Test initial state
 68 |     let details = vm.details
 69 |     #expect(details.name == vmDir.name)
 70 |     #expect(details.os == "mock-os")
 71 |     #expect(details.status == "stopped")
 72 |     #expect(details.vncUrl == nil)
 73 | }
 74 | 
 75 | @MainActor
 76 | @Test("VM run and stop operations")
 77 | func testVMRunAndStop() async throws {
 78 |     let tempDir = try createTempDirectory()
 79 |     let vmDir = try setupVMDirectory(tempDir)
 80 |     var config = try VMConfig(
 81 |         os: "mock-os",
 82 |         cpuCount: 2,
 83 |         memorySize: 2048,
 84 |         diskSize: 1024,
 85 |         display: "1024x768"
 86 |     )
 87 |     config.setMacAddress("00:11:22:33:44:55")
 88 |     let home = Home(fileManager: FileManager.default)
 89 |     let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)
 90 | 
 91 |     let vm = MockVM(
 92 |         vmDirContext: context,
 93 |         virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
 94 |         vncServiceFactory: { MockVNCService(vmDirectory: $0) }
 95 |     )
 96 | 
 97 |     // Test running VM
 98 |     let runTask = Task {
 99 |         try await vm.run(
100 |             noDisplay: false, sharedDirectories: [], mount: nil as Path?, vncPort: 0,
101 |             recoveryMode: false)
102 |     }
103 | 
104 |     // Give the VM time to start
105 |     try await Task.sleep(nanoseconds: UInt64(1e9))
106 | 
107 |     // Test stopping VM
108 |     try await vm.stop()
109 |     runTask.cancel()
110 | }
111 | 
112 | @MainActor
113 | @Test("VM configuration updates")
114 | func testVMConfigurationUpdates() async throws {
115 |     let tempDir = try createTempDirectory()
116 |     let vmDir = try setupVMDirectory(tempDir)
117 |     var config = try VMConfig(
118 |         os: "mock-os",
119 |         cpuCount: 1,
120 |         memorySize: 1024,
121 |         diskSize: 1024,
122 |         display: "1024x768"
123 |     )
124 |     config.setMacAddress("00:11:22:33:44:55")
125 |     let home = Home(fileManager: FileManager.default)
126 |     let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)
127 | 
128 |     let vm = MockVM(
129 |         vmDirContext: context,
130 |         virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
131 |         vncServiceFactory: { MockVNCService(vmDirectory: $0) }
132 |     )
133 | 
134 |     // Test CPU count update
135 |     try vm.setCpuCount(4)
136 |     #expect(vm.vmDirContext.config.cpuCount == 4)
137 | 
138 |     // Test memory size update
139 |     try vm.setMemorySize(4096)
140 |     #expect(vm.vmDirContext.config.memorySize == 4096)
141 | 
142 |     // Test MAC address update
143 |     try vm.setMacAddress("00:11:22:33:44:66")
144 |     #expect(vm.vmDirContext.config.macAddress == "00:11:22:33:44:66")
145 | }
146 | 
147 | @MainActor
148 | @Test("VM setup process")
149 | func testVMSetup() async throws {
150 |     let tempDir = try createTempDirectory()
151 |     let vmDir = try setupVMDirectory(tempDir)
152 |     var config = try VMConfig(
153 |         os: "mock-os",
154 |         cpuCount: 1,
155 |         memorySize: 1024,
156 |         diskSize: 1024,
157 |         display: "1024x768"
158 |     )
159 |     config.setMacAddress("00:11:22:33:44:55")
160 |     let home = Home(fileManager: FileManager.default)
161 |     let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)
162 | 
163 |     let vm = MockVM(
164 |         vmDirContext: context,
165 |         virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
166 |         vncServiceFactory: { MockVNCService(vmDirectory: $0) }
167 |     )
168 | 
169 |     let expectedDiskSize: UInt64 = 64 * 1024 * 1024 * 1024  // 64 GB
170 | 
171 |     try await vm.setup(
172 |         ipswPath: "/path/to/mock.ipsw",
173 |         cpuCount: 2,
174 |         memorySize: 2048,
175 |         diskSize: expectedDiskSize,
176 |         display: "1024x768"
177 |     )
178 | 
179 |     #expect(vm.vmDirContext.config.cpuCount == 2)
180 |     #expect(vm.vmDirContext.config.memorySize == 2048)
181 |     let actualDiskSize = vm.vmDirContext.config.diskSize ?? 0
182 |     #expect(
183 |         actualDiskSize == expectedDiskSize,
184 |         "Expected disk size \(expectedDiskSize), but got \(actualDiskSize)")
185 |     #expect(vm.vmDirContext.config.macAddress == "00:11:22:33:44:55")
186 | }
187 | 
188 | private func createTempDirectory() throws -> URL {
189 |     let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString)
190 |     try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true)
191 |     return tempDir
192 | }
193 | 
```

--------------------------------------------------------------------------------
/docs/content/docs/libraries/lume/cli-reference.mdx:
--------------------------------------------------------------------------------

```markdown
  1 | ---
  2 | title: Lume CLI Reference
  3 | description: Command Line Interface reference for Lume
  4 | ---
  5 | 
  6 | import { Callout } from 'fumadocs-ui/components/callout';
  7 | 
  8 | Once installed, you can start using Lume with these common workflows:
  9 | 
 10 | ### Run a Prebuilt VM
 11 | 
 12 | ```bash
 13 | # Run a macOS Sequoia VM
 14 | lume run macos-sequoia-vanilla:latest
 15 | 
 16 | # Run an Ubuntu VM
 17 | lume run ubuntu-noble-vanilla:latest
 18 | ```
 19 | 
 20 | <Callout>
 21 | We provide [prebuilt VM images](../lume/prebuilt-images) in our [ghcr registry](https://github.com/orgs/trycua/packages).
 22 | </Callout>
 23 | 
 24 | ### Create a Custom VM
 25 | 
 26 | ```bash
 27 | # Create a new macOS VM
 28 | lume create my-macos-vm --cpu 4 --memory 8GB --disk-size 50GB
 29 | 
 30 | # Create a Linux VM
 31 | lume create my-linux-vm --os linux --cpu 2 --memory 4GB
 32 | ```
 33 | 
 34 | <Callout title="Disk Space">
 35 | The actual disk space used by sparse images will be much lower than the logical size listed. You can resize VM disks after creation using `lume set <name> --disk-size <size>`.
 36 | </Callout>
 37 | 
 38 | ## VM Management
 39 | 
 40 | ### lume create &lt;name&gt;
 41 | Create a new macOS or Linux virtual machine.
 42 | 
 43 | **Options:**
 44 | - `--os <os>` - Operating system to install (macOS or linux, default: macOS)
 45 | - `--cpu <cores>` - Number of CPU cores (default: 4)
 46 | - `--memory <size>` - Memory size, e.g., 8GB (default: 4GB)
 47 | - `--disk-size <size>` - Disk size, e.g., 50GB (default: 40GB)
 48 | - `--display <res>` - Display resolution (default: 1024x768)
 49 | - `--ipsw <path>` - Path to IPSW file or 'latest' for macOS VMs
 50 | - `--storage <name>` - VM storage location to use
 51 | 
 52 | **Examples:**
 53 | ```bash
 54 | # Create macOS VM with custom specs
 55 | lume create my-mac --cpu 6 --memory 16GB --disk-size 100GB
 56 | 
 57 | # Create Linux VM
 58 | lume create my-ubuntu --os linux --cpu 2 --memory 8GB
 59 | 
 60 | # Create macOS VM with latest IPSW
 61 | lume create my-sequoia --ipsw latest
 62 | ```
 63 | 
 64 | ### lume run &lt;name&gt;
 65 | Start and run a virtual machine.
 66 | 
 67 | **Options:**
 68 | - `--no-display` - Do not start the VNC client app
 69 | - `--shared-dir <dir>` - Share directory with VM (format: path[:ro|rw])
 70 | - `--mount <path>` - For Linux VMs only, attach a read-only disk image
 71 | - `--registry <url>` - Container registry URL (default: ghcr.io)
 72 | - `--organization <org>` - Organization to pull from (default: trycua)
 73 | - `--vnc-port <port>` - Port to use for the VNC server (default: 0 for auto-assign)
 74 | - `--recovery-mode <boolean>` - For macOS VMs only, start VM in recovery mode (default: false)
 75 | - `--storage <name>` - VM storage location to use
 76 | 
 77 | **Examples:**
 78 | ```bash
 79 | # Run VM with shared directory
 80 | lume run my-vm --shared-dir /path/to/share:rw
 81 | 
 82 | # Run VM without display (headless)
 83 | lume run my-vm --no-display
 84 | 
 85 | # Run macOS VM in recovery mode
 86 | lume run my-mac --recovery-mode true
 87 | ```
 88 | 
 89 | ### lume stop &lt;name&gt;
 90 | Stop a running virtual machine.
 91 | 
 92 | **Options:**
 93 | - `--storage <name>` - VM storage location to use
 94 | 
 95 | ### lume delete &lt;name&gt;
 96 | Delete a virtual machine and its associated files.
 97 | 
 98 | **Options:**
 99 | - `--force` - Force deletion without confirmation
100 | - `--storage <name>` - VM storage location to use
101 | 
102 | ### lume clone &lt;name&gt; &lt;new-name&gt;
103 | Create a copy of an existing virtual machine.
104 | 
105 | **Options:**
106 | - `--source-storage <name>` - Source VM storage location
107 | - `--dest-storage <name>` - Destination VM storage location
108 | 
109 | ## VM Information and Configuration
110 | 
111 | ### lume ls
112 | List all virtual machines and their status.
113 | 
114 | ### lume get &lt;name&gt;
115 | Get detailed information about a specific virtual machine.
116 | 
117 | **Options:**
118 | - `-f, --format <format>` - Output format (json|text)
119 | - `--storage <name>` - VM storage location to use
120 | 
121 | ### lume set &lt;name&gt;
122 | Modify virtual machine configuration.
123 | 
124 | **Options:**
125 | - `--cpu <cores>` - New number of CPU cores (e.g., 4)
126 | - `--memory <size>` - New memory size (e.g., 8192MB or 8GB)
127 | - `--disk-size <size>` - New disk size (e.g., 40960MB or 40GB)
128 | - `--display <res>` - New display resolution in format WIDTHxHEIGHT (e.g., 1024x768)
129 | - `--storage <name>` - VM storage location to use
130 | 
131 | **Examples:**
132 | ```bash
133 | # Increase VM memory
134 | lume set my-vm --memory 16GB
135 | 
136 | # Change display resolution
137 | lume set my-vm --display 1920x1080
138 | 
139 | # Add more CPU cores
140 | lume set my-vm --cpu 8
141 | ```
142 | 
143 | ## Image Management
144 | 
145 | ### lume images
146 | List available macOS images in local cache.
147 | 
148 | ### lume pull &lt;image&gt;
149 | Download a VM image from a container registry.
150 | 
151 | **Options:**
152 | - `--registry <url>` - Container registry URL (default: ghcr.io)
153 | - `--organization <org>` - Organization to pull from (default: trycua)
154 | - `--storage <name>` - VM storage location to use
155 | 
156 | ### lume push &lt;name&gt; &lt;image:tag&gt;
157 | Upload a VM image to a container registry.
158 | 
159 | **Options:**
160 | - `--additional-tags <tags...>` - Additional tags to push the same image to
161 | - `--registry <url>` - Container registry URL (default: ghcr.io)
162 | - `--organization <org>` - Organization/user to push to (default: trycua)
163 | - `--storage <name>` - VM storage location to use
164 | - `--chunk-size-mb <size>` - Chunk size for disk image upload in MB (default: 512)
165 | - `--verbose` - Enable verbose logging
166 | - `--dry-run` - Prepare files and show plan without uploading
167 | - `--reassemble` - Verify integrity by reassembling chunks (requires --dry-run)
168 | 
169 | ### lume ipsw
170 | Get the latest macOS restore image URL.
171 | 
172 | ### lume prune
173 | Remove cached images to free up disk space.
174 | 
175 | ## Configuration
176 | 
177 | ### lume config
178 | Manage Lume configuration settings.
179 | 
180 | **Subcommands:**
181 | 
182 | #### Storage Management
183 | - `lume config storage add <name> <path>` - Add a new VM storage location
184 | - `lume config storage remove <name>` - Remove a VM storage location
185 | - `lume config storage list` - List all VM storage locations
186 | - `lume config storage default <name>` - Set the default VM storage location
187 | 
188 | #### Cache Management
189 | - `lume config cache get` - Get current cache directory
190 | - `lume config cache set <path>` - Set cache directory
191 | 
192 | #### Image Caching
193 | - `lume config caching get` - Show current caching status
194 | - `lume config caching set <boolean>` - Enable or disable image caching
195 | 
196 | ## API Server
197 | 
198 | ### lume serve
199 | Start the Lume API server for programmatic access.
200 | 
201 | **Options:**
202 | - `--port <port>` - Port to listen on (default: 7777)
203 | 
204 | ## Global Options
205 | 
206 | These options are available for all commands:
207 | 
208 | - `--help` - Show help information
209 | - `--version` - Show version number
```
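
For readers who want to drive the commands documented above from a script rather than a shell, here is a minimal, hedged sketch. It assumes the `lume` binary is on `PATH` and that a VM named `my-vm` already exists; it only uses flags shown in the reference above (`lume get -f json`, `lume set --memory/--cpu`) and makes no assumptions about the exact JSON fields returned.

```python
import json
import subprocess


def get_vm_info(name: str) -> dict:
    # `lume get <name> -f json` is documented above; the returned fields are printed as-is.
    result = subprocess.run(
        ["lume", "get", name, "-f", "json"],
        capture_output=True, text=True, check=True,
    )
    return json.loads(result.stdout)


def resize_vm(name: str, memory: str = "8GB", cpu: int = 4) -> None:
    # Mirrors `lume set <name> --memory ... --cpu ...` from the reference above.
    subprocess.run(["lume", "set", name, "--memory", memory, "--cpu", str(cpu)], check=True)


if __name__ == "__main__":
    print(get_vm_info("my-vm"))  # assumes a VM named "my-vm" already exists
    resize_vm("my-vm")
```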

--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/agent-loops.mdx:
--------------------------------------------------------------------------------

```markdown
  1 | ---
  2 | title: Agent Loops
  3 | description: Supported computer-using agent loops and models
  4 | ---
  5 | 
  6 | <Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.</Callout>
  7 | 
  8 | An agent can be thought of as a loop - it generates actions, executes them, and repeats until done:
  9 | 
 10 | 1. **Generate**: Your `model` generates `output_text`, `computer_call`, `function_call`
 11 | 2. **Execute**: The `computer` safely executes those items
 12 | 3. **Complete**: If the model has no more calls, it's done!
 13 | 
 14 | To run an agent loop simply do:
 15 | 
 16 | ```python
 17 | from agent import ComputerAgent
 18 | import asyncio
 19 | from computer import Computer
 20 | 
 21 | 
 22 | async def take_screenshot():
 23 |     async with Computer(
 24 |         os_type="linux",
 25 |         provider_type="cloud",
 26 |         name="your-sandbox-name",
 27 |         api_key="your-api-key"
 28 |     ) as computer:
 29 | 
 30 |         agent = ComputerAgent(
 31 |             model="anthropic/claude-3-5-sonnet-20241022",
 32 |             tools=[computer],
 33 |             max_trajectory_budget=5.0
 34 |         )
 35 | 
 36 |         messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
 37 | 
 38 |         async for result in agent.run(messages):
 39 |             for item in result["output"]:
 40 |                 if item["type"] == "message":
 41 |                     print(item["content"][0]["text"])
 42 | 
 43 | 
 44 | if __name__ == "__main__":
 45 |     asyncio.run(take_screenshot())
 46 | ```
 47 | 
 48 | For a list of supported models and configurations, see the [Supported Agents](./supported-agents/computer-use-agents) page.
 49 | 
 50 | ### Response Format
 51 | 
 52 | ```python
 53 | {
 54 |     "output": [
 55 |         {
 56 |             "type": "message",
 57 |             "role": "assistant",
 58 |             "content": [{"type": "output_text", "text": "I can see..."}]
 59 |         },
 60 |         {
 61 |             "type": "computer_call",
 62 |             "action": {"type": "screenshot"},
 63 |             "call_id": "call_123"
 64 |         },
 65 |         {
 66 |             "type": "computer_call_output",
 67 |             "call_id": "call_123",
 68 |             "output": {"image_url": "data:image/png;base64,..."}
 69 |         }
 70 |     ],
 71 |     "usage": {
 72 |         "prompt_tokens": 150,
 73 |         "completion_tokens": 75,
 74 |         "total_tokens": 225,
 75 |         "response_cost": 0.01,
 76 |     }
 77 | }
 78 | ```
 79 | 
 80 | ### Environment Variables
 81 | 
 82 | Use the following environment variables to configure the agent and its access to cloud computers and LLM providers:
 83 | 
 84 | ```bash
 85 | # Computer instance (cloud)
 86 | export CUA_CONTAINER_NAME="your-container-name"
 87 | export CUA_API_KEY="your-cua-api-key"
 88 | 
 89 | # LLM API keys
 90 | export ANTHROPIC_API_KEY="your-anthropic-key"
 91 | export OPENAI_API_KEY="your-openai-key"
 92 | ```
 93 | 
 94 | ### Input and output
 95 | 
 96 | The input prompt passed to `Agent.run` can either be a string or a list of message dictionaries:
 97 | 
 98 | ```python
 99 | messages = [
100 |     {
101 |         "role": "user",
102 |         "content": "Take a screenshot and describe what you see"
103 |     },
104 |     {
105 |         "role": "assistant", 
106 |         "content": "I'll take a screenshot for you."
107 |     }
108 | ]
109 | ```
110 | 
111 | The output is an AsyncGenerator that yields response chunks.
112 | 
113 | ### Parameters
114 | 
115 | The `ComputerAgent` constructor provides a wide range of options for customizing agent behavior, tool integration, callbacks, resource management, and more.
116 | 
117 | - `model` (`str`): Default: **required**
118 |   The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided. (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
119 | - `tools` (`List[Any]`):
120 |   List of tools the agent can use (e.g., `Computer`, sandboxed Python functions, etc.).
121 | - `custom_loop` (`Callable`):
122 |   Optional custom agent loop function. If provided, overrides automatic loop selection.
123 | - `only_n_most_recent_images` (`int`):
124 |   If set, only the N most recent images are kept in the message history. Useful for limiting memory usage. Automatically adds `ImageRetentionCallback`.
125 | - `callbacks` (`List[Any]`):
126 |   List of callback instances for advanced preprocessing, postprocessing, logging, or custom hooks. See [Callbacks & Extensibility](#callbacks--extensibility).
127 | - `verbosity` (`int`):
128 |   Logging level (e.g., `logging.INFO`). If set, adds a logging callback.
129 | - `trajectory_dir` (`str`):
130 |   Directory path to save full trajectory data, including screenshots and responses. Adds `TrajectorySaverCallback`.
131 | - `max_retries` (`int`): Default: `3`
132 |   Maximum number of retries for failed API calls (default: 3).
133 | - `screenshot_delay` (`float` | `int`): Default: `0.5`
134 |   Delay (in seconds) before taking screenshots (default: 0.5).
135 | - `use_prompt_caching` (`bool`): Default: `False`
136 |   Enables prompt caching for repeated prompts (mainly for Anthropic models).
137 | - `max_trajectory_budget` (`float` | `dict`):
138 |   If set (float or dict), adds a budget manager callback that tracks usage costs and stops execution if the budget is exceeded. Dict allows advanced options (e.g., `{ "max_budget": 5.0, "raise_error": True }`).
139 | - `**kwargs` (`any`):
140 |   Any additional keyword arguments are passed through to the agent loop or model provider.
141 | 
142 | **Example with advanced options:**
143 | 
144 | ```python
145 | from agent import ComputerAgent
146 | from computer import Computer
147 | from agent.callbacks import ImageRetentionCallback
148 | import logging
149 | agent = ComputerAgent(
150 |     model="anthropic/claude-3-5-sonnet-20241022",
151 |     tools=[Computer(...)],
152 |     only_n_most_recent_images=3,
153 |     callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)],
154 |     verbosity=logging.INFO,
155 |     trajectory_dir="trajectories",
156 |     max_retries=5,
157 |     screenshot_delay=1.0,
158 |     use_prompt_caching=True,
159 |     max_trajectory_budget={"max_budget": 5.0, "raise_error": True}
160 | )
161 | ```
162 | 
163 | ### Streaming Responses
164 | 
165 | ```python
166 | async for result in agent.run(messages, stream=True):
167 |     # Process streaming chunks
168 |     for item in result["output"]:
169 |         if item["type"] == "message":
170 |             print(item["content"][0]["text"], end="", flush=True)
171 |         elif item["type"] == "computer_call":
172 |             action = item["action"]
173 |             print(f"\n[Action: {action['type']}]")
174 | ```
175 | 
176 | ### Error Handling
177 | 
178 | ```python
179 | try:
180 |     async for result in agent.run(messages):
181 |         # Process results
182 |         pass
183 | except BudgetExceededException:
184 |     print("Budget limit exceeded")
185 | except Exception as e:
186 |     print(f"Agent error: {e}")
187 | ```
188 | 
```
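
As a complement to the "Input and output" section in the page above, here is a minimal sketch of passing a plain string to `agent.run` instead of a message list. It follows the same assumptions as the page's first example: a cloud sandbox and API keys are configured, and `your-sandbox-name` / `your-api-key` are placeholders.

```python
import asyncio

from agent import ComputerAgent
from computer import Computer


async def main():
    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name="your-sandbox-name",
        api_key="your-api-key",
    ) as computer:
        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[computer])

        # A plain string is accepted in place of a list of message dicts.
        async for result in agent.run("Take a screenshot and describe what you see"):
            for item in result["output"]:
                if item["type"] == "message":
                    print(item["content"][0]["text"])


asyncio.run(main())
```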

--------------------------------------------------------------------------------
/libs/python/agent/agent/proxy/examples.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | Example usage of the proxy server and client requests.
  3 | """
  4 | import dotenv
  5 | dotenv.load_dotenv()
  6 | 
  7 | import asyncio
  8 | import json
  9 | import os
 10 | import aiohttp
 11 | from typing import Dict, Any
 12 | 
 13 | 
 14 | async def test_http_endpoint():
 15 |     """Test the HTTP /responses endpoint."""
 16 |     
 17 |     anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
 18 |     assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"
 19 | 
 20 |     # Example 1: Simple text request
 21 |     simple_request = {
 22 |         "model": "anthropic/claude-3-5-sonnet-20241022",
 23 |         "input": "Tell me a three sentence bedtime story about a unicorn.",
 24 |         "env": {
 25 |             "ANTHROPIC_API_KEY": anthropic_api_key
 26 |         }
 27 |     }
 28 |     
 29 |     # Example 2: Multi-modal request with image
 30 |     multimodal_request = {
 31 |         "model": "anthropic/claude-3-5-sonnet-20241022",
 32 |         "input": [
 33 |             {
 34 |                 "role": "user",
 35 |                 "content": [
 36 |                     {"type": "input_text", "text": "what is in this image?"},
 37 |                     {
 38 |                         "type": "input_image",
 39 |                         "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 40 |                     }
 41 |                 ]
 42 |             }
 43 |         ],
 44 |         "env": {
 45 |             "ANTHROPIC_API_KEY": anthropic_api_key
 46 |         }
 47 |     }
 48 |     
 49 |     # Example 3: Screenshot request (agent_kwargs/computer_kwargs may also be included; see curl_examples below)
 50 |     custom_request = {
 51 |         "model": "anthropic/claude-3-5-sonnet-20241022",
 52 |         "input": "Take a screenshot and tell me what you see",
 53 |         "env": {
 54 |             "ANTHROPIC_API_KEY": anthropic_api_key
 55 |         }
 56 |     }
 57 |     
 58 |     # Test requests
 59 |     base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
 60 |     # base_url = "http://localhost:8000"
 61 |     api_key = os.getenv("CUA_API_KEY")
 62 |     assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"
 63 |     
 64 |     async with aiohttp.ClientSession() as session:
 65 |         for i, request_data in enumerate([
 66 |             simple_request,
 67 |             # multimodal_request,
 68 |             custom_request
 69 |         ], 1):
 70 |             print(f"\n--- Test {i} ---")
 71 |             print(f"Request: {json.dumps(request_data, indent=2)}")
 72 |             
 73 |             try:
 74 |                 print(f"Sending request to {base_url}/responses")
 75 |                 async with session.post(
 76 |                     f"{base_url}/responses",
 77 |                     json=request_data,
 78 |                     headers={"Content-Type": "application/json", "X-API-Key": api_key}
 79 |                 ) as response:
 80 |                     result = await response.json()
 81 |                     print(f"Status: {response.status}")
 82 |                     print(f"Response: {json.dumps(result, indent=2)}")
 83 |                     
 84 |             except Exception as e:
 85 |                 print(f"Error: {e}")
 86 | 
 87 | 
 88 | def curl_examples():
 89 |     """Print curl command examples."""
 90 |     
 91 |     print("=== CURL Examples ===\n")
 92 |     
 93 |     print("1. Simple text request:")
 94 |     print("""curl http://localhost:8000/responses \\
 95 |   -H "Content-Type: application/json" \\
 96 |   -d '{
 97 |     "model": "anthropic/claude-3-5-sonnet-20241022",
 98 |     "input": "Tell me a three sentence bedtime story about a unicorn."
 99 |   }'""")
100 |     
101 |     print("\n2. Multi-modal request with image:")
102 |     print("""curl http://localhost:8000/responses \\
103 |   -H "Content-Type: application/json" \\
104 |   -d '{
105 |     "model": "anthropic/claude-3-5-sonnet-20241022",
106 |     "input": [
107 |       {
108 |         "role": "user",
109 |         "content": [
110 |           {"type": "input_text", "text": "what is in this image?"},
111 |           {
112 |             "type": "input_image",
113 |             "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
114 |           }
115 |         ]
116 |       }
117 |     ]
118 |   }'""")
119 |     
120 |     print("\n3. Request with custom configuration:")
121 |     print("""curl http://localhost:8000/responses \\
122 |   -H "Content-Type: application/json" \\
123 |   -d '{
124 |     "model": "anthropic/claude-3-5-sonnet-20241022",
125 |     "input": "Take a screenshot and tell me what you see",
126 |     "agent_kwargs": {
127 |       "save_trajectory": true,
128 |       "verbosity": 20
129 |     },
130 |     "computer_kwargs": {
131 |       "os_type": "linux",
132 |       "provider_type": "cloud"
133 |     }
134 |   }'""")
135 | 
136 | 
137 | async def test_p2p_client():
138 |     """Example P2P client using peerjs-python."""
139 |     try:
140 |         from peerjs import Peer, PeerOptions, ConnectionEventType
141 |         from aiortc import RTCConfiguration, RTCIceServer
142 |         
143 |         # Set up client peer
144 |         options = PeerOptions(
145 |             host="0.peerjs.com",
146 |             port=443,
147 |             secure=True,
148 |             config=RTCConfiguration(
149 |                 iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
150 |             )
151 |         )
152 |         
153 |         client_peer = Peer(id="test-client", peer_options=options)
154 |         await client_peer.start()
155 |         
156 |         # Connect to proxy server
157 |         connection = client_peer.connect("computer-agent-proxy")
158 |         
159 |         @connection.on(ConnectionEventType.Open)
160 |         async def connection_open():
161 |             print("Connected to proxy server")
162 |             
163 |             # Send a test request
164 |             request = {
165 |                 "model": "anthropic/claude-3-5-sonnet-20241022",
166 |                 "input": "Hello from P2P client!"
167 |             }
168 |             await connection.send(json.dumps(request))
169 |         
170 |         @connection.on(ConnectionEventType.Data)
171 |         async def connection_data(data):
172 |             print(f"Received response: {data}")
173 |             await client_peer.destroy()
174 |         
175 |         # Wait for connection
176 |         await asyncio.sleep(10)
177 |         
178 |     except ImportError:
179 |         print("P2P dependencies not available. Install peerjs-python for P2P testing.")
180 |     except Exception as e:
181 |         print(f"P2P test error: {e}")
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     import sys
186 |     
187 |     if len(sys.argv) > 1 and sys.argv[1] == "curl":
188 |         curl_examples()
189 |     elif len(sys.argv) > 1 and sys.argv[1] == "p2p":
190 |         asyncio.run(test_p2p_client())
191 |     else:
192 |         asyncio.run(test_http_endpoint())
193 | 
```

--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/diorama/safezone.py:
--------------------------------------------------------------------------------

```python
  1 | #!/usr/bin/env python3
  2 | """
  3 | UI Safezone Helper - A utility to get accurate bounds for macOS UI elements
  4 | 
  5 | This module provides helper functions to get accurate bounds for macOS UI elements
  6 | like the menubar and dock, which are needed for proper screenshot composition.
  7 | """
  8 | 
  9 | import sys
 10 | import time
 11 | from typing import Dict, Any, Optional, Tuple
 12 | 
 13 | # Import Objective-C bridge libraries
 14 | try:
 15 |     import AppKit
 16 |     from ApplicationServices import (
 17 |         AXUIElementCreateSystemWide,
 18 |         AXUIElementCreateApplication,
 19 |         AXUIElementCopyAttributeValue,
 20 |         AXUIElementCopyAttributeValues,
 21 |         kAXChildrenAttribute,
 22 |         kAXRoleAttribute,
 23 |         kAXTitleAttribute,
 24 |         kAXPositionAttribute,
 25 |         kAXSizeAttribute,
 26 |         kAXErrorSuccess,
 27 |         AXValueGetType,
 28 |         kAXValueCGSizeType,
 29 |         kAXValueCGPointType,
 30 |         AXUIElementGetTypeID,
 31 |         AXValueGetValue,
 32 |         kAXMenuBarAttribute,
 33 |     )
 34 |     from AppKit import NSWorkspace, NSRunningApplication
 35 |     import Foundation
 36 | except ImportError:
 37 |     print("Error: This script requires PyObjC to be installed.")
 38 |     print("Please install it with: pip install pyobjc")
 39 |     sys.exit(1)
 40 | 
 41 | # Constants for accessibility API
 42 | kAXErrorSuccess = 0
 43 | kAXRoleAttribute = "AXRole"
 44 | kAXSubroleAttribute = "AXSubrole"
 45 | kAXTitleAttribute = "AXTitle"
 46 | kAXPositionAttribute = "AXPosition"
 47 | kAXSizeAttribute = "AXSize"
 48 | kAXChildrenAttribute = "AXChildren"
 49 | kAXMenuBarAttribute = "AXMenuBar"
 50 | 
 51 | 
 52 | def element_attribute(element, attribute):
 53 |     """Get an attribute from an accessibility element"""
 54 |     if attribute == kAXChildrenAttribute:
 55 |         err, value = AXUIElementCopyAttributeValues(element, attribute, 0, 999, None)
 56 |         if err == kAXErrorSuccess:
 57 |             if isinstance(value, Foundation.NSArray):
 58 |                 return list(value)
 59 |             else:
 60 |                 return value
 61 |     err, value = AXUIElementCopyAttributeValue(element, attribute, None)
 62 |     if err == kAXErrorSuccess:
 63 |         return value
 64 |     return None
 65 | 
 66 | 
 67 | def element_value(element, value_type):
 68 |     """Unwrap a concrete value (e.g. CGPoint or CGSize) from an AXValue of the given type"""
 69 |     success, value = AXValueGetValue(element, value_type, None)
 70 |     if success:
 71 |         return value
 72 |     return None
 73 | 
 74 | 
 75 | def get_element_bounds(element):
 76 |     """Get the bounds of an accessibility element"""
 77 |     bounds = {
 78 |         "x": 0,
 79 |         "y": 0,
 80 |         "width": 0,
 81 |         "height": 0
 82 |     }
 83 |     
 84 |     # Get position
 85 |     position_value = element_attribute(element, kAXPositionAttribute)
 86 |     if position_value:
 87 |         position_value = element_value(position_value, kAXValueCGPointType)
 88 |         if position_value:
 89 |             bounds["x"] = position_value.x
 90 |             bounds["y"] = position_value.y
 91 |     
 92 |     # Get size
 93 |     size_value = element_attribute(element, kAXSizeAttribute)
 94 |     if size_value:
 95 |         size_value = element_value(size_value, kAXValueCGSizeType)
 96 |         if size_value:
 97 |             bounds["width"] = size_value.width
 98 |             bounds["height"] = size_value.height
 99 |             
100 |     return bounds
101 | 
102 | 
103 | def find_dock_process():
104 |     """Find the Dock process"""
105 |     running_apps = NSWorkspace.sharedWorkspace().runningApplications()
106 |     for app in running_apps:
107 |         if app.localizedName() == "Dock" and app.bundleIdentifier() == "com.apple.dock":
108 |             return app.processIdentifier()
109 |     return None
110 | 
111 | 
112 | def get_menubar_bounds():
113 |     """Get the bounds of the macOS menubar
114 |     
115 |     Returns:
116 |         Dictionary with x, y, width, height of the menubar
117 |     """
118 |     # Get the system-wide accessibility element
119 |     system_element = AXUIElementCreateSystemWide()
120 |     
121 |     # Try to find the menubar
122 |     menubar = element_attribute(system_element, kAXMenuBarAttribute)
123 |     if menubar is None:
124 |         # If we can't get it directly, try through the frontmost app
125 |         frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication()
126 |         if frontmost_app:
127 |             app_pid = frontmost_app.processIdentifier()
128 |             app_element = AXUIElementCreateApplication(app_pid)
129 |             menubar = element_attribute(app_element, kAXMenuBarAttribute)
130 |     
131 |     if menubar is None:
132 |         print("Error: Could not get menubar")
133 |         # Return default menubar bounds as fallback
134 |         return {"x": 0, "y": 0, "width": 1800, "height": 24}
135 |     
136 |     # Get menubar bounds
137 |     return get_element_bounds(menubar)
138 | 
139 | 
140 | def get_dock_bounds():
141 |     """Get the bounds of the macOS Dock
142 |     
143 |     Returns:
144 |         Dictionary with x, y, width, height of the Dock
145 |     """
146 |     dock_pid = find_dock_process()
147 |     if dock_pid is None:
148 |         print("Error: Could not find Dock process")
149 |         # Return empty bounds as fallback
150 |         return {"x": 0, "y": 0, "width": 0, "height": 0}
151 |         
152 |     # Create an accessibility element for the Dock
153 |     dock_element = AXUIElementCreateApplication(dock_pid)
154 |     if dock_element is None:
155 |         print(f"Error: Could not create accessibility element for Dock (PID {dock_pid})")
156 |         return {"x": 0, "y": 0, "width": 0, "height": 0}
157 |     
158 |     # Get the Dock's children
159 |     children = element_attribute(dock_element, kAXChildrenAttribute)
160 |     if not children or len(children) == 0:
161 |         print("Error: Could not get Dock children")
162 |         return {"x": 0, "y": 0, "width": 0, "height": 0}
163 |     
164 |     # Find the Dock's list (first child is usually the main dock list)
165 |     dock_list = None
166 |     for child in children:
167 |         role = element_attribute(child, kAXRoleAttribute)
168 |         if role == "AXList":
169 |             dock_list = child
170 |             break
171 |     
172 |     if dock_list is None:
173 |         print("Error: Could not find Dock list")
174 |         return {"x": 0, "y": 0, "width": 0, "height": 0}
175 |     
176 |     # Get the bounds of the dock list
177 |     return get_element_bounds(dock_list)
178 | 
179 | 
180 | def get_ui_element_bounds():
181 |     """Get the bounds of important UI elements like menubar and dock
182 |     
183 |     Returns:
184 |         Dictionary with menubar and dock bounds
185 |     """
186 |     menubar_bounds = get_menubar_bounds()
187 |     dock_bounds = get_dock_bounds()
188 |     
189 |     return {
190 |         "menubar": menubar_bounds,
191 |         "dock": dock_bounds
192 |     }
193 | 
194 | 
195 | if __name__ == "__main__":
196 |     # Example usage
197 |     bounds = get_ui_element_bounds()
198 |     print("Menubar bounds:", bounds["menubar"])
199 |     print("Dock bounds:", bounds["dock"])
200 | 
```
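
A rough sketch of how the bounds returned by this helper might be combined into a usable "content area" for screenshot composition. It assumes the module is importable as `safezone` (import path abbreviated for illustration), that the Dock sits at the bottom of the main display, and it uses `AppKit.NSScreen` (already a PyObjC dependency of the module) for the full screen size.

```python
import AppKit  # PyObjC, same dependency the module above already requires

from safezone import get_ui_element_bounds  # import path assumed for illustration


def usable_content_area() -> dict:
    # Full frame of the main display.
    frame = AppKit.NSScreen.mainScreen().frame()
    bounds = get_ui_element_bounds()
    menubar_h = bounds["menubar"]["height"]
    dock_h = bounds["dock"]["height"]  # 0 if the Dock could not be located (see fallbacks above)
    # Subtract the menubar (top) and Dock (assumed bottom) from the screen height.
    return {
        "x": 0,
        "y": menubar_h,
        "width": frame.size.width,
        "height": frame.size.height - menubar_h - dock_h,
    }


if __name__ == "__main__":
    print(usable_content_area())
```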

--------------------------------------------------------------------------------
/.github/workflows/pypi-publish-agent.yml:
--------------------------------------------------------------------------------

```yaml
  1 | name: Publish Agent Package
  2 | 
  3 | on:
  4 |   push:
  5 |     tags:
  6 |       - "agent-v*"
  7 |   workflow_dispatch:
  8 |     inputs:
  9 |       version:
 10 |         description: "Version to publish (without v prefix)"
 11 |         required: true
 12 |         default: "0.1.0"
 13 |   workflow_call:
 14 |     inputs:
 15 |       version:
 16 |         description: "Version to publish"
 17 |         required: true
 18 |         type: string
 19 | 
 20 | # Adding permissions at workflow level
 21 | permissions:
 22 |   contents: write
 23 | 
 24 | jobs:
 25 |   prepare:
 26 |     runs-on: macos-latest
 27 |     outputs:
 28 |       version: ${{ steps.get-version.outputs.version }}
 29 |       computer_version: ${{ steps.update-deps.outputs.computer_version }}
 30 |       som_version: ${{ steps.update-deps.outputs.som_version }}
 31 |       core_version: ${{ steps.update-deps.outputs.core_version }}
 32 |     steps:
 33 |       - uses: actions/checkout@v4
 34 | 
 35 |       - name: Determine version
 36 |         id: get-version
 37 |         run: |
 38 |           if [ "${{ github.event_name }}" == "push" ]; then
 39 |             # Extract version from tag (for package-specific tags)
 40 |             if [[ "${{ github.ref }}" =~ ^refs/tags/agent-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
 41 |               VERSION=${BASH_REMATCH[1]}
 42 |             else
 43 |               echo "Invalid tag format for agent"
 44 |               exit 1
 45 |             fi
 46 |           elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
 47 |             # Use version from workflow dispatch
 48 |             VERSION=${{ github.event.inputs.version }}
 49 |           else
 50 |             # Use version from workflow_call
 51 |             VERSION=${{ inputs.version }}
 52 |           fi
 53 |           echo "VERSION=$VERSION"
 54 |           echo "version=$VERSION" >> $GITHUB_OUTPUT
 55 | 
 56 |       - name: Set up Python
 57 |         uses: actions/setup-python@v4
 58 |         with:
 59 |           python-version: "3.11"
 60 | 
 61 |       - name: Update dependencies to latest versions
 62 |         id: update-deps
 63 |         run: |
 64 |           cd libs/python/agent
 65 | 
 66 |           # Install required package for PyPI API access
 67 |           pip install requests
 68 | 
 69 |           # Create a more robust Python script for PyPI version checking
 70 |           cat > get_latest_versions.py << 'EOF'
 71 |           import requests
 72 |           import json
 73 |           import sys
 74 | 
 75 |           def get_package_version(package_name, fallback="0.1.0"):
 76 |               try:
 77 |                   response = requests.get(f'https://pypi.org/pypi/{package_name}/json')
 78 |                   print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr)
 79 |                   
 80 |                   if response.status_code != 200:
 81 |                       print(f"API request failed for {package_name}, using fallback version", file=sys.stderr)
 82 |                       return fallback
 83 |                   
 84 |                   data = json.loads(response.text)
 85 |                   
 86 |                   if 'info' not in data:
 87 |                       print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr)
 88 |                       return fallback
 89 |                       
 90 |                   return data['info']['version']
 91 |               except Exception as e:
 92 |                   print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr)
 93 |                   return fallback
 94 | 
 95 |           # Get latest versions
 96 |           print(get_package_version('cua-computer'))
 97 |           print(get_package_version('cua-som'))
 98 |           print(get_package_version('cua-core'))
 99 |           EOF
100 | 
101 |           # Execute the script to get the versions
102 |           VERSIONS=($(python get_latest_versions.py))
103 |           LATEST_COMPUTER=${VERSIONS[0]}
104 |           LATEST_SOM=${VERSIONS[1]}
105 |           LATEST_CORE=${VERSIONS[2]}
106 | 
107 |           echo "Latest cua-computer version: $LATEST_COMPUTER"
108 |           echo "Latest cua-som version: $LATEST_SOM"
109 |           echo "Latest cua-core version: $LATEST_CORE"
110 | 
111 |           # Output the versions for the next job
112 |           echo "computer_version=$LATEST_COMPUTER" >> $GITHUB_OUTPUT
113 |           echo "som_version=$LATEST_SOM" >> $GITHUB_OUTPUT
114 |           echo "core_version=$LATEST_CORE" >> $GITHUB_OUTPUT
115 | 
116 |           # Determine major version for version constraint
117 |           COMPUTER_MAJOR=$(echo $LATEST_COMPUTER | cut -d. -f1)
118 |           SOM_MAJOR=$(echo $LATEST_SOM | cut -d. -f1)
119 |           CORE_MAJOR=$(echo $LATEST_CORE | cut -d. -f1)
120 | 
121 |           NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1))
122 |           NEXT_SOM_MAJOR=$((SOM_MAJOR + 1))
123 |           NEXT_CORE_MAJOR=$((CORE_MAJOR + 1))
124 | 
125 |           # Update dependencies in pyproject.toml
126 |           if [[ "$OSTYPE" == "darwin"* ]]; then
127 |             # macOS version of sed needs an empty string for -i
128 |             sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
129 |             sed -i '' "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml
130 |             sed -i '' "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml
131 |           else
132 |             # Linux version
133 |             sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
134 |             sed -i "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml
135 |             sed -i "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml
136 |           fi
137 | 
138 |           # Display the updated dependencies
139 |           echo "Updated dependencies in pyproject.toml:"
140 |           grep -E "cua-computer|cua-som|cua-core" pyproject.toml
141 | 
142 |   publish:
143 |     needs: prepare
144 |     uses: ./.github/workflows/pypi-reusable-publish.yml
145 |     with:
146 |       package_name: "agent"
147 |       package_dir: "libs/python/agent"
148 |       version: ${{ needs.prepare.outputs.version }}
149 |       is_lume_package: false
150 |       base_package_name: "cua-agent"
151 |     secrets:
152 |       PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
153 | 
154 |   set-env-variables:
155 |     needs: [prepare, publish]
156 |     runs-on: macos-latest
157 |     steps:
158 |       - name: Set environment variables for use in other jobs
159 |         run: |
160 |           echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> $GITHUB_ENV
161 |           echo "SOM_VERSION=${{ needs.prepare.outputs.som_version }}" >> $GITHUB_ENV
162 |           echo "CORE_VERSION=${{ needs.prepare.outputs.core_version }}" >> $GITHUB_ENV
163 | 
```
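
The "Update dependencies to latest versions" step above pins each `cua-*` dependency to "at least the latest published version, but below the next major". A tiny illustrative sketch of that constraint calculation (the workflow itself does this with `cut` and `sed`):

```python
def constraint(latest: str) -> str:
    """Turn a published version like '0.4.2' into the range '>=0.4.2,<1.0.0'."""
    major = int(latest.split(".")[0])
    return f">={latest},<{major + 1}.0.0"


assert constraint("0.4.2") == ">=0.4.2,<1.0.0"
assert constraint("1.2.3") == ">=1.2.3,<2.0.0"
```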

--------------------------------------------------------------------------------
/libs/lumier/src/lib/utils.sh:
--------------------------------------------------------------------------------

```bash
  1 | #!/usr/bin/env bash
  2 | 
  3 | # Function to wait for SSH to become available
  4 | wait_for_ssh() {
  5 |     local host_ip=$1
  6 |     local user=$2
  7 |     local password=$3
  8 |     local retry_interval=${4:-5}   # Default retry interval is 5 seconds
  9 |     local max_retries=${5:-20}    # Default maximum retries is 20 (0 for infinite)
 10 | 
 11 |     # Only show waiting message in debug mode
 12 |     if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
 13 |         echo "Waiting for SSH to become available on $host_ip..."
 14 |     fi
 15 | 
 16 |     local retry_count=0
 17 |     while true; do
 18 |         # Try to connect via SSH
 19 |         # Add -q for completely silent operation, redirect stderr to /dev/null
 20 |         sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host_ip" "exit" 2>/dev/null
 21 | 
 22 |         # Check the exit status of the SSH command
 23 |         if [ $? -eq 0 ]; then
 24 |             echo "SSH is ready on $host_ip!"
 25 |             return 0
 26 |         fi
 27 | 
 28 |         # Increment retry count
 29 |         ((retry_count++))
 30 |         
 31 |         # Exit if maximum retries are reached
 32 |         if [ $max_retries -ne 0 ] && [ $retry_count -ge $max_retries ]; then
 33 |             echo "Maximum retries reached. SSH is not available."
 34 |             return 1
 35 |         fi
 36 | 
 37 |         # Only show retry messages in debug mode
 38 |         if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
 39 |             echo "SSH not ready. Retrying in $retry_interval seconds... (Attempt $retry_count)"
 40 |         fi
 41 |         sleep $retry_interval
 42 |     done
 43 | }
 44 | 
 45 | # Function to execute a script on a remote server using sshpass
 46 | execute_remote_script() {
 47 |     local host="$1"
 48 |     local user="$2"
 49 |     local password="$3"
 50 |     local script_path="$4"
 51 |     local vnc_password="$5"
 52 |     local data_folder="$6"
 53 | 
 54 |     # Check if all required arguments are provided
 55 |     if [ -z "$host" ] || [ -z "$user" ] || [ -z "$password" ] || [ -z "$script_path" ] || [ -z "$vnc_password" ]; then
 56 |         echo "Usage: execute_remote_script <host> <user> <password> <script_path> <vnc_password> [data_folder]"
 57 |         return 1
 58 |     fi
 59 | 
 60 |     # Only show VNC info in debug mode
 61 |     if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
 62 |         echo "VNC password exported to VM: $vnc_password"
 63 |     fi
 64 | 
 65 |     # Set the shared folder path for the VM
 66 |     if [ -n "$data_folder" ]; then
 67 |         # VM always sees shared folders at this path, regardless of container path
 68 |         shared_folder_path="/Volumes/My Shared Files"
 69 |         
 70 |         # Only show path in debug mode
 71 |         if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
 72 |             echo "Data folder path in VM: $shared_folder_path"
 73 |         fi
 74 |     else
 75 |         shared_folder_path=""
 76 |     fi
 77 | 
 78 |     # Read the script content and prepend the shebang
 79 |     script_content="#!/usr/bin/env bash\n"
 80 |     # Always export VNC_PASSWORD
 81 |     script_content+="export VNC_PASSWORD='$vnc_password'\n"
 82 |     # Export SHARED_FOLDER_PATH only if we have a data folder path
 83 |     if [ -n "$shared_folder_path" ]; then
 84 |         script_content+="export SHARED_FOLDER_PATH='$shared_folder_path'\n"
 85 |     fi
 86 |     # Pass debug setting to the VM
 87 |     script_content+="export VNC_DEBUG='${LUMIER_DEBUG:-0}'\n"
 88 |     
 89 |     # Add debug messages only if debug mode is enabled
 90 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
 91 |         script_content+="echo \"[DEBUG] Starting on-logon script execution...\"\n"
 92 |     fi
 93 |     
 94 |     # Add the original script content
 95 |     script_content+="$(<"$script_path")"
 96 |     
 97 |     # Add debug messages only if debug mode is enabled
 98 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
 99 |         script_content+="\necho \"[DEBUG] Finished executing on-logon script.\"\n"
100 |     fi
101 |     
102 |     # Print debug info only when debug mode is enabled
103 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
104 |         echo "[DEBUG] Executing remote script with content length: $(echo -n "$script_content" | wc -c) bytes"
105 |         echo "[DEBUG] Script path: $script_path"
106 |     fi
107 |     
108 |     # Use a here-document to send the script content
109 |     # We'll capture both stdout and stderr when debug is enabled
110 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
111 |         echo "[DEBUG] Connecting to $user@$host to execute script..."
112 |         sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host" "bash -s -- '$vnc_password' '$data_folder'" 2>&1 <<EOF
113 | $script_content
114 | EOF
115 |     else
116 |         # Otherwise run quietly
117 |         sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host" "bash -s -- '$vnc_password' '$data_folder'" 2>/dev/null <<EOF
118 | $script_content
119 | EOF
120 |     fi
121 |     local ssh_status=$?  # capture the ssh exit status before the debug output below overwrites $?
122 |     # Print completion message only in debug mode
123 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
124 |         echo "[DEBUG] Script execution completed."
125 |     fi
126 | 
127 |     # Check the exit status of the sshpass command
128 |     if [ $ssh_status -ne 0 ]; then
129 |         echo "Failed to execute script on remote host $host."
130 |         return 1
131 |     fi
132 | }
133 | 
134 | extract_json_field() {
135 |     local field_name=$1
136 |     local input=$2
137 |     local result=""
138 |     
139 |     # First attempt with jq if available (most reliable JSON parsing)
140 |     if command -v jq &> /dev/null; then
141 |         # Use jq for reliable JSON parsing
142 |         result=$(echo "$input" | jq -r ".$field_name // empty" 2>/dev/null)
143 |         if [[ -n "$result" ]]; then
144 |             echo "$result"
145 |             return 0
146 |         fi
147 |     fi
148 |     
149 |     # Fallback to grep-based approach with improvements
150 |     # First try for quoted string values
151 |     result=$(echo "$input" | tr -d '\n' | grep -o "\"$field_name\"\s*:\s*\"[^\"]*\"" | sed -E 's/.*":\s*"(.*)"$/\1/')
152 |     if [[ -n "$result" ]]; then
153 |         echo "$result"
154 |         return 0
155 |     fi
156 |     
157 |     # Try for non-quoted values (numbers, true, false, null)
158 |     result=$(echo "$input" | tr -d '\n' | grep -o "\"$field_name\"\s*:\s*[^,}]*" | sed -E 's/.*":\s*(.*)$/\1/')
159 |     if [[ -n "$result" ]]; then
160 |         echo "$result"
161 |         return 0
162 |     fi
163 |     
164 |     # Return empty string if field not found
165 |     echo ""
166 | }
167 | 
168 | extract_json_field_from_file() {
169 |     local field_name=$1
170 |     local json_file=$2
171 |     local json_text
172 |     json_text=$(<"$json_file")
173 |     extract_json_field "$field_name" "$json_text"
174 | }
175 | 
176 | extract_json_field_from_text() {
177 |     local field_name=$1
178 |     local json_text=$2
179 |     extract_json_field "$field_name" "$json_text"
180 | }
181 | 
```

--------------------------------------------------------------------------------
/libs/lume/src/FileSystem/VMDirectory.swift:
--------------------------------------------------------------------------------

```swift
  1 | import Foundation
  2 | 
  3 | // MARK: - VMDirectory
  4 | 
  5 | /// Manages a virtual machine's directory structure and files
  6 | /// Responsible for:
  7 | /// - Managing VM configuration files
  8 | /// - Handling disk operations
  9 | /// - Managing VM state and locking
 10 | /// - Providing access to VM-related paths
 11 | struct VMDirectory: Sendable {
 12 |     // MARK: - Constants
 13 |     
 14 |     private enum FileNames {
 15 |         static let nvram = "nvram.bin"
 16 |         static let disk = "disk.img"
 17 |         static let config = "config.json"
 18 |         static let sessions = "sessions.json"
 19 |     }
 20 |     
 21 |     // MARK: - Properties
 22 |     
 23 |     let dir: Path
 24 |     let nvramPath: Path
 25 |     let diskPath: Path
 26 |     let configPath: Path
 27 |     let sessionsPath: Path
 28 |     
 29 |     /// The name of the VM directory
 30 |     var name: String { dir.name }
 31 |     
 32 |     // MARK: - Initialization
 33 |     
 34 |     /// Creates a new VMDirectory instance
 35 |     /// - Parameters:
 36 |     ///   - dir: The base directory path for the VM
 37 |     init(_ dir: Path) {
 38 |         self.dir = dir
 39 |         self.nvramPath = dir.file(FileNames.nvram)
 40 |         self.diskPath = dir.file(FileNames.disk)
 41 |         self.configPath = dir.file(FileNames.config)
 42 |         self.sessionsPath = dir.file(FileNames.sessions)
 43 |     }
 44 | }
 45 | 
 46 | // MARK: - VM State Management
 47 | 
 48 | extension VMDirectory {
 49 |     /// Checks if the VM directory is fully initialized with all required files
 50 |     func initialized() -> Bool {
 51 |         // Add detailed logging for debugging
 52 |         let configExists = configPath.exists()
 53 |         let diskExists = diskPath.exists()
 54 |         let nvramExists = nvramPath.exists()
 55 |         
 56 |         // Logger.info(
 57 |         //     "VM directory initialization check", 
 58 |         //     metadata: [
 59 |         //         "directory": dir.path,
 60 |         //         "config_path": configPath.path,
 61 |         //         "config_exists": "\(configExists)",
 62 |         //         "disk_path": diskPath.path,
 63 |         //         "disk_exists": "\(diskExists)",
 64 |         //         "nvram_path": nvramPath.path,
 65 |         //         "nvram_exists": "\(nvramExists)"
 66 |         //     ]
 67 |         // )
 68 |         
 69 |         return configExists && diskExists && nvramExists
 70 |     }
 71 | 
 72 |     /// Checks if the VM directory exists
 73 |     func exists() -> Bool {
 74 |         dir.exists()
 75 |     }
 76 | }
 77 | 
 78 | // MARK: - Disk Management
 79 | 
 80 | extension VMDirectory {
 81 |     /// Resizes the VM's disk to the specified size
 82 |     /// - Parameter size: The new size in bytes
 83 |     /// - Throws: VMDirectoryError if the disk operation fails
 84 |     func setDisk(_ size: UInt64) throws {
 85 |         if !diskPath.exists() {
 86 |             guard FileManager.default.createFile(atPath: diskPath.path, contents: nil) else {
 87 |                 throw VMDirectoryError.fileCreationFailed(diskPath.path)
 88 |             }
 89 |         }
 90 |         do {
 91 |             let handle = try FileHandle(forWritingTo: diskPath.url)
 92 |             defer { try? handle.close() }
 93 |             try handle.truncate(atOffset: size)
 94 |         } catch {
 95 |             // Surface disk resize failures instead of silently swallowing them
 96 |             throw VMDirectoryError.fileCreationFailed(diskPath.path)
 97 |         }
 98 |     }
 99 | }
101 | // MARK: - Configuration Management
102 | 
103 | extension VMDirectory {
104 |     /// Saves the VM configuration to disk
105 |     /// - Parameter config: The configuration to save
106 |     /// - Throws: VMDirectoryError if the save operation fails
107 |     func saveConfig(_ config: VMConfig) throws {
108 |         let encoder = JSONEncoder()
109 |         encoder.outputFormatting = .prettyPrinted
110 |         
111 |         do {
112 |             let data = try encoder.encode(config)
113 |             guard FileManager.default.createFile(atPath: configPath.path, contents: data) else {
114 |                 throw VMDirectoryError.fileCreationFailed(configPath.path)
115 |             }
116 |         } catch {
117 |             throw VMDirectoryError.invalidConfigData
118 |         }
119 |     }
120 | 
121 |     /// Loads the VM configuration from disk
122 |     /// - Returns: The loaded configuration
123 |     /// - Throws: VMDirectoryError if the load operation fails
124 |     func loadConfig() throws -> VMConfig {
125 |         guard let data = FileManager.default.contents(atPath: configPath.path) else {
126 |             throw VMDirectoryError.configNotFound
127 |         }
128 |         
129 |         do {
130 |             let decoder = JSONDecoder()
131 |             return try decoder.decode(VMConfig.self, from: data)
132 |         } catch {
133 |             throw VMDirectoryError.invalidConfigData
134 |         }
135 |     }
136 | }
137 | 
138 | // MARK: - VNC Session Management
139 | 
140 | struct VNCSession: Codable {
141 |     let url: String
142 |     let sharedDirectories: [SharedDirectory]?
143 |     
144 |     init(url: String, sharedDirectories: [SharedDirectory]? = nil) {
145 |         self.url = url
146 |         self.sharedDirectories = sharedDirectories
147 |     }
148 | }
149 | 
150 | extension VMDirectory {
151 |     /// Saves VNC session information to disk
152 |     /// - Parameters:
153 |     ///   - session: The VNC session to save
154 |     ///   - sharedDirectories: Optional array of shared directories to save with the session
155 |     /// - Throws: VMDirectoryError if the save operation fails
156 |     func saveSession(_ session: VNCSession) throws {
157 |         let encoder = JSONEncoder()
158 |         encoder.outputFormatting = .prettyPrinted
159 |         
160 |         do {
161 |             let data = try encoder.encode(session)
162 |             guard FileManager.default.createFile(atPath: sessionsPath.path, contents: data) else {
163 |                 throw VMDirectoryError.fileCreationFailed(sessionsPath.path)
164 |             }
165 |         } catch {
166 |             throw VMDirectoryError.invalidSessionData
167 |         }
168 |     }
169 |     
170 |     /// Loads the VNC session information from disk
171 |     /// - Returns: The loaded VNC session
172 |     /// - Throws: VMDirectoryError if the load operation fails
173 |     func loadSession() throws -> VNCSession {
174 |         guard let data = FileManager.default.contents(atPath: sessionsPath.path) else {
175 |             throw VMDirectoryError.sessionNotFound
176 |         }
177 |         
178 |         do {
179 |             let decoder = JSONDecoder()
180 |             return try decoder.decode(VNCSession.self, from: data)
181 |         } catch {
182 |             throw VMDirectoryError.invalidSessionData
183 |         }
184 |     }
185 |     
186 |     /// Removes the VNC session information from disk
187 |     func clearSession() {
188 |         try? FileManager.default.removeItem(atPath: sessionsPath.path)
189 |     }
190 | }
191 | 
192 | // MARK: - CustomStringConvertible
193 | extension VMDirectory: CustomStringConvertible {
194 |     var description: String {
195 |         "VMDirectory(path: \(dir.path))"
196 |     }
197 | }
198 | 
199 | extension VMDirectory {
200 |     func delete() throws {
201 |         try FileManager.default.removeItem(atPath: dir.path)
202 |     }
203 | }
204 | 
```

--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/ss-pro.py:
--------------------------------------------------------------------------------

```python
  1 | #!/usr/bin/env python3
  2 | """
  3 | ScreenSpot-Pro Benchmark Script
  4 | 
  5 | Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
  6 | Supports both ComputerAgent model strings and custom model classes.
  7 | """
  8 | 
  9 | import argparse
 10 | import asyncio
 11 | import random
 12 | import statistics
 13 | import time
 14 | from typing import Optional
 15 | 
 16 | from datasets import load_dataset
 17 | from tqdm import tqdm
 18 | 
 19 | from utils import (
 20 |     ModelWrapper, 
 21 |     is_click_in_bbox, 
 22 |     save_results_to_markdown, 
 23 |     save_visualizations,
 24 |     get_available_models,
 25 |     get_gpu_memory
 26 | )
 27 | 
 28 | 
 29 | async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Optional[int] = None) -> dict:
 30 |     """
 31 |     Evaluate a model on the ScreenSpot-Pro dataset.
 32 |     
 33 |     Args:
 34 |         model_wrapper: ModelWrapper instance
 35 |         dataset: ScreenSpot-Pro dataset (list of samples)
 36 |         max_samples: Maximum number of samples to evaluate (None for all)
 37 |         
 38 |     Returns:
 39 |         Dictionary with evaluation results
 40 |     """
 41 |     print(f"\nEvaluating model: {model_wrapper.model_name}")
 42 |     
 43 |     # Load model
 44 |     await model_wrapper.load_model()
 45 |     
 46 |     total_samples = len(dataset)
 47 |     if max_samples is not None:
 48 |         total_samples = min(max_samples, total_samples)
 49 |     
 50 |     correct_predictions = 0
 51 |     error_predictions = 0
 52 |     results = []
 53 |     
 54 |     for i in tqdm(range(total_samples), desc=f"Evaluating {model_wrapper.model_name}"):
 55 |         sample = dataset[i]
 56 |         
 57 |         # Extract sample data
 58 |         image = sample['image']
 59 |         instruction = sample['instruction']
 60 |         bbox = sample['bbox']  # [x1, y1, x2, y2]
 61 |         sample_id = sample['img_filename']
 62 |         
 63 |         # Predict click coordinates with timing
 64 |         start_time = time.time()
 65 |         click_coords = await model_wrapper.predict_click(image, instruction)
 66 |         prediction_time = time.time() - start_time
 67 |         
 68 |         # Check if prediction is correct
 69 |         is_correct = is_click_in_bbox(click_coords, bbox)
 70 |         
 71 |         if is_correct:
 72 |             correct_predictions += 1
 73 |         
 74 |         results.append({
 75 |             'id': sample_id,
 76 |             'instruction': instruction,
 77 |             'bbox': bbox,
 78 |             'predicted_coords': click_coords,
 79 |             'is_correct': is_correct,
 80 |             'failed': False,
 81 |             'prediction_time': prediction_time
 82 |         })
 83 |     
 84 |     # Unload model
 85 |     await model_wrapper.unload_model()
 86 |     
 87 |     # Calculate metrics
 88 |     accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0
 89 |     error_rate = error_predictions / total_samples if total_samples > 0 else 0.0
 90 |     
 91 |     # Calculate timing statistics
 92 |     successful_times = [r['prediction_time'] for r in results if not r['failed']]
 93 |     avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
 94 |     median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
 95 |     min_prediction_time = min(successful_times) if successful_times else 0.0
 96 |     max_prediction_time = max(successful_times) if successful_times else 0.0
 97 |     
 98 |     # Get VRAM statistics
 99 |     vram_stats = model_wrapper.get_vram_stats()
100 |     
101 |     return {
102 |         'model_name': model_wrapper.model_name,
103 |         'total_samples': total_samples,
104 |         'correct_predictions': correct_predictions,
105 |         'failed_predictions': error_predictions,
106 |         'accuracy': accuracy,
107 |         'failure_rate': error_rate,
108 |         'avg_prediction_time': avg_prediction_time,
109 |         'median_prediction_time': median_prediction_time,
110 |         'min_prediction_time': min_prediction_time,
111 |         'max_prediction_time': max_prediction_time,
112 |         'vram_max_mb': vram_stats['max_mb'],
113 |         'vram_avg_mb': vram_stats['avg_mb'],
114 |         'results': results
115 |     }
116 | 
117 | 
118 | async def main():
119 |     """
120 |     Main function to run the benchmark.
121 |     """
122 |     # Parse command line arguments
123 |     parser = argparse.ArgumentParser(description='ScreenSpot-Pro Benchmark Script')
124 |     parser.add_argument('--samples', type=int, default=300, 
125 |                        help='Number of samples to evaluate (default: 300)')
126 |     parser.add_argument('--seed', type=int, default=42,
127 |                        help='Random seed for shuffling (default: 42)')
128 |     args = parser.parse_args()
129 |     
130 |     # Set random seed
131 |     random.seed(args.seed)
132 |     
133 |     # Load dataset
134 |     print("Loading ScreenSpot-Pro dataset...")
135 |     ds = load_dataset("lmms-lab/ScreenSpot-Pro")
136 |     dataset = ds['train'] # type: ignore
137 |     # Convert to list to support indexing
138 |     dataset_list = list(dataset)
139 |     print(f"Dataset loaded: {len(dataset_list)} samples")
140 |     
141 |     # Shuffle dataset with seed
142 |     random.shuffle(dataset_list)
143 |     print(f"Dataset shuffled with seed {args.seed}")
144 |     
145 |     # Get available models
146 |     models = get_available_models()
147 |     
148 |     # Evaluation settings
149 |     max_samples = args.samples  # Use command line argument
150 |     
151 |     # Run evaluations
152 |     all_results = []
153 |     
154 |     for model in models:
155 |         model_wrapper = ModelWrapper(model)
156 |         result = await evaluate_model(model_wrapper, dataset_list, max_samples)
157 |         all_results.append(result)
158 |         
159 |         # Print summary
160 |         print(f"\n{result['model_name']} Results:")
161 |         print(f"  Accuracy: {result['accuracy']*100:.2f}%")
162 |         print(f"  Correct: {result['correct_predictions']}/{result['total_samples']}")
163 |         print(f"  Errors: {result['failed_predictions']}")
164 |         print(f"  Error Rate: {result['failure_rate']*100:.2f}%")
165 |         print(f"  Avg Time: {result['avg_prediction_time']:.2f}s")
166 |         print(f"  Median Time: {result['median_prediction_time']:.2f}s")
167 |         print(f"  Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
168 |         print(f"  VRAM Max: {result['vram_max_mb']:.1f}MB")
169 |         print(f"  VRAM Avg: {result['vram_avg_mb']:.1f}MB")
170 |         
171 |         # Print GPU memory info
172 |         gpu_memory = get_gpu_memory()
173 |         if gpu_memory and gpu_memory[0] > 0:
174 |             print(f"  GPU Free Memory: {gpu_memory[0]:.1f}MB")
175 |     
176 |     # Save results
177 |     if all_results:
178 |         save_results_to_markdown(all_results)
179 |         save_visualizations(all_results, dataset_list)
180 |         print("\nBenchmark completed successfully!")
181 |     else:
182 |         print("\nNo successful evaluations completed.")
183 | 
184 | 
185 | if __name__ == "__main__":
186 |     asyncio.run(main())
```
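
The accuracy metric above hinges on the `is_click_in_bbox` helper imported from `utils`, which is not shown on this page. Given the `[x1, y1, x2, y2]` box format noted in the evaluation loop, it presumably reduces to a point-in-rectangle test along these lines (a minimal sketch, not the repository's implementation):

```python
def is_click_in_bbox(click_coords, bbox):
    """Hedged sketch: True when a predicted (x, y) point lands inside [x1, y1, x2, y2]."""
    if click_coords is None:  # a model may fail to return a point at all
        return False
    x, y = click_coords
    x1, y1, x2, y2 = bbox
    return x1 <= x <= x2 and y1 <= y <= y2
```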
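While the script is normally driven by `main()` and the models returned by `get_available_models()`, `evaluate_model` can also be exercised directly. The snippet below is a hypothetical usage sketch: the module name `screenspot_pro` and the string model identifier are placeholders, not names taken from the file above (only the single-argument `ModelWrapper(model)` call mirrors the script).

```python
import asyncio

from datasets import load_dataset

from screenspot_pro import evaluate_model  # hypothetical module name for the script above
from utils import ModelWrapper

# Placeholder model id; in the script itself, entries come from get_available_models().
wrapper = ModelWrapper("example/model-id")

# Load the same dataset split the benchmark uses and evaluate a small subset.
samples = list(load_dataset("lmms-lab/ScreenSpot-Pro")["train"])
summary = asyncio.run(evaluate_model(wrapper, samples, max_samples=10))
print(f"{summary['model_name']}: {summary['accuracy'] * 100:.1f}% accuracy")
```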