This is page 10 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── windows-sandbox.md ├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── 
Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ 
│ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift │ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ 
├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ ├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py 
│ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/models/internvl.py: -------------------------------------------------------------------------------- ```python 1 | from __future__ import annotations 2 | from typing import List, Dict, Any, Optional 3 | 4 | # Hugging Face imports are local to 
avoid hard dependency at module import 5 | try: 6 | import torch # type: ignore 7 | from transformers import AutoModel, AutoTokenizer # type: ignore 8 | # Attempt to import InternVL's model dependencies 9 | import einops as _ # type: ignore 10 | import timm as _ # type: ignore 11 | from PIL import Image # type: ignore 12 | import torchvision.transforms as T # type: ignore 13 | from torchvision.transforms.functional import InterpolationMode # type: ignore 14 | import base64 # type: ignore 15 | from io import BytesIO # type: ignore 16 | import requests # type: ignore 17 | HF_AVAILABLE = True 18 | except Exception: 19 | HF_AVAILABLE = False 20 | 21 | 22 | class InternVLModel: 23 | """Generic Hugging Face vision-language model handler. 24 | Uses InternVL's native `model.chat()` interface with `AutoTokenizer`. 25 | Provides preprocessing to support multi-turn conversations with multiple images. 26 | """ 27 | 28 | def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: 29 | if not HF_AVAILABLE: 30 | raise ImportError( 31 | "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\"" 32 | ) 33 | self.model_name = model_name 34 | self.device = device 35 | self.model = None 36 | self.tokenizer = None 37 | self.trust_remote_code = trust_remote_code 38 | self._load() 39 | 40 | def _load(self) -> None: 41 | # Load model 42 | self.model = AutoModel.from_pretrained( 43 | self.model_name, 44 | torch_dtype=torch.bfloat16, 45 | low_cpu_mem_usage=True, 46 | use_flash_attn=True, 47 | device_map=self.device, 48 | trust_remote_code=self.trust_remote_code, 49 | ).eval() 50 | # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False) 51 | self.tokenizer = AutoTokenizer.from_pretrained( 52 | self.model_name, 53 | trust_remote_code=self.trust_remote_code, 54 | use_fast=False, 55 | ) 56 | 57 | # ---- Image preprocessing utilities adapted from InternVL docs ---- 58 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 59 | IMAGENET_STD = (0.229, 0.224, 0.225) 60 | 61 | def _build_transform(self, input_size: int) -> T.Compose: 62 | MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD 63 | transform = T.Compose([ 64 | T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), 65 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), 66 | T.ToTensor(), 67 | T.Normalize(mean=MEAN, std=STD) 68 | ]) 69 | return transform 70 | 71 | def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int): 72 | best_ratio_diff = float('inf') 73 | best_ratio = (1, 1) 74 | area = width * height 75 | for ratio in target_ratios: 76 | target_aspect_ratio = ratio[0] / ratio[1] 77 | ratio_diff = abs(aspect_ratio - target_aspect_ratio) 78 | if ratio_diff < best_ratio_diff: 79 | best_ratio_diff = ratio_diff 80 | best_ratio = ratio 81 | elif ratio_diff == best_ratio_diff: 82 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: 83 | best_ratio = ratio 84 | return best_ratio 85 | 86 | def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]: 87 | orig_width, orig_height = image.size 88 | aspect_ratio = orig_width / orig_height 89 | 90 | target_ratios = set( 91 | (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if 92 | i * j <= max_num and i * j >= min_num) 93 | target_ratios = sorted(target_ratios, key=lambda x: 
x[0] * x[1]) 94 | 95 | target_aspect_ratio = self._find_closest_aspect_ratio( 96 | aspect_ratio, target_ratios, orig_width, orig_height, image_size) 97 | 98 | target_width = image_size * target_aspect_ratio[0] 99 | target_height = image_size * target_aspect_ratio[1] 100 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1] 101 | 102 | resized_img = image.resize((target_width, target_height)) 103 | processed_images: List[Image.Image] = [] 104 | for i in range(blocks): 105 | box = ( 106 | (i % (target_width // image_size)) * image_size, 107 | (i // (target_width // image_size)) * image_size, 108 | ((i % (target_width // image_size)) + 1) * image_size, 109 | ((i // (target_width // image_size)) + 1) * image_size 110 | ) 111 | split_img = resized_img.crop(box) 112 | processed_images.append(split_img) 113 | assert len(processed_images) == blocks 114 | if use_thumbnail and len(processed_images) != 1: 115 | thumbnail_img = image.resize((image_size, image_size)) 116 | processed_images.append(thumbnail_img) 117 | return processed_images 118 | 119 | def _load_image_from_source(self, src: str) -> Image.Image: 120 | """Load PIL image from various sources: data URL, http(s), or local path.""" 121 | if src.startswith("data:image/"): 122 | # data URL base64 123 | header, b64data = src.split(",", 1) 124 | img_bytes = base64.b64decode(b64data) 125 | return Image.open(BytesIO(img_bytes)).convert('RGB') 126 | if src.startswith("http://") or src.startswith("https://"): 127 | resp = requests.get(src, timeout=10) 128 | resp.raise_for_status() 129 | return Image.open(BytesIO(resp.content)).convert('RGB') 130 | # Assume local file path 131 | return Image.open(src).convert('RGB') 132 | 133 | def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12): 134 | transform = self._build_transform(input_size=input_size) 135 | pixel_values_list = [] 136 | num_patches_list: List[int] = [] 137 | for img in images: 138 | tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num) 139 | pv = [transform(tile) for tile in tiles] 140 | pv = torch.stack(pv) 141 | num_patches_list.append(pv.shape[0]) 142 | pixel_values_list.append(pv) 143 | if not pixel_values_list: 144 | return None, [] 145 | pixel_values = torch.cat(pixel_values_list) 146 | return pixel_values, num_patches_list 147 | 148 | def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: 149 | """Generate text for the given HF-format messages. 150 | messages: [{ role, content: [{type:'text'|'image', text|image}] }] 151 | 152 | This implementation constructs InternVL-compatible inputs and uses 153 | `model.chat(tokenizer, pixel_values, question, history=...)` to avoid 154 | relying on AutoProcessor (which fails for some tokenizers). 
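        Illustrative input in that format (hypothetical values, assuming a single
        user turn with one screenshot):

            [
                {"role": "user", "content": [
                    {"type": "image", "image": "data:image/png;base64,..."},
                    {"type": "text", "text": "What is shown on the screen?"},
                ]},
            ]

        Image sources may be data URLs, http(s) URLs, or local file paths
        (handled by `_load_image_from_source`).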
155 | """ 156 | assert self.model is not None and self.tokenizer is not None 157 | 158 | # Build textual context and collect images and the final question 159 | context_lines: List[str] = [] 160 | all_images: List[Image.Image] = [] 161 | last_user_text_parts: List[str] = [] 162 | 163 | for msg in messages: 164 | role = msg.get("role", "user") 165 | content = msg.get("content", []) 166 | if isinstance(content, str): 167 | content_items = [{"type": "text", "text": content}] 168 | else: 169 | content_items = content 170 | 171 | if role == "user": 172 | # Collect text and images 173 | parts_text: List[str] = [] 174 | for item in content_items: 175 | if item.get("type") == "text": 176 | t = item.get("text", "") 177 | if t: 178 | parts_text.append(t) 179 | elif item.get("type") == "image": 180 | url = item.get("image", "") 181 | if url: 182 | try: 183 | all_images.append(self._load_image_from_source(url)) 184 | except Exception: 185 | # Ignore failed image loads but keep going 186 | pass 187 | text = "\n".join(parts_text).strip() 188 | if text: 189 | context_lines.append(f"User: {text}") 190 | # Track last user text separately for question 191 | last_user_text_parts = parts_text or last_user_text_parts 192 | elif role == "assistant": 193 | # Only keep text content for history 194 | parts_text = [item.get("text", "") for item in content_items if item.get("type") == "text"] 195 | text = "\n".join(parts_text).strip() 196 | if text: 197 | context_lines.append(f"Assistant: {text}") 198 | 199 | # Prepare pixel values for all collected images (across turns) 200 | pixel_values = None 201 | num_patches_list: List[int] = [] 202 | if all_images: 203 | pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12) 204 | if pixel_values is not None: 205 | # Convert dtype/device as in docs 206 | pixel_values = pixel_values.to(torch.bfloat16) 207 | # Chat API expects tensors on CUDA when model is on CUDA 208 | try: 209 | pixel_values = pixel_values.to(self.model.device) 210 | except Exception: 211 | pass 212 | 213 | # Build question with any prior context and numbered image placeholders 214 | if all_images: 215 | # Separate images layout: Image-1: <image> ... then question text 216 | prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))] 217 | prefix = "\n".join(prefix_lines) + "\n" 218 | else: 219 | prefix = "" 220 | 221 | last_user_text = "\n".join(last_user_text_parts).strip() 222 | # Combine prior text-only turns as context to emulate multi-turn 223 | context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else "" 224 | base_question = last_user_text if last_user_text else "Describe the image(s) in detail." 
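        # Illustrative (hypothetical) layout of the final `question` string for a
        # two-image, two-turn conversation: prior text turns, then numbered image
        # placeholders, then the last user message:
        #   User: what is on the screen?
        #   Assistant: a settings window
        #   Image-1: <image>
        #   Image-2: <image>
        #   click the wifi toggle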
225 | if context_text: 226 | question = (context_text + "\n" + prefix + base_question).strip() 227 | else: 228 | question = (prefix + base_question).strip() 229 | 230 | # Generation config 231 | generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False) 232 | 233 | # Call InternVL chat 234 | try: 235 | if pixel_values is None: 236 | # Pure-text conversation (embed prior turns in question) 237 | response = self.model.chat(self.tokenizer, None, question, generation_config) 238 | else: 239 | # Multi-image: pass num_patches_list if >1 image 240 | if len(num_patches_list) > 1: 241 | response = self.model.chat( 242 | self.tokenizer, 243 | pixel_values, 244 | question, 245 | generation_config, 246 | num_patches_list=num_patches_list, 247 | ) 248 | else: 249 | response = self.model.chat(self.tokenizer, pixel_values, question, generation_config) 250 | except Exception as e: 251 | # Fallback: return empty string to avoid crashing the adapter 252 | return "" 253 | 254 | return response or "" 255 | ``` -------------------------------------------------------------------------------- /scripts/playground.sh: -------------------------------------------------------------------------------- ```bash 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "🚀 Launching Cua Computer-Use Agent UI..." 6 | 7 | # Save the original working directory 8 | ORIGINAL_DIR="$(pwd)" 9 | 10 | # Directories used by the script 11 | DEMO_DIR="$HOME/.cua-demo" 12 | VENV_DIR="$DEMO_DIR/venv" 13 | 14 | # Function to clean up on exit 15 | cleanup() { 16 | cd ~ 17 | rm -rf "$TMP_DIR" 2>/dev/null || true 18 | } 19 | 20 | # Create a temporary directory for our work 21 | TMP_DIR=$(mktemp -d) 22 | cd "$TMP_DIR" 23 | trap cleanup EXIT 24 | 25 | # Ask user to choose between local macOS VMs or Cua Cloud Sandbox 26 | echo "" 27 | echo "Choose your Cua setup:" 28 | echo "1) ☁️ Cua Cloud Sandbox (works on any system)" 29 | echo "2) 🖥️ Local macOS VMs (requires Apple Silicon Mac + macOS 15+)" 30 | echo "" 31 | read -p "Enter your choice (1 or 2): " CHOICE 32 | 33 | if [[ "$CHOICE" == "1" ]]; then 34 | # Cua Cloud Sandbox setup 35 | echo "" 36 | echo "☁️ Setting up Cua Cloud Sandbox..." 37 | echo "" 38 | 39 | # Check if existing .env.local already has CUA_API_KEY (check current dir and demo dir) 40 | # Look for .env.local in the original working directory (before cd to temp dir) 41 | CURRENT_ENV_FILE="$ORIGINAL_DIR/.env.local" 42 | DEMO_ENV_FILE="$DEMO_DIR/.env.local" 43 | 44 | CUA_API_KEY="" 45 | 46 | # First check current directory 47 | if [[ -f "$CURRENT_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$CURRENT_ENV_FILE"; then 48 | EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$CURRENT_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs) 49 | if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then 50 | CUA_API_KEY="$EXISTING_CUA_KEY" 51 | fi 52 | fi 53 | 54 | # Then check demo directory if not found in current dir 55 | if [[ -z "$CUA_API_KEY" ]] && [[ -f "$DEMO_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$DEMO_ENV_FILE"; then 56 | EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$DEMO_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs) 57 | if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then 58 | CUA_API_KEY="$EXISTING_CUA_KEY" 59 | fi 60 | fi 61 | 62 | # If no valid API key found, prompt for one 63 | if [[ -z "$CUA_API_KEY" ]]; then 64 | echo "To use Cua Cloud Sandbox, you need to:" 65 | echo "1. 
Sign up at https://trycua.com" 66 | echo "2. Create a Cloud Sandbox" 67 | echo "3. Generate an Api Key" 68 | echo "" 69 | read -p "Enter your Cua Api Key: " CUA_API_KEY 70 | 71 | if [[ -z "$CUA_API_KEY" ]]; then 72 | echo "❌ Cua Api Key is required for Cloud Sandbox." 73 | exit 1 74 | fi 75 | fi 76 | 77 | USE_CLOUD=true 78 | 79 | elif [[ "$CHOICE" == "2" ]]; then 80 | # Local macOS VM setup 81 | echo "" 82 | echo "🖥️ Setting up local macOS VMs..." 83 | 84 | # Check for Apple Silicon Mac 85 | if [[ $(uname -s) != "Darwin" || $(uname -m) != "arm64" ]]; then 86 | echo "❌ Local macOS VMs require an Apple Silicon Mac (M1/M2/M3/M4)." 87 | echo "💡 Consider using Cua Cloud Sandbox instead (option 1)." 88 | exit 1 89 | fi 90 | 91 | # Check for macOS 15 (Sequoia) or newer 92 | OSVERSION=$(sw_vers -productVersion) 93 | if [[ $(echo "$OSVERSION 15.0" | tr " " "\n" | sort -V | head -n 1) != "15.0" ]]; then 94 | echo "❌ Local macOS VMs require macOS 15 (Sequoia) or newer. You have $OSVERSION." 95 | echo "💡 Consider using Cua Cloud Sandbox instead (option 1)." 96 | exit 1 97 | fi 98 | 99 | USE_CLOUD=false 100 | 101 | else 102 | echo "❌ Invalid choice. Please run the script again and choose 1 or 2." 103 | exit 1 104 | fi 105 | 106 | # Install Lume if not already installed (only for local VMs) 107 | if [[ "$USE_CLOUD" == "false" ]]; then 108 | if ! command -v lume &> /dev/null; then 109 | echo "📦 Installing Lume CLI..." 110 | curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh | bash 111 | 112 | # Add lume to PATH for this session if it's not already there 113 | if ! command -v lume &> /dev/null; then 114 | export PATH="$PATH:$HOME/.local/bin" 115 | fi 116 | fi 117 | 118 | # Pull the macOS CUA image if not already present 119 | if ! lume ls | grep -q "macos-sequoia-cua"; then 120 | # Check available disk space 121 | IMAGE_SIZE_GB=30 122 | AVAILABLE_SPACE_KB=$(df -k $HOME | tail -1 | awk '{print $4}') 123 | AVAILABLE_SPACE_GB=$(($AVAILABLE_SPACE_KB / 1024 / 1024)) 124 | 125 | echo "📊 The macOS CUA image will use approximately ${IMAGE_SIZE_GB}GB of disk space." 126 | echo " You currently have ${AVAILABLE_SPACE_GB}GB available on your system." 127 | 128 | # Prompt for confirmation 129 | read -p " Continue? [y]/n: " CONTINUE 130 | CONTINUE=${CONTINUE:-y} 131 | 132 | if [[ $CONTINUE =~ ^[Yy]$ ]]; then 133 | echo "📥 Pulling macOS CUA image (this may take a while)..." 134 | lume pull macos-sequoia-cua:latest 135 | else 136 | echo "❌ Installation cancelled." 137 | exit 1 138 | fi 139 | fi 140 | fi 141 | 142 | # Create a Python virtual environment 143 | echo "🐍 Setting up Python environment..." 144 | 145 | # Try different Python commands in order of preference 146 | PYTHON_CMD="" 147 | for cmd in python3.11 python3 python; do 148 | if command -v $cmd &> /dev/null; then 149 | # Check this Python version 150 | PYTHON_VERSION=$($cmd --version 2>&1 | cut -d" " -f2) 151 | PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1) 152 | PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2) 153 | 154 | if [ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -eq 11 ]; then 155 | PYTHON_CMD=$cmd 156 | echo "✅ Found suitable Python: $cmd (version $PYTHON_VERSION)" 157 | break 158 | elif [ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -gt 11 ]; then 159 | PYTHON_CMD=$cmd 160 | PYTHON_TOO_NEW=true 161 | echo "⚠️ Found $cmd (version $PYTHON_VERSION) but only Python 3.11.x is supported." 162 | break 163 | else 164 | echo "⚠️ Found $cmd (version $PYTHON_VERSION) but it's too old, trying next..." 
165 | fi 166 | fi 167 | done 168 | 169 | # If no suitable Python was found, or if Python is too new, offer to exit or continue 170 | if [ -z "$PYTHON_CMD" ] || [ "$PYTHON_TOO_NEW" = true ]; then 171 | OS_TYPE=$(uname -s) 172 | if [ "$PYTHON_TOO_NEW" = true ]; then 173 | echo -e "\n❌ Python version $PYTHON_VERSION detected. Only Python 3.11.x is supported. Newer versions (e.g., 3.12+) are not yet supported." 174 | else 175 | if [[ "$OS_TYPE" == "Darwin" ]]; then 176 | echo -e "\n❌ python3.11 not found. To continue, we recommend running this:\n\n $ brew install [email protected]\n" 177 | elif [[ "$OS_TYPE" == "MINGW"* || "$OS_TYPE" == "CYGWIN"* || "$OS_TYPE" == "MSYS"* ]]; then 178 | echo -e "\n❌ python3.11 not found. Please install Python 3.11 from https://www.python.org/downloads/\n" 179 | else 180 | echo -e "\n❌ python3.11 not found. Please install Python 3.11 from your package manager or https://www.python.org/downloads/\n" 181 | fi 182 | fi 183 | while true; do 184 | echo "Would you like to exit so you can install Python 3.11, or continue anyway? (e = exit, c = continue): " 185 | read -n 1 -r PYTHON_CONT_CHOICE 186 | echo 187 | if [[ "$PYTHON_CONT_CHOICE" =~ ^[Ee]$ ]]; then 188 | echo "Exiting so you can install Python 3.11." 189 | exit 1 190 | elif [[ "$PYTHON_CONT_CHOICE" =~ ^[Cc]$ ]]; then 191 | echo "⚠️ Continuing without Python 3.11. Some features may not work as expected." 192 | break 193 | else 194 | echo "Please enter 'e' to exit or 'c' to continue." 195 | fi 196 | done 197 | fi 198 | 199 | # Create a virtual environment 200 | if [ ! -d "$VENV_DIR" ]; then 201 | $PYTHON_CMD -m venv "$VENV_DIR" 202 | fi 203 | 204 | # Activate the virtual environment 205 | source "$VENV_DIR/bin/activate" 206 | 207 | # Install required packages 208 | echo "📦 Updating Cua packages..." 209 | pip install -U pip setuptools wheel Cmake 210 | pip install -U cua-computer "cua-agent[all]" 211 | 212 | # Create a simple demo script 213 | mkdir -p "$DEMO_DIR" 214 | 215 | # Create .env.local file with API keys (only if it doesn't exist) 216 | if [[ ! -f "$DEMO_DIR/.env.local" ]]; then 217 | cat > "$DEMO_DIR/.env.local" << EOF 218 | # Uncomment and add your API keys here 219 | # OPENAI_API_KEY=your_openai_api_key_here 220 | # ANTHROPIC_API_KEY=your_anthropic_api_key_here 221 | CUA_API_KEY=your_cua_api_key_here 222 | EOF 223 | echo "📝 Created .env.local file with API key placeholders" 224 | else 225 | echo "📝 Found existing .env.local file - keeping your current settings" 226 | fi 227 | 228 | if [[ "$USE_CLOUD" == "true" ]]; then 229 | # Add CUA API key to .env.local if not already present 230 | if ! grep -q "CUA_API_KEY" "$DEMO_DIR/.env.local"; then 231 | echo "CUA_API_KEY=$CUA_API_KEY" >> "$DEMO_DIR/.env.local" 232 | echo "🔑 Added CUA_API_KEY to .env.local" 233 | elif grep -q "CUA_API_KEY=your_cua_api_key_here" "$DEMO_DIR/.env.local"; then 234 | # Update placeholder with actual key 235 | sed -i.bak "s/CUA_API_KEY=your_cua_api_key_here/CUA_API_KEY=$CUA_API_KEY/" "$DEMO_DIR/.env.local" 236 | echo "🔑 Updated CUA_API_KEY in .env.local" 237 | fi 238 | fi 239 | 240 | # Create a convenience script to run the demo 241 | cat > "$DEMO_DIR/start_ui.sh" << EOF 242 | #!/bin/bash 243 | source "$VENV_DIR/bin/activate" 244 | cd "$DEMO_DIR" 245 | python run_demo.py 246 | EOF 247 | chmod +x "$DEMO_DIR/start_ui.sh" 248 | 249 | echo "✅ Setup complete!" 
250 | 251 | if [[ "$USE_CLOUD" == "true" ]]; then 252 | # Create run_demo.py for cloud sandbox 253 | cat > "$DEMO_DIR/run_demo.py" << 'EOF' 254 | import asyncio 255 | import os 256 | from pathlib import Path 257 | from dotenv import load_dotenv 258 | from computer import Computer 259 | from agent import ComputerAgent, LLM, AgentLoop, LLMProvider 260 | from agent.ui.gradio.ui_components import create_gradio_ui 261 | 262 | # Load environment variables from .env.local 263 | load_dotenv(Path(__file__).parent / ".env.local") 264 | 265 | # Check for required API keys 266 | cua_api_key = os.environ.get("CUA_API_KEY", "") 267 | if not cua_api_key: 268 | print("\n❌ CUA_API_KEY not found in .env.local file.") 269 | print("Please add your CUA API key to the .env.local file.") 270 | exit(1) 271 | 272 | openai_key = os.environ.get("OPENAI_API_KEY", "") 273 | anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "") 274 | 275 | if not openai_key and not anthropic_key: 276 | print("\n⚠️ No OpenAI or Anthropic API keys found in .env.local.") 277 | print("Please add at least one API key to use AI agents.") 278 | 279 | print("🚀 Starting CUA playground with Cloud Sandbox...") 280 | print("📝 Edit .env.local to update your API keys") 281 | 282 | # Launch the Gradio UI and open it in the browser 283 | app = create_gradio_ui() 284 | app.launch(share=False, inbrowser=True) 285 | EOF 286 | else 287 | # Create run_demo.py for local macOS VMs 288 | cat > "$DEMO_DIR/run_demo.py" << 'EOF' 289 | import asyncio 290 | import os 291 | from pathlib import Path 292 | from dotenv import load_dotenv 293 | from computer import Computer 294 | from agent import ComputerAgent, LLM, AgentLoop, LLMProvider 295 | from agent.ui.gradio.ui_components import create_gradio_ui 296 | 297 | # Load environment variables from .env.local 298 | load_dotenv(Path(__file__).parent / ".env.local") 299 | 300 | # Try to load API keys from environment 301 | openai_key = os.environ.get("OPENAI_API_KEY", "") 302 | anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "") 303 | 304 | if not openai_key and not anthropic_key: 305 | print("\n⚠️ No OpenAI or Anthropic API keys found in .env.local.") 306 | print("Please add at least one API key to use AI agents.") 307 | 308 | print("🚀 Starting CUA playground with local macOS VMs...") 309 | print("📝 Edit .env.local to update your API keys") 310 | 311 | # Launch the Gradio UI and open it in the browser 312 | app = create_gradio_ui() 313 | app.launch(share=False, inbrowser=True) 314 | EOF 315 | fi 316 | 317 | echo "☁️ CUA Cloud Sandbox setup complete!" 318 | echo "📝 Edit $DEMO_DIR/.env.local to update your API keys" 319 | echo "🖥️ Start the playground by running: $DEMO_DIR/start_ui.sh" 320 | 321 | # Check if the VM is running (only for local setup) 322 | if [[ "$USE_CLOUD" == "false" ]]; then 323 | echo "🔍 Checking if the macOS CUA VM is running..." 324 | VM_RUNNING=$(lume ls | grep "macos-sequoia-cua" | grep "running" || echo "") 325 | 326 | if [ -z "$VM_RUNNING" ]; then 327 | echo "🚀 Starting the macOS CUA VM in the background..." 328 | lume run macos-sequoia-cua:latest & 329 | # Wait a moment for the VM to initialize 330 | sleep 5 331 | echo "✅ VM started successfully." 332 | else 333 | echo "✅ macOS CUA VM is already running." 334 | fi 335 | fi 336 | 337 | # Ask if the user wants to start the demo now 338 | echo 339 | read -p "Would you like to start the Cua Computer-Use Agent UI now? (y/n) " -n 1 -r 340 | echo 341 | if [[ $REPLY =~ ^[Yy]$ ]]; then 342 | echo "🚀 Starting the Cua Computer-Use Agent UI..." 
343 | echo "" 344 | "$DEMO_DIR/start_ui.sh" 345 | fi 346 | ``` -------------------------------------------------------------------------------- /libs/python/som/som/visualization.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List, Dict, Any, Tuple 2 | import numpy as np 3 | from PIL import Image, ImageDraw, ImageFont 4 | import supervision as sv 5 | import platform 6 | import os 7 | import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class BoxAnnotator: 13 | """Class for drawing bounding boxes and labels on images.""" 14 | 15 | def __init__(self): 16 | """Initialize the box annotator with a color palette.""" 17 | # WCAG 2.1 compliant color palette optimized for accessibility 18 | self.colors = [ 19 | "#2E7D32", # Green 20 | "#C62828", # Red 21 | "#1565C0", # Blue 22 | "#6A1B9A", # Purple 23 | "#EF6C00", # Orange 24 | "#283593", # Indigo 25 | "#4527A0", # Deep Purple 26 | "#00695C", # Teal 27 | "#D84315", # Deep Orange 28 | "#1B5E20", # Dark Green 29 | "#B71C1C", # Dark Red 30 | "#0D47A1", # Dark Blue 31 | "#4A148C", # Dark Purple 32 | "#E65100", # Dark Orange 33 | "#1A237E", # Dark Indigo 34 | "#311B92", # Darker Purple 35 | "#004D40", # Dark Teal 36 | "#BF360C", # Darker Orange 37 | "#33691E", # Darker Green 38 | "#880E4F", # Pink 39 | ] 40 | self.color_index = 0 41 | self.default_font = None 42 | self._initialize_font() 43 | 44 | def _initialize_font(self) -> None: 45 | """Initialize the default font.""" 46 | # Try to load a system font first 47 | system = platform.system() 48 | font_paths = [] 49 | 50 | if system == "Darwin": # macOS 51 | font_paths = [ 52 | "/System/Library/Fonts/Helvetica.ttc", 53 | "/System/Library/Fonts/Arial.ttf", 54 | "/Library/Fonts/Arial.ttf", 55 | ] 56 | elif system == "Linux": 57 | font_paths = [ 58 | "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 59 | "/usr/share/fonts/TTF/DejaVuSans.ttf", 60 | "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf", 61 | ] 62 | else: # Windows 63 | font_paths = ["C:\\Windows\\Fonts\\arial.ttf"] 64 | 65 | # Try each font path 66 | for font_path in font_paths: 67 | if os.path.exists(font_path): 68 | try: 69 | # Test the font with a small size 70 | test_font = ImageFont.truetype(font_path, 12) 71 | # Test if the font can render text 72 | test_font.getbbox("1") 73 | self.default_font = font_path 74 | return 75 | except Exception: 76 | continue 77 | 78 | def _get_next_color(self) -> str: 79 | """Get the next color from the palette.""" 80 | color = self.colors[self.color_index] 81 | self.color_index = (self.color_index + 1) % len(self.colors) 82 | return color 83 | 84 | def _hex_to_rgb(self, hex_color: str) -> Tuple[int, int, int]: 85 | """Convert hex color to RGB tuple.""" 86 | hex_color = hex_color.lstrip("#") 87 | # Create explicit tuple of 3 integers to match the return type 88 | r = int(hex_color[0:2], 16) 89 | g = int(hex_color[2:4], 16) 90 | b = int(hex_color[4:6], 16) 91 | return (r, g, b) 92 | 93 | def draw_boxes( 94 | self, image: Image.Image, detections: List[Dict[str, Any]], draw_config: Dict[str, Any] 95 | ) -> Image.Image: 96 | """Draw bounding boxes and labels on the image.""" 97 | draw = ImageDraw.Draw(image) 98 | 99 | # Create smaller font while keeping contrast 100 | try: 101 | if self.default_font: 102 | font = ImageFont.truetype(self.default_font, size=12) # Reduced from 16 to 12 103 | else: 104 | # If no TrueType font available, use default 105 | font = ImageFont.load_default() 106 | except Exception: 
107 | font = ImageFont.load_default() 108 | 109 | padding = 2 # Reduced padding for smaller overall box 110 | spacing = 1 # Reduced spacing between elements 111 | 112 | # Keep track of used label areas to check for collisions 113 | used_areas = [] 114 | 115 | # Store label information for third pass 116 | labels_to_draw = [] 117 | 118 | # First pass: Initialize used_areas with all bounding boxes 119 | for detection in detections: 120 | box = detection["bbox"] 121 | x1, y1, x2, y2 = [ 122 | int(coord * dim) for coord, dim in zip(box, [image.width, image.height] * 2) 123 | ] 124 | used_areas.append((x1, y1, x2, y2)) 125 | 126 | # Second pass: Draw all bounding boxes 127 | for idx, detection in enumerate(detections, 1): 128 | # Get box coordinates 129 | box = detection["bbox"] 130 | x1, y1, x2, y2 = [ 131 | int(coord * dim) for coord, dim in zip(box, [image.width, image.height] * 2) 132 | ] 133 | 134 | # Get color for this detection 135 | color = self._get_next_color() 136 | rgb_color = self._hex_to_rgb(color) 137 | 138 | # Draw bounding box with original width 139 | draw.rectangle(((x1, y1), (x2, y2)), outline=rgb_color, width=2) 140 | 141 | # Use detection number as label 142 | label = str(idx) 143 | 144 | # Get text dimensions using getbbox 145 | bbox = font.getbbox(label) 146 | text_width = bbox[2] - bbox[0] 147 | text_height = bbox[3] - bbox[1] 148 | 149 | # Create box dimensions with padding 150 | box_width = text_width + (padding * 2) # Removed multiplier for tighter box 151 | box_height = text_height + (padding * 2) # Removed multiplier for tighter box 152 | 153 | def is_inside_bbox(x, y): 154 | """Check if a label box would be inside the bounding box.""" 155 | return x >= x1 and x + box_width <= x2 and y >= y1 and y + box_height <= y2 156 | 157 | # Try different positions until we find one without collision 158 | positions = [ 159 | # Top center (above bbox) 160 | lambda: (x1 + ((x2 - x1) - box_width) // 2, y1 - box_height - spacing), 161 | # Bottom center (below bbox) 162 | lambda: (x1 + ((x2 - x1) - box_width) // 2, y2 + spacing), 163 | # Right center (right of bbox) 164 | lambda: (x2 + spacing, y1 + ((y2 - y1) - box_height) // 2), 165 | # Left center (left of bbox) 166 | lambda: (x1 - box_width - spacing, y1 + ((y2 - y1) - box_height) // 2), 167 | # Top right (outside corner) 168 | lambda: (x2 + spacing, y1 - box_height - spacing), 169 | # Top left (outside corner) 170 | lambda: (x1 - box_width - spacing, y1 - box_height - spacing), 171 | # Bottom right (outside corner) 172 | lambda: (x2 + spacing, y2 + spacing), 173 | # Bottom left (outside corner) 174 | lambda: (x1 - box_width - spacing, y2 + spacing), 175 | ] 176 | 177 | def check_occlusion(x, y): 178 | """Check if a label box occludes any existing ones or is inside bbox.""" 179 | # First check if it's inside the bounding box 180 | if is_inside_bbox(x, y): 181 | return True 182 | 183 | # Then check collision with other labels 184 | new_box = (x, y, x + box_width, y + box_height) 185 | label_width = new_box[2] - new_box[0] 186 | label_height = new_box[3] - new_box[1] 187 | 188 | for used_box in used_areas: 189 | if not ( 190 | new_box[2] < used_box[0] # new box is left of used box 191 | or new_box[0] > used_box[2] # new box is right of used box 192 | or new_box[3] < used_box[1] # new box is above used box 193 | or new_box[1] > used_box[3] # new box is below used box 194 | ): 195 | # Calculate dimensions of the used box 196 | used_box_width = used_box[2] - used_box[0] 197 | used_box_height = used_box[3] - used_box[1] 198 | 199 | 
# Only consider as collision if used box is NOT more than 5x bigger in both dimensions 200 | if not (used_box_width > 5 * label_width and used_box_height > 5 * label_height): 201 | return True 202 | return False 203 | 204 | # Try each position until we find one without collision 205 | label_x = None 206 | label_y = None 207 | 208 | for get_pos in positions: 209 | x, y = get_pos() 210 | # Ensure position is within image bounds 211 | if x < 0 or y < 0 or x + box_width > image.width or y + box_height > image.height: 212 | continue 213 | if not check_occlusion(x, y): 214 | label_x = x 215 | label_y = y 216 | break 217 | 218 | # If all positions collide or are out of bounds, find the best possible position 219 | if label_x is None: 220 | # Try to place it in the nearest valid position outside the bbox 221 | best_pos = positions[0]() # Default to top center 222 | label_x = max(0, min(image.width - box_width, best_pos[0])) 223 | label_y = max(0, min(image.height - box_height, best_pos[1])) 224 | 225 | # Ensure it's not inside the bounding box 226 | if is_inside_bbox(label_x, label_y): 227 | # Force it above the bounding box 228 | label_y = max(0, y1 - box_height - spacing) 229 | 230 | # Add this label area to used areas 231 | if ( 232 | label_x is not None 233 | and label_y is not None 234 | and box_width is not None 235 | and box_height is not None 236 | ): 237 | used_areas.append((label_x, label_y, label_x + box_width, label_y + box_height)) 238 | 239 | # Store label information for second pass 240 | labels_to_draw.append( 241 | { 242 | "label": label, 243 | "x": label_x, 244 | "y": label_y, 245 | "width": box_width, 246 | "height": box_height, 247 | "text_width": text_width, 248 | "text_height": text_height, 249 | "color": rgb_color, 250 | } 251 | ) 252 | 253 | # Third pass: Draw all labels on top 254 | for label_info in labels_to_draw: 255 | # Draw background box with white outline 256 | draw.rectangle( 257 | ( 258 | (label_info["x"] - 1, label_info["y"] - 1), 259 | ( 260 | label_info["x"] + label_info["width"] + 1, 261 | label_info["y"] + label_info["height"] + 1, 262 | ), 263 | ), 264 | outline="white", 265 | width=2, 266 | ) 267 | draw.rectangle( 268 | ( 269 | (label_info["x"], label_info["y"]), 270 | (label_info["x"] + label_info["width"], label_info["y"] + label_info["height"]), 271 | ), 272 | fill=label_info["color"], 273 | ) 274 | 275 | # Center text in box 276 | text_x = label_info["x"] + (label_info["width"] - label_info["text_width"]) // 2 277 | text_y = label_info["y"] + (label_info["height"] - label_info["text_height"]) // 2 278 | 279 | # Draw text with black outline for better visibility 280 | outline_width = 1 281 | for dx in [-outline_width, outline_width]: 282 | for dy in [-outline_width, outline_width]: 283 | draw.text( 284 | (text_x + dx, text_y + dy), label_info["label"], fill="black", font=font 285 | ) 286 | 287 | # Draw the main white text 288 | draw.text((text_x, text_y), label_info["label"], fill=(255, 255, 255), font=font) 289 | 290 | logger.info("Finished drawing all boxes") 291 | return image 292 | ``` -------------------------------------------------------------------------------- /examples/evals/wikipedia_most_linked.txt: -------------------------------------------------------------------------------- ``` 1 | ISBN (identifier) 2 | United States 3 | Main Page 4 | Tilde 5 | Doi (identifier) 6 | Fair use 7 | Association football 8 | Years 9 | Wayback Machine 10 | ISSN (identifier) 11 | India 12 | Wikimedia Foundation 13 | Wikidata 14 | Animal 15 | Taxonomy (biology) 
16 | Australia 17 | France 18 | Eukaryote 19 | IP address 20 | U.S. state 21 | Time zone 22 | City 23 | Copyright 24 | Canada 25 | Town 26 | ASCII 27 | Greek alphabet 28 | Typographic ligature 29 | Diacritical mark 30 | Wikipedia 31 | Germany 32 | Human settlement 33 | Open Tree of Life 34 | IMDb (identifier) 35 | United Kingdom 36 | Catalogue of Life 37 | Insect 38 | Russia 39 | Japan 40 | Italy 41 | Arthropod 42 | Television show 43 | Public domain 44 | INaturalist 45 | Poland 46 | England 47 | PMID (identifier) 48 | Daylight saving time 49 | S2CID (identifier) 50 | China 51 | Encyclopedia of Life 52 | Spain 53 | OCLC (identifier) 54 | Plant 55 | Flickr 56 | Wikispecies 57 | Africa 58 | Song 59 | Record label 60 | Lepidoptera 61 | Iran 62 | English language 63 | Music genre 64 | News aggregator 65 | Web feed 66 | Proxy server 67 | X-Forwarded-For 68 | College football 69 | World War II 70 | Brazil 71 | Sweden 72 | Politics 73 | Olympics 74 | Netherlands 75 | Record producer 76 | California 77 | New York City 78 | Surname 79 | The New York Times 80 | London 81 | New Zealand 82 | PMC (identifier) 83 | Logo 84 | Synonym (taxonomy) 85 | Switzerland 86 | Turkey 87 | Sport 88 | Video game 89 | Architecture 90 | Norway 91 | Bibcode (identifier) 92 | Mexico 93 | Botany 94 | JSTOR (identifier) 95 | Rail transport 96 | Field hockey 97 | Ireland 98 | Scotland 99 | Belgium 100 | South Africa 101 | Common name 102 | Professional sports 103 | Sport governing body 104 | Sport industry 105 | Olympic games 106 | Election 107 | Austria 108 | Ukraine 109 | Anthroponymy 110 | Pakistan 111 | Baseball 112 | Denmark 113 | Christianity 114 | Philippines 115 | Woman 116 | Romania 117 | Czech Republic 118 | Album 119 | Godzilla Minus One 120 | Single (music) 121 | Electoral reform 122 | Nofollow 123 | Basketball 124 | New York (state) 125 | Argentina 126 | Finland 127 | Soviet Union 128 | Greece 129 | Russian language 130 | Historic site 131 | Free content 132 | YouTube 133 | Catholic Church 134 | Hungary 135 | Kingdom Hearts 136 | Beetle 137 | Company 138 | Tetris 139 | Portugal 140 | BioShock 141 | Abandonware 142 | Deus Ex (video game) 143 | 4A Engine 144 | Yoshi's New Island 145 | Kaboom! (video game) 146 | Rain World 147 | Juno (Overwatch) 148 | Crash Team Rumble 149 | Vault 101 150 | Tales of Commons 151 | NHL Hockey 152 | Clutch Gaming 153 | Haseo 154 | Allin Kempthorne 155 | Ilyas El Maliki 156 | Ratalaika Games 157 | 3D mousepad 158 | HaptX 159 | Walid Sultan Midani 160 | Rustler (video game) 161 | Look Outside 162 | Ducks Ahoy! 
163 | Fusion Engine 164 | Cricket 165 | Geography 166 | Chordate 167 | The Guardian 168 | Israel 169 | Billboard (magazine) 170 | Ice hockey 171 | Given name 172 | Chicago 173 | World War I 174 | Pennsylvania 175 | Indonesia 176 | Alma mater 177 | Vascular plant 178 | Amorphea 179 | Wikimedia Commons 180 | Novel 181 | Village 182 | Visual arts 183 | Film poster 184 | Flowering plant 185 | Opisthokont 186 | Obazoa 187 | County seat 188 | Short story 189 | First-class cricket 190 | Law 191 | Europe 192 | University 193 | Croatia 194 | Sport of athletics 195 | Holozoa 196 | Choanozoa 197 | Filozoa 198 | German language 199 | Tennis 200 | Eumetazoa 201 | Serbia 202 | ParaHoxozoa 203 | Thailand 204 | History 205 | Midfielder 206 | Bilateria 207 | Unincorporated area 208 | French language 209 | AllMusic 210 | Astronomy 211 | Nephrozoa 212 | Novella 213 | Ship 214 | Twitter 215 | Character (arts) 216 | College 217 | Malaysia 218 | Conflict of interest 219 | Higher education 220 | IUCN Red List 221 | Rock music 222 | Gastropoda 223 | Creative Commons 224 | Wales 225 | Bulgaria 226 | UTC+2 227 | Paris 228 | Species 229 | Illinois 230 | HTML element 231 | South Korea 232 | BBC 233 | Persian language 234 | Moth 235 | Conservation status 236 | Pop music 237 | Colombia 238 | Wicket 239 | American football 240 | Jazz 241 | World Flora Online 242 | Los Angeles 243 | Songwriter 244 | Hong Kong 245 | Hdl (identifier) 246 | Genus 247 | Spanish language 248 | Egypt 249 | Not out 250 | Slovenia 251 | Chile 252 | Korea 253 | Tropicos 254 | Slovakia 255 | Bishop 256 | Family (biology) 257 | Rugby union 258 | Women's history 259 | Nigeria 260 | College basketball 261 | Sports Reference 262 | Washington, D.C. 263 | GFDL 264 | Afghanistan 265 | Sri Lanka 266 | Newspapers.com 267 | UTC+1 268 | Eudicots 269 | Estonia 270 | Los Angeles Times 271 | Olympedia 272 | Bangladesh 273 | Peru 274 | Singapore 275 | Typographical error 276 | UTC 277 | Virginia 278 | Taiwan 279 | Fast bowling 280 | COVID-19 pandemic 281 | Food 282 | Fish 283 | River 284 | Republic of Ireland 285 | Beer 286 | Caribbean 287 | Michigan 288 | Drink 289 | Chinese language 290 | Business 291 | Leg break 292 | Women's Test cricket 293 | Women's cricket 294 | Innings 295 | New Jersey 296 | Protostome 297 | Spin bowling 298 | Sugar 299 | Underarm bowling 300 | Roger Federer 301 | Googly 302 | Apple 303 | Comics 304 | Cricket Australia XI 305 | Fair and unfair play 306 | Anime 307 | Rafael Nadal 308 | Leander Paes 309 | Kazakhstan 310 | Capital city 311 | Blessed Virgin Mary 312 | Venezuela 313 | Case sensitivity 314 | Arabic language 315 | North America 316 | Texas 317 | Burger King 318 | The Plant List 319 | Justine Henin 320 | Sushi 321 | Angelus 322 | Beef 323 | Sanctification 324 | Cuthbert Tunstall 325 | Bread 326 | Saint Mungo 327 | Incumbent 328 | Americanism (heresy) 329 | Curry 330 | Ensoulment 331 | Associated Press 332 | Adolph John Paschang 333 | French cuisine 334 | Altar Society 335 | UTC-5 336 | Philadelphia 337 | Bill Mallon 338 | Yogurt 339 | Soy sauce 340 | Open Era (tennis) 341 | Belarus 342 | Manga 343 | English Wikipedia 344 | Islam 345 | Trademark 346 | ISO 4 347 | Wisconsin 348 | Lithuania 349 | The Washington Post 350 | Agaricus bisporus 351 | Reptile 352 | Sociology 353 | Organizations 354 | Death 355 | Ham and eggs 356 | Asia 357 | Swimming (sport) 358 | South America 359 | Northern Ireland 360 | Observation.org 361 | European Union 362 | Astronomical object 363 | Georgia (U.S. 
state) 364 | Gmina 365 | Provinces of Iran 366 | Computing 367 | Counties of Iran 368 | Discogs 369 | Mathematics 370 | Powiat 371 | Missouri 372 | Bachelor of Arts 373 | Iran Standard Time 374 | Florida 375 | Bakhsh 376 | Minnesota 377 | Oregon 378 | Nepal 379 | Variety (magazine) 380 | Japanese language 381 | Journalism 382 | Rome 383 | Computer 384 | Ohio 385 | Ontario 386 | Internet Archive 387 | Latvia 388 | Comedy 389 | Azerbaijan 390 | BBC News 391 | Morocco 392 | Ecdysozoa 393 | Print-on-demand 394 | Bengali language 395 | A5 paper 396 | Pedia Press 397 | Education 398 | Mollusca 399 | American Civil War 400 | Berlin 401 | Taxon 402 | Maryland 403 | Panarthropoda 404 | Hebrew language 405 | Toronto 406 | Tactopoda 407 | Episode 408 | Cuba 409 | Country music 410 | Religion 411 | Rotten Tomatoes 412 | Georgia (country) 413 | Classical music 414 | Month 415 | Puerto Rico 416 | GEOnet Names Server 417 | Sydney 418 | The Times 419 | Iraq 420 | Polyphaga 421 | Derivative work 422 | Lisbon 423 | Syria 424 | Ecuador 425 | Uzbekistan 426 | Greek language 427 | Latin 428 | United Nations 429 | Literature 430 | Animation 431 | Physics 432 | Amphibian 433 | Romanize 434 | List of countries 435 | Moscow 436 | Politician 437 | Philosophy 438 | Metacritic 439 | Mammal 440 | Pinyin 441 | Open access 442 | New South Wales 443 | Theatre 444 | Allmusic 445 | Syntax 446 | Women in music 447 | Fly 448 | Colorado 449 | Academic journal 450 | LGBTQ 451 | Seal (emblem) 452 | Rolling Stone 453 | Saudi Arabia 454 | Science fiction 455 | Tweet (social media) 456 | Heavy metal music 457 | Boston 458 | Vietnam 459 | Molecular biology 460 | Facebook 461 | Iceland 462 | Albania 463 | Cycling 464 | Tennessee 465 | Armenia 466 | Massachusetts 467 | Mandibulata 468 | United States Navy 469 | Communes of France 470 | Census 471 | Algeria 472 | United States Army 473 | Wikilink 474 | Pancrustacea 475 | Alternative rock 476 | American English 477 | Radio stations 478 | History of Romania 479 | Endemism 480 | San Francisco 481 | Award 482 | Ghana 483 | Judaism 484 | Alabama 485 | Blog 486 | The Independent 487 | Melbourne 488 | Cantons of France 489 | Lebanon 490 | West Germany 491 | Quotation mark 492 | Regions of France 493 | Chernivtsi Oblast 494 | Tokyo 495 | Italian language 496 | Connecticut 497 | Country 498 | Screenshot 499 | Ghost town 500 | Iran Daylight Time 501 | NatureServe 502 | Mongolia 503 | Cyprus 504 | Northern Bukovina 505 | Rugby league 506 | Northern Bessarabia 507 | State highway 508 | Harvard University 509 | Yorkshire 510 | Pterygota 511 | Slash (punctuation) 512 | Prize 513 | Science 514 | Asian Games 515 | Eastern Time Zone 516 | Myanmar 517 | Nazi Germany 518 | Ottoman Empire 519 | Quebec 520 | Billboard Hot 100 521 | United Arab Emirates 522 | Neoptera 523 | Hexapoda 524 | Least Concern 525 | Type species 526 | EPPO Code 527 | Wikisource 528 | Kyrgyzstan 529 | Allotriocarida 530 | Volleyball 531 | Geology 532 | Second World War 533 | British Columbia 534 | Socialism 535 | Zoology 536 | The Daily Telegraph 537 | Paleontology 538 | Vienna 539 | Dicondylia 540 | BugGuide 541 | United States Senate 542 | Hermit crab 543 | Paraphrase 544 | CNN 545 | Royal Navy 546 | Indian Standard Time 547 | Billboard 200 548 | Kenya 549 | DVD 550 | Sipuncula 551 | Tajikistan 552 | National park 553 | Economics 554 | Heterocyathus 555 | Uruguay 556 | Heteropsammia 557 | Road 558 | Spanish name 559 | Luxembourg 560 | Korean language 561 | UK Singles Chart 562 | Queensland 563 | Montreal 564 | New York Times 565 
| Bolivia 566 | CP/M 567 | Timestamp 568 | Electronic music 569 | INSEE code 570 | ArXiv (identifier) 571 | PubMed 572 | SVG 573 | USA Today 574 | Omnivore 575 | Tunisia 576 | Psychology 577 | ESPN 578 | UEFA 579 | Hawaii 580 | Gastropod 581 | Aliyah 582 | North Carolina 583 | Russian Empire 584 | Tibet 585 | Fungi 586 | Oklahoma 587 | Fauna Europaea 588 | Turkmenistan 589 | British English 590 | The London Gazette 591 | Civil township 592 | Boxing 593 | Barack Obama 594 | Animal Diversity Web 595 | Reuters 596 | Eumetabola 597 | Voter turnout 598 | Transport 599 | False positive 600 | Donald Trump 601 | Kansas 602 | Antarctica 603 | Lake 604 | Ethiopia 605 | Time (magazine) 606 | Marriage 607 | NBC 608 | Beijing 609 | Vertebrate 610 | Czechoslovakia 611 | Protected area 612 | Energy 613 | Poetry 614 | Archaeology 615 | Columbia University 616 | Poverty line 617 | Alaska 618 | Computing platform 619 | British Empire 620 | University of Oxford 621 | Costa Rica 622 | Dublin 623 | A-side and B-side 624 | ZIP code 625 | Actinopterygii 626 | UTC-6 627 | Photoperiodism 628 | Mayor 629 | Sphaeriidae 630 | Animal suicide 631 | Atka mackerel 632 | Starling 633 | Arizona 634 | Entertainment Weekly 635 | Sphaerium beckmani 636 | Junqueira cow 637 | Zaniolepis frenata 638 | Campocraspedon 639 | Zimbabwe 640 | Motorsport 641 | Bird flight 642 | Cnemophilidae 643 | Hinduism 644 | Phalarope 645 | Indiana 646 | Museums 647 | Holometabola 648 | Pytilia 649 | North Macedonia 650 | Malta 651 | Cathartiformes 652 | Darter 653 | Saker falcon 654 | Cathartes 655 | Avian malaria 656 | Coal tit 657 | Magpie duck 658 | Video game developer 659 | Bird bath 660 | Vesper sparrow 661 | Gouldian finch 662 | Debeaking 663 | Vector graphics 664 | Semiplumbeous hawk 665 | Scottish crossbill 666 | Bullfinch 667 | Fregata 668 | Nidicolous 669 | Plushcap 670 | Pallid scops owl 671 | Hip-hop 672 | Blyth's frogmouth 673 | Sunda scops owl 674 | Argus (bird) 675 | Operation Migration 676 | Nik Borrow 677 | Per capita income 678 | Guy Oseary 679 | Madrid 680 | Buddhism 681 | Drainage basin 682 | Sephardic Haredim 683 | Rami Kleinstein 684 | Guy Bavli 685 | David Bar-Hayim 686 | Levin Kipnis 687 | Edna Arbel 688 | Prisoner of Zion 689 | Ayala Procaccia 690 | Nachum Heiman 691 | Zman Tel Aviv 692 | CBS 693 | ARIA Charts 694 | Cucujiformia 695 | Away colours 696 | Regex 697 | 2019 African Games 698 | 1962 Asian Games 699 | 1958 Asian Games 700 | Chemistry 701 | Olympic Games 702 | The Middle Ages 703 | Central Asia 704 | Bengalis 705 | Southeast Asia 706 | Find a Grave 707 | Microsoft Windows 708 | Swing (politics) 709 | White (U.S. 
Census) 710 | Roman Catholic 711 | Maine 712 | The Times of India 713 | Season (sports) 714 | Jamaica 715 | Video game genre 716 | Munich 717 | Asterids 718 | Rosids 719 | Golf 720 | Language 721 | Hangul 722 | Atlanta 723 | Glasgow 724 | UTC+3 725 | Library of Congress 726 | Deuterostome 727 | COVID-19 728 | Video game publisher 729 | Montenegro 730 | ESPNcricinfo 731 | Brand 732 | UTC-4 733 | IGN 734 | Stockholm 735 | Istanbul 736 | NASA 737 | Gnathostomata 738 | Ukrainian language 739 | Human rights 740 | Chicago Tribune 741 | ProQuest 742 | IMDb 743 | River mouth 744 | Hip hop music 745 | Gene 746 | Netflix 747 | Moldova 748 | Barcelona 749 | Paraguay 750 | Olfactores 751 | Labour Party (UK) 752 | United States dollar 753 | Qatar 754 | Photography 755 | Guatemala 756 | Summit 757 | Cold War 758 | Running 759 | First World War 760 | Precipitation 761 | Edinburgh 762 | Amsterdam 763 | Lima 764 | New Eskaton 765 | Computer program 766 | Xinjiang 767 | Women in science 768 | Manhattan 769 | Warsaw 770 | Magazine 771 | Horror film 772 | Deadline Hollywood 773 | Jordan 774 | Aparaglossata 775 | Agriculture 776 | Internet 777 | Prague 778 | The Hindu 779 | Cretaceous 780 | Latino (U.S. Census) 781 | Vietnam War 782 | Music download 783 | Encyclopedia 784 | Chemical compounds 785 | Pittsburgh 786 | Soap opera 787 | Budapest 788 | George W. Bush 789 | Seattle 790 | Extended play 791 | Washington (state) 792 | Listed building 793 | Palestine 794 | LCCN (identifier) 795 | Portland, Oregon 796 | Panama 797 | Plagiarism 798 | Brooklyn 799 | Teleostomi 800 | Manchester 801 | Bird 802 | Mollusk 803 | Automobile 804 | Historic England 805 | Linguistics 806 | Dependent territory 807 | Athens 808 | Civil engineering 809 | Sea snail 810 | Population density 811 | Finance 812 | Disaster management 813 | Tanzania 814 | Jurassic 815 | Districts of Russia 816 | Western Australia 817 | Louisiana 818 | Portuguese language 819 | Anatomy 820 | The Beatles 821 | Tamil language 822 | Milan 823 | Uganda 824 | Natural environment 825 | FIFA 826 | Cameroon 827 | Blu-ray 828 | Mexico City 829 | Chemical formula 830 | Jimmy Wales 831 | Papua New Guinea 832 | Diaphoretickes 833 | UNESCO 834 | Forbes 835 | Technology 836 | Buenos Aires 837 | Vancouver 838 | Dominican Republic 839 | 2007 840 | Species description 841 | East Germany 842 | Folk music 843 | Kentucky 844 | Multimedia 845 | Monocotyledon 846 | Rio de Janeiro 847 | Automated 848 | Hindi 849 | Houston 850 | Google 851 | Devonian 852 | Member of Parliament 853 | Bible 854 | Mumbai 855 | FishBase 856 | African diaspora 857 | Carboniferous 858 | Cambrian 859 | Triassic 860 | Montana 861 | Handball 862 | Ordovician 863 | San Diego 864 | Archive.today 865 | Stanford University 866 | British Army 867 | Middle Ages 868 | Frequency 869 | Ultratop 870 | Permian 871 | Detroit 872 | Earth 873 | Precambrian 874 | Hamburg 875 | Alberta 876 | Tamil Nadu 877 | Madagascar 878 | Lancashire 879 | Guitar 880 | Trade union 881 | Instagram 882 | Engineering 883 | 2006 884 | Silurian 885 | NPR 886 | Railway station 887 | CAS Registry Number 888 | Yemen 889 | Noctuoidea 890 | Fiji 891 | Haiti 892 | Rowing (sport) 893 | New Orleans 894 | NME 895 | Alternative media 896 | North Korea 897 | Microsoft 898 | Jerusalem 899 | Paleogene 900 | Audery Mill Creek 901 | Horse racing 902 | Post town 903 | Piano 904 | Bavaria 905 | Polish language 906 | Horror fiction 907 | Neogene 908 | Kerala 909 | Copenhagen 910 | Google Books 911 | Central Time Zone 912 | Island 913 | Birmingham 914 | 
Anglicanism 915 | Software 916 | Mountain range 917 | Investment 918 | Brussels 919 | Muhammad Ali 920 | Asian (U.S. Census) 921 | Video game culture 922 | Brisbane 923 | Church of England 924 | Kosovo 925 | Bachelor of Science 926 | Molar mass 927 | Arachnid 928 | Own goal 929 | Yale University 930 | Caenogastropoda 931 | Auckland 932 | World Athletics 933 | Trinidad and Tobago 934 | Hanyu Pinyin 935 | Sound bite 936 | Time 937 | El Salvador 938 | Microbiology 939 | Columbia Records 940 | Seoul 941 | Cerambycidae 942 | Maharashtra 943 | Chelicerata 944 | Fungus 945 | Media influence 946 | South Carolina 947 | Radio 948 | Telenovela 949 | FA Cup 950 | Senegal 951 | Internet trolling 952 | Nashville, Tennessee 953 | Demonym 954 | Standard Chinese 955 | Sculpture 956 | Liverpool 957 | Thesis 958 | Bass guitar 959 | Chess 960 | Women artists 961 | Icon (computing) 962 | PubChem 963 | UK Albums Chart 964 | Head coach 965 | Roman Empire 966 | Grand Slam (tennis) 967 | JSmol 968 | Formula One 969 | Biology 970 | Kent 971 | Ancient Rome 972 | Inner Carniola 973 | Oslo 974 | Dutch language 975 | Wingspan 976 | Archaeplastida 977 | MTV 978 | Edvard Ravnikar 979 | ITunes 980 | Feminism 981 | German Empire 982 | Pacific Ocean 983 | Atlantic Ocean 984 | Pharmacology 985 | Track gauge 986 | ChemSpider 987 | Doctor of Philosophy 988 | Regions of England 989 | Districts of England 990 | Christmas 991 | Pavel Golia 992 | Predjama Castle 993 | Overtime (sports) 994 | Forum 995 | Swiss Hitparade 996 | Stumped 997 | Majority 998 | Male 999 | Shanghai 1000 | Siddharta (band) ``` -------------------------------------------------------------------------------- /blog/training-computer-use-models-trajectories-1.md: -------------------------------------------------------------------------------- ```markdown 1 | # Training Computer-Use Models: Creating Human Trajectories with Cua 2 | 3 | *Published on May 1, 2025 by Dillon DuPont* 4 | 5 | In our previous posts, we covered [building your own Computer-Use Operator](build-your-own-operator-on-macos-1) and [using the Agent framework](build-your-own-operator-on-macos-2) to simplify development. Today, we'll focus on a critical aspect of improving computer-use agents and models: gathering high-quality demonstration data using Cua's Computer-Use Interface (CUI) and its Gradio UI to create and share human-generated trajectories. 6 | 7 | Why is this important? Underlying models used by Computer-use agents need examples of how humans interact with computers to learn effectively. By creating a dataset of diverse, well-executed tasks, we can help train better models that understand how to navigate user interfaces and accomplish real tasks. 8 | 9 | <video src="https://github.com/user-attachments/assets/c586d460-3877-4b5f-a736-3248886d2134" controls width="600"></video> 10 | 11 | 12 | ## What You'll Learn 13 | 14 | By the end of this tutorial, you'll be able to: 15 | - Set up the Computer-Use Interface (CUI) with Gradio UI support 16 | - Record your own computer interaction trajectories 17 | - Organize and tag your demonstrations 18 | - Upload your datasets to Hugging Face for community sharing 19 | - Contribute to improving computer-use AI for everyone 20 | 21 | **Prerequisites:** 22 | - macOS Sonoma (14.0) or later 23 | - Python 3.10+ 24 | - Basic familiarity with Python and terminal commands 25 | - A Hugging Face account (for uploading datasets) 26 | 27 | **Estimated Time:** 20-30 minutes 28 | 29 | ## Understanding Human Trajectories 30 | 31 | ### What are Human Trajectories? 
32 | 33 | Human trajectories, in the context of Computer-use AI Agents, are recordings of how humans interact with computer interfaces to complete tasks. These interactions include: 34 | 35 | - Mouse movements, clicks, and scrolls 36 | - Keyboard input 37 | - Changes in the UI state 38 | - Time spent on different elements 39 | 40 | These trajectories serve as examples for AI models to learn from, helping them understand the relationship between: 41 | 1. The visual state of the screen 42 | 2. The user's goal or task 43 | 3. The most appropriate action to take 44 | 45 | ### Why Human Demonstrations Matter 46 | 47 | Unlike synthetic data or rule-based automation, human demonstrations capture the nuanced decision-making that happens during computer interaction: 48 | 49 | - **Natural Pacing**: Humans pause to think, accelerate through familiar patterns, and adjust to unexpected UI changes 50 | - **Error Recovery**: Humans demonstrate how to recover from mistakes or handle unexpected states 51 | - **Context-Sensitive Actions**: The same UI element might be used differently depending on the task context 52 | 53 | By contributing high-quality demonstrations, you're helping to create more capable, human-like computer-use AI systems. 54 | 55 | ## Setting Up Your Environment 56 | 57 | ### Installing the CUI with Gradio Support 58 | 59 | The Computer-Use Interface includes an optional Gradio UI specifically designed to make recording and sharing demonstrations easy. Let's set it up: 60 | 61 | 1. **Create a Python environment** (optional but recommended): 62 | ```bash 63 | # Using conda 64 | conda create -n cua-trajectories python=3.10 65 | conda activate cua-trajectories 66 | 67 | # Using venv 68 | python -m venv cua-trajectories 69 | source cua-trajectories/bin/activate # On macOS/Linux 70 | ``` 71 | 72 | 2. **Install the CUI package with UI support**: 73 | ```bash 74 | pip install "cua-computer[ui]" 75 | ``` 76 | 77 | 3. **Set up your Hugging Face access token**: 78 | Create a `.env` file in your project directory and add your Hugging Face token: 79 | ```bash 80 | echo "HF_TOKEN=your_huggingface_token" > .env 81 | ``` 82 | You can get your token from your [Hugging Face account settings](https://huggingface.co/settings/tokens). 83 | 84 | ### Understanding the Gradio UI 85 | 86 | The Computer-Use Interface Gradio UI provides three main components: 87 | 88 | 1. **Recording Panel**: Captures your screen, mouse, and keyboard activity during demonstrations 89 | 2. **Review Panel**: Allows you to review, tag, and organize your demonstration recordings 90 | 3. **Upload Panel**: Lets you share your demonstrations with the community via Hugging Face 91 | 92 | The UI is designed to make the entire process seamless, from recording to sharing, without requiring deep technical knowledge of the underlying systems. 
93 | 94 | ## Creating Your First Trajectory Dataset 95 | 96 | ### Launching the UI 97 | 98 | To get started, create a simple Python script to launch the Gradio UI: 99 | 100 | ```python 101 | # launch_trajectory_ui.py 102 | from computer.ui.gradio.app import create_gradio_ui 103 | from dotenv import load_dotenv 104 | 105 | # Load your Hugging Face token from .env 106 | load_dotenv('.env') 107 | 108 | # Create and launch the UI 109 | app = create_gradio_ui() 110 | app.launch(share=False) 111 | ``` 112 | 113 | Run this script to start the UI: 114 | 115 | ```bash 116 | python launch_trajectory_ui.py 117 | ``` 118 | 119 | ### Recording a Demonstration 120 | 121 | Let's walk through the process of recording your first demonstration: 122 | 123 | 1. **Start the VM**: Click the "Initialize Computer" button in the UI to initialize a fresh macOS sandbox. This ensures your demonstrations are clean and reproducible. 124 | 2. **Perform a Task**: Complete a simple task like creating a document, organizing files, or searching for information. Natural, everyday tasks make the best demonstrations. 125 | 3. **Review Recording**: Click the "Conversation Logs" or "Function Logs" tabs to review your captured interactions, making sure there is no personal information that you wouldn't want to share. 126 | 4. **Add Metadata**: In the "Save/Share Demonstrations" tab, give your recording a descriptive name (e.g., "Creating a Calendar Event") and add relevant tags (e.g., "productivity", "time-management"). 127 | 5. **Save Your Demonstration**: Click "Save" to store your recording locally. 128 | 129 | <video src="https://github.com/user-attachments/assets/de3c3477-62fe-413c-998d-4063e48de176" controls width="600"></video> 130 | 131 | ### Key Tips for Quality Demonstrations 132 | 133 | To create the most valuable demonstrations: 134 | 135 | - **Start and end at logical points**: Begin with a clear starting state and end when the task is visibly complete 136 | - **Narrate your thought process**: Use the message input to describe what you're trying to do and why 137 | - **Move at a natural pace**: Don't rush or perform actions artificially slowly 138 | - **Include error recovery**: If you make a mistake, keep going and show how to correct it 139 | - **Demonstrate variations**: Record multiple ways to complete the same task 140 | 141 | ## Organizing and Tagging Demonstrations 142 | 143 | Effective tagging and organization make your demonstrations more valuable to researchers and model developers. Consider these tagging strategies: 144 | 145 | ### Task-Based Tags 146 | 147 | Describe what the demonstration accomplishes: 148 | - `web-browsing` 149 | - `document-editing` 150 | - `file-management` 151 | - `email` 152 | - `scheduling` 153 | 154 | ### Application Tags 155 | 156 | Identify the applications used: 157 | - `finder` 158 | - `safari` 159 | - `notes` 160 | - `terminal` 161 | - `calendar` 162 | 163 | ### Complexity Tags 164 | 165 | Indicate the difficulty level: 166 | - `beginner` 167 | - `intermediate` 168 | - `advanced` 169 | - `multi-application` 170 | 171 | ### UI Element Tags 172 | 173 | Highlight specific UI interactions: 174 | - `drag-and-drop` 175 | - `menu-navigation` 176 | - `form-filling` 177 | - `search` 178 | 179 | The Computer-Use Interface UI allows you to apply and manage these tags across all your saved demonstrations, making it easy to create cohesive, well-organized datasets. 
180 | 181 | <video src="https://github.com/user-attachments/assets/5ad1df37-026a-457f-8b49-922ae805faef" controls width="600"></video> 182 | 183 | ## Uploading to Hugging Face 184 | 185 | Sharing your demonstrations helps advance research in computer-use AI. The Gradio UI makes uploading to Hugging Face simple: 186 | 187 | ### Preparing for Upload 188 | 189 | 1. **Review Your Demonstrations**: Use the review panel to ensure all demonstrations are complete and correctly tagged. 190 | 191 | 2. **Select Demonstrations to Upload**: You can upload all demonstrations or filter by specific tags. 192 | 193 | 3. **Configure Dataset Information**: 194 | - **Repository Name**: Format as `{your_username}/{dataset_name}`, e.g., `johndoe/productivity-tasks` 195 | - **Visibility**: Choose `public` to contribute to the community or `private` for personal use 196 | - **License**: Standard licenses like CC-BY or MIT are recommended for public datasets 197 | 198 | ### The Upload Process 199 | 200 | 1. **Click "Upload to Hugging Face"**: This initiates the upload preparation. 201 | 202 | 2. **Review Dataset Summary**: Confirm the number of demonstrations and total size. 203 | 204 | 3. **Confirm Upload**: The UI will show progress as files are transferred. 205 | 206 | 4. **Receive Confirmation**: Once complete, you'll see a link to your new dataset on Hugging Face. 207 | 208 | <video src="https://github.com/user-attachments/assets/c586d460-3877-4b5f-a736-3248886d2134" controls width="600"></video> 209 | 210 | Your uploaded dataset will have a standardized format with the following structure: 211 | 212 | ```json 213 | { 214 | "timestamp": "2025-05-01T09:20:40.594878", 215 | "session_id": "1fe9f0fe-9331-4078-aacd-ec7ffb483b86", 216 | "name": "penguin lemon forest", 217 | "tool_calls": [...], // Detailed interaction records 218 | "messages": [...], // User/assistant messages 219 | "tags": ["highquality", "tasks"], 220 | "images": [...] // Screenshots of each state 221 | } 222 | ``` 223 | 224 | This structured format makes it easy for researchers to analyze patterns across different demonstrations and build better computer-use models. 225 | 226 | ```python 227 | from computer import Computer 228 | 229 | computer = Computer(os_type="macos", display="1024x768", memory="8GB", cpu="4") 230 | try: 231 | await computer.run() 232 | 233 | screenshot = await computer.interface.screenshot() 234 | with open("screenshot.png", "wb") as f: 235 | f.write(screenshot) 236 | 237 | await computer.interface.move_cursor(100, 100) 238 | await computer.interface.left_click() 239 | await computer.interface.right_click(300, 300) 240 | await computer.interface.double_click(400, 400) 241 | 242 | await computer.interface.type("Hello, World!") 243 | await computer.interface.press_key("enter") 244 | 245 | await computer.interface.set_clipboard("Test clipboard") 246 | content = await computer.interface.copy_to_clipboard() 247 | print(f"Clipboard content: {content}") 248 | finally: 249 | await computer.stop() 250 | ``` 251 | 252 | ## Example: Shopping List Demonstration 253 | 254 | Let's walk through a concrete example of creating a valuable demonstration: 255 | 256 | ### Task: Adding Shopping List Items to a Doordash Cart 257 | 258 | 1. **Start Recording**: Begin with a clean desktop and a text file containing a shopping list. 259 | 260 | 2. **Task Execution**: Open the file, read the list, open Safari, navigate to Doordash, and add each item to the cart. 261 | 262 | 3. 
**Narration**: Add messages like "Reading the shopping list" and "Searching for rice on Doordash" to provide context. 263 | 264 | 4. **Completion**: Verify all items are in the cart and end the recording. 265 | 266 | 5. **Tagging**: Add tags like `shopping`, `web-browsing`, `task-completion`, and `multi-step`. 267 | 268 | This type of demonstration is particularly valuable because it showcases real-world task completion requiring multiple applications and context switching. 269 | 270 | ### Exploring Community Datasets 271 | 272 | You can also learn from existing trajectory datasets contributed by the community: 273 | 274 | 1. Visit [Hugging Face Datasets tagged with 'cua'](https://huggingface.co/datasets?other=cua) 275 | 2. Explore different approaches to similar tasks 276 | 3. Download and analyze high-quality demonstrations 277 | 278 | ## Conclusion 279 | 280 | ### Summary 281 | 282 | In this guide, we've covered how to: 283 | - Set up the Computer-Use Interface with Gradio UI 284 | - Record high-quality human demonstrations 285 | - Organize and tag your trajectories 286 | - Share your datasets with the community 287 | 288 | By contributing your own demonstrations, you're helping to build more capable, human-like AI systems that can understand and execute complex computer tasks. 289 | 290 | ### Next Steps 291 | 292 | Now that you know how to create and share trajectories, consider these advanced techniques: 293 | 294 | - Create themed collections around specific productivity workflows 295 | - Collaborate with others to build comprehensive datasets 296 | - Use your datasets to fine-tune your own computer-use models 297 | 298 | ### Resources 299 | 300 | - [Computer-Use Interface GitHub](https://github.com/trycua/cua/tree/main/libs/computer) 301 | - [Hugging Face Datasets Documentation](https://huggingface.co/docs/datasets) 302 | - [Example Dataset: ddupont/test-dataset](https://huggingface.co/datasets/ddupont/test-dataset) 303 | ``` -------------------------------------------------------------------------------- /libs/python/pylume/pylume/pylume.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import sys 3 | import json 4 | import time 5 | import asyncio 6 | import subprocess 7 | from typing import Optional, List, Union, Callable, TypeVar, Any 8 | from functools import wraps 9 | import re 10 | import signal 11 | 12 | from .server import LumeServer 13 | from .client import LumeClient 14 | from .models import ( 15 | VMConfig, 16 | VMStatus, 17 | VMRunOpts, 18 | VMUpdateOpts, 19 | ImageRef, 20 | CloneSpec, 21 | SharedDirectory, 22 | ImageList, 23 | ) 24 | from .exceptions import ( 25 | LumeError, 26 | LumeServerError, 27 | LumeConnectionError, 28 | LumeTimeoutError, 29 | LumeNotFoundError, 30 | LumeConfigError, 31 | LumeVMError, 32 | LumeImageError, 33 | ) 34 | 35 | # Type variable for the decorator 36 | T = TypeVar("T") 37 | 38 | 39 | def ensure_server(func: Callable[..., T]) -> Callable[..., T]: 40 | """Decorator to ensure server is running before executing the method.""" 41 | 42 | @wraps(func) 43 | async def wrapper(self: "PyLume", *args: Any, **kwargs: Any) -> T: 44 | # ensure_running is an async method, so we need to await it 45 | await self.server.ensure_running() 46 | # Initialize client if needed 47 | await self._init_client() 48 | return await func(self, *args, **kwargs) # type: ignore 49 | 50 | return wrapper # type: ignore 51 | 52 | 53 | class PyLume: 54 | def __init__( 55 | self, 56 | debug: bool = False, 57 | 
server_start_timeout: int = 60, 58 | port: Optional[int] = None, 59 | use_existing_server: bool = False, 60 | host: str = "localhost", 61 | ): 62 | """Initialize the async PyLume client. 63 | 64 | Args: 65 | debug: Enable debug logging 66 | auto_start_server: Whether to automatically start the lume server if not running 67 | server_start_timeout: Timeout in seconds to wait for server to start 68 | port: Port number for the lume server. Required when use_existing_server is True. 69 | use_existing_server: If True, will try to connect to an existing server on the specified port 70 | instead of starting a new one. 71 | host: Host to use for connections (e.g., "localhost", "127.0.0.1", "host.docker.internal") 72 | """ 73 | if use_existing_server and port is None: 74 | raise LumeConfigError("Port must be specified when using an existing server") 75 | 76 | self.server = LumeServer( 77 | debug=debug, 78 | server_start_timeout=server_start_timeout, 79 | port=port, 80 | use_existing_server=use_existing_server, 81 | host=host, 82 | ) 83 | self.client = None 84 | 85 | async def __aenter__(self) -> "PyLume": 86 | """Async context manager entry.""" 87 | if self.server.use_existing_server: 88 | # Just ensure base_url is set for existing server 89 | if self.server.requested_port is None: 90 | raise LumeConfigError("Port must be specified when using an existing server") 91 | 92 | if not self.server.base_url: 93 | self.server.port = self.server.requested_port 94 | self.server.base_url = f"http://{self.server.host}:{self.server.port}/lume" 95 | 96 | # Ensure the server is running (will connect to existing or start new as needed) 97 | await self.server.ensure_running() 98 | 99 | # Initialize the client 100 | await self._init_client() 101 | return self 102 | 103 | async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: 104 | """Async context manager exit.""" 105 | if self.client is not None: 106 | await self.client.close() 107 | await self.server.stop() 108 | 109 | async def _init_client(self) -> None: 110 | """Initialize the client if not already initialized.""" 111 | if self.client is None: 112 | if self.server.base_url is None: 113 | raise RuntimeError("Server base URL not set") 114 | self.client = LumeClient(self.server.base_url, debug=self.server.debug) 115 | 116 | def _log_debug(self, message: str, **kwargs) -> None: 117 | """Log debug information if debug mode is enabled.""" 118 | if self.server.debug: 119 | print(f"DEBUG: {message}") 120 | if kwargs: 121 | print(json.dumps(kwargs, indent=2)) 122 | 123 | async def _handle_api_error(self, e: Exception, operation: str) -> None: 124 | """Handle API errors and raise appropriate custom exceptions.""" 125 | if isinstance(e, subprocess.SubprocessError): 126 | raise LumeConnectionError(f"Failed to connect to PyLume server: {str(e)}") 127 | elif isinstance(e, asyncio.TimeoutError): 128 | raise LumeTimeoutError(f"Request timed out: {str(e)}") 129 | 130 | if not hasattr(e, "status") and not isinstance(e, subprocess.CalledProcessError): 131 | raise LumeServerError(f"Unknown error during {operation}: {str(e)}") 132 | 133 | status_code = getattr(e, "status", 500) 134 | response_text = str(e) 135 | 136 | self._log_debug( 137 | f"{operation} request failed", status_code=status_code, response_text=response_text 138 | ) 139 | 140 | if status_code == 404: 141 | raise LumeNotFoundError(f"Resource not found during {operation}") 142 | elif status_code == 400: 143 | raise LumeConfigError(f"Invalid configuration for {operation}: {response_text}") 144 | elif status_code 
>= 500: 145 | raise LumeServerError( 146 | f"Server error during {operation}", 147 | status_code=status_code, 148 | response_text=response_text, 149 | ) 150 | else: 151 | raise LumeServerError( 152 | f"Error during {operation}", status_code=status_code, response_text=response_text 153 | ) 154 | 155 | async def _read_output(self) -> None: 156 | """Read and log server output.""" 157 | try: 158 | while True: 159 | if not self.server.server_process or self.server.server_process.poll() is not None: 160 | self._log_debug("Server process ended") 161 | break 162 | 163 | # Read stdout without blocking 164 | if self.server.server_process.stdout: 165 | while True: 166 | line = self.server.server_process.stdout.readline() 167 | if not line: 168 | break 169 | line = line.strip() 170 | self._log_debug(f"Server stdout: {line}") 171 | if "Server started" in line.decode("utf-8"): 172 | self._log_debug("Detected server started message") 173 | return 174 | 175 | # Read stderr without blocking 176 | if self.server.server_process.stderr: 177 | while True: 178 | line = self.server.server_process.stderr.readline() 179 | if not line: 180 | break 181 | line = line.strip() 182 | self._log_debug(f"Server stderr: {line}") 183 | if "error" in line.decode("utf-8").lower(): 184 | raise RuntimeError(f"Server error: {line}") 185 | 186 | await asyncio.sleep(0.1) # Small delay to prevent CPU spinning 187 | except Exception as e: 188 | self._log_debug(f"Error in output reader: {str(e)}") 189 | raise 190 | 191 | @ensure_server 192 | async def create_vm(self, spec: Union[VMConfig, dict]) -> None: 193 | """Create a VM with the given configuration.""" 194 | # Ensure client is initialized 195 | await self._init_client() 196 | 197 | if isinstance(spec, VMConfig): 198 | spec = spec.model_dump(by_alias=True, exclude_none=True) 199 | 200 | # Suppress optional attribute access errors 201 | self.client.print_curl("POST", "/vms", spec) # type: ignore[attr-defined] 202 | await self.client.post("/vms", spec) # type: ignore[attr-defined] 203 | 204 | @ensure_server 205 | async def run_vm(self, name: str, opts: Optional[Union[VMRunOpts, dict]] = None) -> None: 206 | """Run a VM.""" 207 | if opts is None: 208 | opts = VMRunOpts(no_display=False) # type: ignore[attr-defined] 209 | elif isinstance(opts, dict): 210 | opts = VMRunOpts(**opts) 211 | 212 | payload = opts.model_dump(by_alias=True, exclude_none=True) 213 | self.client.print_curl("POST", f"/vms/{name}/run", payload) # type: ignore[attr-defined] 214 | await self.client.post(f"/vms/{name}/run", payload) # type: ignore[attr-defined] 215 | 216 | @ensure_server 217 | async def list_vms(self) -> List[VMStatus]: 218 | """List all VMs.""" 219 | data = await self.client.get("/vms") # type: ignore[attr-defined] 220 | return [VMStatus.model_validate(vm) for vm in data] 221 | 222 | @ensure_server 223 | async def get_vm(self, name: str) -> VMStatus: 224 | """Get VM details.""" 225 | data = await self.client.get(f"/vms/{name}") # type: ignore[attr-defined] 226 | return VMStatus.model_validate(data) 227 | 228 | @ensure_server 229 | async def update_vm(self, name: str, params: Union[VMUpdateOpts, dict]) -> None: 230 | """Update VM settings.""" 231 | if isinstance(params, dict): 232 | params = VMUpdateOpts(**params) 233 | 234 | payload = params.model_dump(by_alias=True, exclude_none=True) 235 | self.client.print_curl("PATCH", f"/vms/{name}", payload) # type: ignore[attr-defined] 236 | await self.client.patch(f"/vms/{name}", payload) # type: ignore[attr-defined] 237 | 238 | @ensure_server 239 | async 
def stop_vm(self, name: str) -> None: 240 | """Stop a VM.""" 241 | await self.client.post(f"/vms/{name}/stop") # type: ignore[attr-defined] 242 | 243 | @ensure_server 244 | async def delete_vm(self, name: str) -> None: 245 | """Delete a VM.""" 246 | await self.client.delete(f"/vms/{name}") # type: ignore[attr-defined] 247 | 248 | @ensure_server 249 | async def pull_image( 250 | self, spec: Union[ImageRef, dict, str], name: Optional[str] = None 251 | ) -> None: 252 | """Pull a VM image.""" 253 | await self._init_client() 254 | if isinstance(spec, str): 255 | if ":" in spec: 256 | image_str = spec 257 | else: 258 | image_str = f"{spec}:latest" 259 | registry = "ghcr.io" 260 | organization = "trycua" 261 | elif isinstance(spec, dict): 262 | image = spec.get("image", "") 263 | tag = spec.get("tag", "latest") 264 | image_str = f"{image}:{tag}" 265 | registry = spec.get("registry", "ghcr.io") 266 | organization = spec.get("organization", "trycua") 267 | else: 268 | image_str = f"{spec.image}:{spec.tag}" 269 | registry = spec.registry 270 | organization = spec.organization 271 | 272 | payload = { 273 | "image": image_str, 274 | "name": name, 275 | "registry": registry, 276 | "organization": organization, 277 | } 278 | 279 | self.client.print_curl("POST", "/pull", payload) # type: ignore[attr-defined] 280 | await self.client.post("/pull", payload, timeout=300.0) # type: ignore[attr-defined] 281 | 282 | @ensure_server 283 | async def clone_vm(self, name: str, new_name: str) -> None: 284 | """Clone a VM with the given name to a new VM with new_name.""" 285 | config = CloneSpec(name=name, newName=new_name) 286 | self.client.print_curl("POST", "/vms/clone", config.model_dump()) # type: ignore[attr-defined] 287 | await self.client.post("/vms/clone", config.model_dump()) # type: ignore[attr-defined] 288 | 289 | @ensure_server 290 | async def get_latest_ipsw_url(self) -> str: 291 | """Get the latest IPSW URL.""" 292 | await self._init_client() 293 | data = await self.client.get("/ipsw") # type: ignore[attr-defined] 294 | return data["url"] 295 | 296 | @ensure_server 297 | async def get_images(self, organization: Optional[str] = None) -> ImageList: 298 | """Get list of available images.""" 299 | await self._init_client() 300 | params = {"organization": organization} if organization else None 301 | data = await self.client.get("/images", params) # type: ignore[attr-defined] 302 | return ImageList(root=data) 303 | 304 | async def close(self) -> None: 305 | """Close the client and stop the server.""" 306 | if self.client is not None: 307 | await self.client.close() 308 | self.client = None 309 | await asyncio.sleep(1) 310 | await self.server.stop() 311 | 312 | async def _ensure_client(self) -> None: 313 | """Ensure client is initialized.""" 314 | if self.client is None: 315 | await self._init_client() 316 | ``` -------------------------------------------------------------------------------- /libs/python/mcp-server/mcp_server/session_manager.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Session Manager for MCP Server - Handles concurrent client sessions with proper resource isolation. 
3 | 4 | This module provides: 5 | - Per-session computer instance management 6 | - Resource pooling and lifecycle management 7 | - Graceful session cleanup 8 | - Concurrent task execution support 9 | """ 10 | 11 | import asyncio 12 | import logging 13 | import time 14 | import uuid 15 | from typing import Dict, Optional, Any, List, Set 16 | from dataclasses import dataclass, field 17 | from contextlib import asynccontextmanager 18 | import weakref 19 | 20 | logger = logging.getLogger("mcp-server.session_manager") 21 | 22 | @dataclass 23 | class SessionInfo: 24 | """Information about an active session.""" 25 | session_id: str 26 | computer: Any # Computer instance 27 | created_at: float 28 | last_activity: float 29 | active_tasks: Set[str] = field(default_factory=set) 30 | is_shutting_down: bool = False 31 | 32 | class ComputerPool: 33 | """Pool of computer instances for efficient resource management.""" 34 | 35 | def __init__(self, max_size: int = 5, idle_timeout: float = 300.0): 36 | self.max_size = max_size 37 | self.idle_timeout = idle_timeout 38 | self._available: List[Any] = [] 39 | self._in_use: Set[Any] = set() 40 | self._creation_lock = asyncio.Lock() 41 | 42 | async def acquire(self) -> Any: 43 | """Acquire a computer instance from the pool.""" 44 | # Try to get an available instance 45 | if self._available: 46 | computer = self._available.pop() 47 | self._in_use.add(computer) 48 | logger.debug(f"Reusing computer instance from pool") 49 | return computer 50 | 51 | # Check if we can create a new one 52 | async with self._creation_lock: 53 | if len(self._in_use) < self.max_size: 54 | logger.debug("Creating new computer instance") 55 | from computer import Computer 56 | computer = Computer(verbosity=logging.INFO) 57 | await computer.run() 58 | self._in_use.add(computer) 59 | return computer 60 | 61 | # Wait for an instance to become available 62 | logger.debug("Waiting for computer instance to become available") 63 | while not self._available: 64 | await asyncio.sleep(0.1) 65 | 66 | computer = self._available.pop() 67 | self._in_use.add(computer) 68 | return computer 69 | 70 | async def release(self, computer: Any) -> None: 71 | """Release a computer instance back to the pool.""" 72 | if computer in self._in_use: 73 | self._in_use.remove(computer) 74 | self._available.append(computer) 75 | logger.debug("Released computer instance back to pool") 76 | 77 | async def cleanup_idle(self) -> None: 78 | """Clean up idle computer instances.""" 79 | current_time = time.time() 80 | idle_instances = [] 81 | 82 | for computer in self._available[:]: 83 | # Check if computer has been idle too long 84 | # Note: We'd need to track last use time per instance for this 85 | # For now, we'll keep instances in the pool 86 | pass 87 | 88 | async def shutdown(self) -> None: 89 | """Shutdown all computer instances in the pool.""" 90 | logger.info("Shutting down computer pool") 91 | 92 | # Close all available instances 93 | for computer in self._available: 94 | try: 95 | if hasattr(computer, 'close'): 96 | await computer.close() 97 | elif hasattr(computer, 'stop'): 98 | await computer.stop() 99 | except Exception as e: 100 | logger.warning(f"Error closing computer instance: {e}") 101 | 102 | # Close all in-use instances 103 | for computer in self._in_use: 104 | try: 105 | if hasattr(computer, 'close'): 106 | await computer.close() 107 | elif hasattr(computer, 'stop'): 108 | await computer.stop() 109 | except Exception as e: 110 | logger.warning(f"Error closing computer instance: {e}") 111 | 112 | 
self._available.clear() 113 | self._in_use.clear() 114 | 115 | class SessionManager: 116 | """Manages concurrent client sessions with proper resource isolation.""" 117 | 118 | def __init__(self, max_concurrent_sessions: int = 10): 119 | self.max_concurrent_sessions = max_concurrent_sessions 120 | self._sessions: Dict[str, SessionInfo] = {} 121 | self._computer_pool = ComputerPool() 122 | self._session_lock = asyncio.Lock() 123 | self._cleanup_task: Optional[asyncio.Task] = None 124 | self._shutdown_event = asyncio.Event() 125 | 126 | async def start(self) -> None: 127 | """Start the session manager and cleanup task.""" 128 | logger.info("Starting session manager") 129 | self._cleanup_task = asyncio.create_task(self._cleanup_loop()) 130 | 131 | async def stop(self) -> None: 132 | """Stop the session manager and cleanup all resources.""" 133 | logger.info("Stopping session manager") 134 | self._shutdown_event.set() 135 | 136 | if self._cleanup_task: 137 | self._cleanup_task.cancel() 138 | try: 139 | await self._cleanup_task 140 | except asyncio.CancelledError: 141 | pass 142 | 143 | # Force cleanup all sessions 144 | async with self._session_lock: 145 | session_ids = list(self._sessions.keys()) 146 | 147 | for session_id in session_ids: 148 | await self._force_cleanup_session(session_id) 149 | 150 | await self._computer_pool.shutdown() 151 | 152 | @asynccontextmanager 153 | async def get_session(self, session_id: Optional[str] = None) -> Any: 154 | """Get or create a session with proper resource management.""" 155 | if session_id is None: 156 | session_id = str(uuid.uuid4()) 157 | 158 | # Check if session exists and is not shutting down 159 | async with self._session_lock: 160 | if session_id in self._sessions: 161 | session = self._sessions[session_id] 162 | if session.is_shutting_down: 163 | raise RuntimeError(f"Session {session_id} is shutting down") 164 | session.last_activity = time.time() 165 | computer = session.computer 166 | else: 167 | # Create new session 168 | if len(self._sessions) >= self.max_concurrent_sessions: 169 | raise RuntimeError(f"Maximum concurrent sessions ({self.max_concurrent_sessions}) reached") 170 | 171 | computer = await self._computer_pool.acquire() 172 | session = SessionInfo( 173 | session_id=session_id, 174 | computer=computer, 175 | created_at=time.time(), 176 | last_activity=time.time() 177 | ) 178 | self._sessions[session_id] = session 179 | logger.info(f"Created new session: {session_id}") 180 | 181 | try: 182 | yield session 183 | finally: 184 | # Update last activity 185 | async with self._session_lock: 186 | if session_id in self._sessions: 187 | self._sessions[session_id].last_activity = time.time() 188 | 189 | async def register_task(self, session_id: str, task_id: str) -> None: 190 | """Register a task for a session.""" 191 | async with self._session_lock: 192 | if session_id in self._sessions: 193 | self._sessions[session_id].active_tasks.add(task_id) 194 | logger.debug(f"Registered task {task_id} for session {session_id}") 195 | 196 | async def unregister_task(self, session_id: str, task_id: str) -> None: 197 | """Unregister a task from a session.""" 198 | async with self._session_lock: 199 | if session_id in self._sessions: 200 | self._sessions[session_id].active_tasks.discard(task_id) 201 | logger.debug(f"Unregistered task {task_id} from session {session_id}") 202 | 203 | async def cleanup_session(self, session_id: str) -> None: 204 | """Cleanup a specific session.""" 205 | async with self._session_lock: 206 | if session_id not in 
self._sessions: 207 | return 208 | 209 | session = self._sessions[session_id] 210 | 211 | # Check if session has active tasks 212 | if session.active_tasks: 213 | logger.info(f"Session {session_id} has active tasks, marking for shutdown") 214 | session.is_shutting_down = True 215 | return 216 | 217 | # Actually cleanup the session 218 | await self._force_cleanup_session(session_id) 219 | 220 | async def _force_cleanup_session(self, session_id: str) -> None: 221 | """Force cleanup a session regardless of active tasks.""" 222 | async with self._session_lock: 223 | if session_id not in self._sessions: 224 | return 225 | 226 | session = self._sessions[session_id] 227 | logger.info(f"Cleaning up session: {session_id}") 228 | 229 | # Release computer back to pool 230 | await self._computer_pool.release(session.computer) 231 | 232 | # Remove session 233 | del self._sessions[session_id] 234 | 235 | async def _cleanup_loop(self) -> None: 236 | """Background task to cleanup idle sessions.""" 237 | while not self._shutdown_event.is_set(): 238 | try: 239 | await asyncio.sleep(60) # Run cleanup every minute 240 | 241 | current_time = time.time() 242 | idle_timeout = 600.0 # 10 minutes 243 | 244 | async with self._session_lock: 245 | idle_sessions = [] 246 | for session_id, session in self._sessions.items(): 247 | if not session.is_shutting_down and not session.active_tasks: 248 | if current_time - session.last_activity > idle_timeout: 249 | idle_sessions.append(session_id) 250 | 251 | # Cleanup idle sessions 252 | for session_id in idle_sessions: 253 | await self._force_cleanup_session(session_id) 254 | logger.info(f"Cleaned up idle session: {session_id}") 255 | 256 | except asyncio.CancelledError: 257 | break 258 | except Exception as e: 259 | logger.error(f"Error in cleanup loop: {e}") 260 | 261 | def get_session_stats(self) -> Dict[str, Any]: 262 | """Get statistics about active sessions.""" 263 | async def _get_stats(): 264 | async with self._session_lock: 265 | return { 266 | "total_sessions": len(self._sessions), 267 | "max_concurrent": self.max_concurrent_sessions, 268 | "sessions": { 269 | session_id: { 270 | "created_at": session.created_at, 271 | "last_activity": session.last_activity, 272 | "active_tasks": len(session.active_tasks), 273 | "is_shutting_down": session.is_shutting_down 274 | } 275 | for session_id, session in self._sessions.items() 276 | } 277 | } 278 | 279 | # Run in current event loop or create new one 280 | try: 281 | loop = asyncio.get_running_loop() 282 | return asyncio.run_coroutine_threadsafe(_get_stats(), loop).result() 283 | except RuntimeError: 284 | # No event loop running, create a new one 285 | return asyncio.run(_get_stats()) 286 | 287 | # Global session manager instance 288 | _session_manager: Optional[SessionManager] = None 289 | 290 | def get_session_manager() -> SessionManager: 291 | """Get the global session manager instance.""" 292 | global _session_manager 293 | if _session_manager is None: 294 | _session_manager = SessionManager() 295 | return _session_manager 296 | 297 | async def initialize_session_manager() -> None: 298 | """Initialize the global session manager.""" 299 | global _session_manager 300 | if _session_manager is None: 301 | _session_manager = SessionManager() 302 | await _session_manager.start() 303 | return _session_manager 304 | 305 | async def shutdown_session_manager() -> None: 306 | """Shutdown the global session manager.""" 307 | global _session_manager 308 | if _session_manager is not None: 309 | await _session_manager.stop() 310 | 
_session_manager = None 311 | ``` -------------------------------------------------------------------------------- /.github/workflows/pypi-reusable-publish.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: Reusable Package Publish Workflow 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | package_name: 7 | description: "Name of the package (e.g. pylume, computer, agent)" 8 | required: true 9 | type: string 10 | package_dir: 11 | description: "Directory containing the package relative to workspace root (e.g. libs/python/pylume)" 12 | required: true 13 | type: string 14 | version: 15 | description: "Version to publish" 16 | required: true 17 | type: string 18 | is_lume_package: 19 | description: "Whether this package includes the lume binary" 20 | required: false 21 | type: boolean 22 | default: false 23 | base_package_name: 24 | description: "PyPI package name (e.g. pylume, cua-agent)" 25 | required: true 26 | type: string 27 | make_latest: 28 | description: "Whether to mark this release as latest (should only be true for lume)" 29 | required: false 30 | type: boolean 31 | default: false 32 | secrets: 33 | PYPI_TOKEN: 34 | required: true 35 | outputs: 36 | version: 37 | description: "The version that was published" 38 | value: ${{ jobs.build-and-publish.outputs.version }} 39 | 40 | jobs: 41 | build-and-publish: 42 | runs-on: macos-latest 43 | permissions: 44 | contents: write # This permission is needed for creating releases 45 | outputs: 46 | version: ${{ steps.set-version.outputs.version }} 47 | steps: 48 | - uses: actions/checkout@v4 49 | with: 50 | fetch-depth: 0 # Full history for release creation 51 | 52 | - name: Set up Python 53 | uses: actions/setup-python@v4 54 | with: 55 | python-version: "3.11" 56 | 57 | - name: Create root pdm.lock file 58 | run: | 59 | # Create an empty pdm.lock file in the root 60 | touch pdm.lock 61 | 62 | - name: Install PDM 63 | uses: pdm-project/setup-pdm@v3 64 | with: 65 | python-version: "3.11" 66 | cache: true 67 | 68 | - name: Set version 69 | id: set-version 70 | run: | 71 | echo "VERSION=${{ inputs.version }}" >> $GITHUB_ENV 72 | echo "version=${{ inputs.version }}" >> $GITHUB_OUTPUT 73 | 74 | - name: Verify version consistency 75 | run: | 76 | # Install toml parser 77 | pip install toml 78 | 79 | # Verify version matches using script (exits with error if mismatch) 80 | python ${GITHUB_WORKSPACE}/.github/scripts/get_pyproject_version.py \ 81 | ${{ inputs.package_dir }}/pyproject.toml \ 82 | ${{ inputs.version }} 83 | 84 | - name: Initialize PDM in package directory 85 | run: | 86 | # Make sure we're working with a properly initialized PDM project 87 | cd ${{ inputs.package_dir }} 88 | 89 | # Create pdm.lock if it doesn't exist 90 | if [ ! -f "pdm.lock" ]; then 91 | echo "No pdm.lock found, initializing PDM project..." 92 | pdm lock 93 | fi 94 | 95 | # Conditional step for lume binary download (only for pylume package) 96 | - name: Download and setup lume binary 97 | if: inputs.is_lume_package 98 | run: | 99 | # Create a temporary directory for extraction 100 | mkdir -p temp_lume 101 | 102 | # Download the latest lume release directly 103 | echo "Downloading latest lume version..." 
104 | curl -sL "https://github.com/trycua/lume/releases/latest/download/lume.tar.gz" -o temp_lume/lume.tar.gz 105 | 106 | # Extract the tar file (ignore ownership and suppress warnings) 107 | cd temp_lume && tar --no-same-owner -xzf lume.tar.gz 108 | 109 | # Make the binary executable 110 | chmod +x lume 111 | 112 | # Copy the lume binary to the correct location in the pylume package 113 | mkdir -p "${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pylume" 114 | cp lume "${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pylume/lume" 115 | 116 | # Verify the binary exists and is executable 117 | test -x "${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pylume/lume" || { echo "lume binary not found or not executable"; exit 1; } 118 | 119 | # Get the version from the downloaded binary for reference 120 | LUME_VERSION=$(./lume --version | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' || echo "unknown") 121 | echo "Using lume version: $LUME_VERSION" 122 | 123 | # Cleanup 124 | cd "${GITHUB_WORKSPACE}" && rm -rf temp_lume 125 | 126 | # Save the lume version for reference 127 | echo "LUME_VERSION=${LUME_VERSION}" >> $GITHUB_ENV 128 | 129 | - name: Build and publish 130 | env: 131 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 132 | run: | 133 | cd ${{ inputs.package_dir }} 134 | # Build with PDM 135 | pdm build 136 | 137 | # For pylume package, verify the binary is in the wheel 138 | if [ "${{ inputs.is_lume_package }}" = "true" ]; then 139 | python -m pip install wheel 140 | wheel unpack dist/*.whl --dest temp_wheel 141 | echo "Listing contents of wheel directory:" 142 | find temp_wheel -type f 143 | test -f temp_wheel/pylume-*/pylume/lume || { echo "lume binary not found in wheel"; exit 1; } 144 | rm -rf temp_wheel 145 | echo "Publishing ${{ inputs.base_package_name }} ${VERSION} with lume ${LUME_VERSION}" 146 | else 147 | echo "Publishing ${{ inputs.base_package_name }} ${VERSION}" 148 | fi 149 | 150 | # Install and use twine directly instead of PDM publish 151 | echo "Installing twine for direct publishing..." 152 | pip install twine 153 | 154 | echo "Publishing to PyPI using twine..." 155 | TWINE_USERNAME="__token__" TWINE_PASSWORD="$PYPI_TOKEN" python -m twine upload dist/* 156 | 157 | # Save the wheel file path for the release 158 | WHEEL_FILE=$(ls dist/*.whl | head -1) 159 | echo "WHEEL_FILE=${WHEEL_FILE}" >> $GITHUB_ENV 160 | 161 | - name: Prepare Simple Release Notes 162 | if: startsWith(github.ref, 'refs/tags/') 163 | run: | 164 | # Create release notes based on package type 165 | echo "# ${{ inputs.base_package_name }} v${VERSION}" > release_notes.md 166 | echo "" >> release_notes.md 167 | 168 | if [ "${{ inputs.package_name }}" = "pylume" ]; then 169 | echo "## Python SDK for lume - run macOS and Linux VMs on Apple Silicon" >> release_notes.md 170 | echo "" >> release_notes.md 171 | echo "This package provides Python bindings for the lume virtualization tool." 
>> release_notes.md 172 | echo "" >> release_notes.md 173 | echo "## Dependencies" >> release_notes.md 174 | echo "* lume binary: v${LUME_VERSION}" >> release_notes.md 175 | elif [ "${{ inputs.package_name }}" = "computer" ]; then 176 | echo "## Computer control library for the Computer Universal Automation (CUA) project" >> release_notes.md 177 | echo "" >> release_notes.md 178 | echo "## Dependencies" >> release_notes.md 179 | echo "* pylume: ${PYLUME_VERSION:-latest}" >> release_notes.md 180 | elif [ "${{ inputs.package_name }}" = "agent" ]; then 181 | echo "## Dependencies" >> release_notes.md 182 | echo "* cua-computer: ${COMPUTER_VERSION:-latest}" >> release_notes.md 183 | echo "* cua-som: ${SOM_VERSION:-latest}" >> release_notes.md 184 | echo "" >> release_notes.md 185 | echo "## Installation Options" >> release_notes.md 186 | echo "" >> release_notes.md 187 | echo "### Basic installation with Anthropic" >> release_notes.md 188 | echo '```bash' >> release_notes.md 189 | echo "pip install cua-agent[anthropic]==${VERSION}" >> release_notes.md 190 | echo '```' >> release_notes.md 191 | echo "" >> release_notes.md 192 | echo "### With SOM (recommended)" >> release_notes.md 193 | echo '```bash' >> release_notes.md 194 | echo "pip install cua-agent[som]==${VERSION}" >> release_notes.md 195 | echo '```' >> release_notes.md 196 | echo "" >> release_notes.md 197 | echo "### All features" >> release_notes.md 198 | echo '```bash' >> release_notes.md 199 | echo "pip install cua-agent[all]==${VERSION}" >> release_notes.md 200 | echo '```' >> release_notes.md 201 | elif [ "${{ inputs.package_name }}" = "som" ]; then 202 | echo "## Computer Vision and OCR library for detecting and analyzing UI elements" >> release_notes.md 203 | echo "" >> release_notes.md 204 | echo "This package provides enhanced UI understanding capabilities through computer vision and OCR." >> release_notes.md 205 | elif [ "${{ inputs.package_name }}" = "computer-server" ]; then 206 | echo "## Computer Server for the Computer Universal Automation (CUA) project" >> release_notes.md 207 | echo "" >> release_notes.md 208 | echo "A FastAPI-based server implementation for computer control." >> release_notes.md 209 | echo "" >> release_notes.md 210 | echo "## Dependencies" >> release_notes.md 211 | echo "* cua-computer: ${COMPUTER_VERSION:-latest}" >> release_notes.md 212 | echo "" >> release_notes.md 213 | echo "## Usage" >> release_notes.md 214 | echo '```bash' >> release_notes.md 215 | echo "# Run the server" >> release_notes.md 216 | echo "cua-computer-server" >> release_notes.md 217 | echo '```' >> release_notes.md 218 | elif [ "${{ inputs.package_name }}" = "mcp-server" ]; then 219 | echo "## MCP Server for the Computer-Use Agent (CUA)" >> release_notes.md 220 | echo "" >> release_notes.md 221 | echo "This package provides MCP (Model Context Protocol) integration for CUA agents, allowing them to be used with Claude Desktop, Cursor, and other MCP clients." 
>> release_notes.md 222 | echo "" >> release_notes.md 223 | echo "## Dependencies" >> release_notes.md 224 | echo "* cua-computer: ${COMPUTER_VERSION:-latest}" >> release_notes.md 225 | echo "* cua-agent: ${AGENT_VERSION:-latest}" >> release_notes.md 226 | echo "" >> release_notes.md 227 | echo "## Usage" >> release_notes.md 228 | echo '```bash' >> release_notes.md 229 | echo "# Run the MCP server directly" >> release_notes.md 230 | echo "cua-mcp-server" >> release_notes.md 231 | echo '```' >> release_notes.md 232 | echo "" >> release_notes.md 233 | echo "## Claude Desktop Integration" >> release_notes.md 234 | echo "Add to your Claude Desktop configuration (~/.config/claude-desktop/claude_desktop_config.json or OS-specific location):" >> release_notes.md 235 | echo '```json' >> release_notes.md 236 | echo '"mcpServers": {' >> release_notes.md 237 | echo ' "cua-agent": {' >> release_notes.md 238 | echo ' "command": "cua-mcp-server",' >> release_notes.md 239 | echo ' "args": [],' >> release_notes.md 240 | echo ' "env": {' >> release_notes.md 241 | echo ' "CUA_AGENT_LOOP": "OMNI",' >> release_notes.md 242 | echo ' "CUA_MODEL_PROVIDER": "ANTHROPIC",' >> release_notes.md 243 | echo ' "CUA_MODEL_NAME": "claude-3-opus-20240229",' >> release_notes.md 244 | echo ' "ANTHROPIC_API_KEY": "your-api-key",' >> release_notes.md 245 | echo ' "PYTHONIOENCODING": "utf-8"' >> release_notes.md 246 | echo ' }' >> release_notes.md 247 | echo ' }' >> release_notes.md 248 | echo '}' >> release_notes.md 249 | echo '```' >> release_notes.md 250 | fi 251 | 252 | # Add installation section if not agent (which has its own installation section) 253 | if [ "${{ inputs.package_name }}" != "agent" ]; then 254 | echo "" >> release_notes.md 255 | echo "## Installation" >> release_notes.md 256 | echo '```bash' >> release_notes.md 257 | echo "pip install ${{ inputs.base_package_name }}==${VERSION}" >> release_notes.md 258 | echo '```' >> release_notes.md 259 | fi 260 | 261 | echo "Release notes created:" 262 | cat release_notes.md 263 | 264 | - name: Create GitHub Release 265 | uses: softprops/action-gh-release@v2 266 | if: startsWith(github.ref, 'refs/tags/') 267 | with: 268 | name: "${{ inputs.base_package_name }} v${{ env.VERSION }}" 269 | body_path: release_notes.md 270 | files: ${{ inputs.package_dir }}/${{ env.WHEEL_FILE }} 271 | draft: false 272 | prerelease: false 273 | make_latest: ${{ inputs.package_name == 'lume' }} 274 | env: 275 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 276 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/loops/composed_grounded.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Composed-grounded agent loop implementation that combines grounding and thinking models. 3 | Uses a two-stage approach: grounding model for element detection, thinking model for reasoning. 
4 | """ 5 | 6 | import uuid 7 | import asyncio 8 | import json 9 | import base64 10 | from typing import Dict, List, Any, Optional, Tuple 11 | from io import BytesIO 12 | from PIL import Image 13 | import litellm 14 | 15 | from ..decorators import register_agent 16 | from ..types import Messages, AgentResponse, Tools, AgentCapability 17 | from ..loops.base import AsyncAgentConfig 18 | from ..responses import ( 19 | convert_computer_calls_xy2desc, 20 | convert_responses_items_to_completion_messages, 21 | convert_completion_messages_to_responses_items, 22 | convert_computer_calls_desc2xy, 23 | get_all_element_descriptions 24 | ) 25 | from ..agent import find_agent_config 26 | 27 | GROUNDED_COMPUTER_TOOL_SCHEMA = { 28 | "type": "function", 29 | "function": { 30 | "name": "computer", 31 | "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').", 32 | "parameters": { 33 | "type": "object", 34 | "properties": { 35 | "action": { 36 | "type": "string", 37 | "enum": [ 38 | "screenshot", 39 | "click", 40 | "double_click", 41 | "drag", 42 | "type", 43 | "keypress", 44 | "scroll", 45 | "move", 46 | "wait", 47 | "get_current_url", 48 | "get_dimensions", 49 | "get_environment" 50 | ], 51 | "description": "The action to perform (required for all actions)" 52 | }, 53 | "element_description": { 54 | "type": "string", 55 | "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)" 56 | }, 57 | "start_element_description": { 58 | "type": "string", 59 | "description": "Description of the element to start dragging from (required for drag action)" 60 | }, 61 | "end_element_description": { 62 | "type": "string", 63 | "description": "Description of the element to drag to (required for drag action)" 64 | }, 65 | "text": { 66 | "type": "string", 67 | "description": "The text to type (required for type action)" 68 | }, 69 | "keys": { 70 | "type": "array", 71 | "items": { 72 | "type": "string" 73 | }, 74 | "description": "Key(s) to press (required for keypress action)" 75 | }, 76 | "button": { 77 | "type": "string", 78 | "enum": [ 79 | "left", 80 | "right", 81 | "wheel", 82 | "back", 83 | "forward" 84 | ], 85 | "description": "The mouse button to use for click action (required for click and double_click action)", 86 | }, 87 | "scroll_x": { 88 | "type": "integer", 89 | "description": "Horizontal scroll amount for scroll action (required for scroll action)", 90 | }, 91 | "scroll_y": { 92 | "type": "integer", 93 | "description": "Vertical scroll amount for scroll action (required for scroll action)", 94 | }, 95 | }, 96 | "required": [ 97 | "action" 98 | ] 99 | } 100 | } 101 | } 102 | 103 | def _prepare_tools_for_grounded(tool_schemas: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 104 | """Prepare tools for grounded API format""" 105 | grounded_tools = [] 106 | 107 | for schema in tool_schemas: 108 | if schema["type"] == "computer": 109 | grounded_tools.append(GROUNDED_COMPUTER_TOOL_SCHEMA) 110 | else: 111 | grounded_tools.append(schema) 112 | 113 | return grounded_tools 114 | 115 | def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]: 116 | """Get the last computer call output image from messages.""" 117 | for message in reversed(messages): 118 | if (isinstance(message, dict) and 119 | 
message.get("type") == "computer_call_output" and 120 | isinstance(message.get("output"), dict) and 121 | message["output"].get("type") == "input_image"): 122 | image_url = message["output"].get("image_url", "") 123 | if image_url.startswith("data:image/png;base64,"): 124 | return image_url.split(",", 1)[1] 125 | return None 126 | 127 | 128 | @register_agent(r".*\+.*", priority=1) 129 | class ComposedGroundedConfig(AsyncAgentConfig): 130 | """ 131 | Composed-grounded agent configuration that uses both grounding and thinking models. 132 | 133 | The model parameter should be in format: "grounding_model+thinking_model" 134 | e.g., "huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro" 135 | """ 136 | 137 | def __init__(self): 138 | self.desc2xy: Dict[str, Tuple[float, float]] = {} 139 | 140 | async def predict_step( 141 | self, 142 | messages: List[Dict[str, Any]], 143 | model: str, 144 | tools: Optional[List[Dict[str, Any]]] = None, 145 | max_retries: Optional[int] = None, 146 | stream: bool = False, 147 | computer_handler=None, 148 | use_prompt_caching: Optional[bool] = False, 149 | _on_api_start=None, 150 | _on_api_end=None, 151 | _on_usage=None, 152 | _on_screenshot=None, 153 | **kwargs 154 | ) -> Dict[str, Any]: 155 | """ 156 | Composed-grounded predict step implementation. 157 | 158 | Process: 159 | 0. Store last computer call image, if none then take a screenshot 160 | 1. Convert computer calls from xy to descriptions 161 | 2. Convert responses items to completion messages 162 | 3. Call thinking model with litellm.acompletion 163 | 4. Convert completion messages to responses items 164 | 5. Get all element descriptions and populate desc2xy mapping 165 | 6. Convert computer calls from descriptions back to xy coordinates 166 | 7. Return output and usage 167 | """ 168 | # Parse the composed model 169 | if "+" not in model: 170 | raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}") 171 | grounding_model, thinking_model = model.split("+", 1) 172 | 173 | pre_output_items = [] 174 | 175 | # Step 0: Store last computer call image, if none then take a screenshot 176 | last_image_b64 = get_last_computer_call_image(messages) 177 | if last_image_b64 is None: 178 | # Take a screenshot 179 | screenshot_b64 = await computer_handler.screenshot() # type: ignore 180 | if screenshot_b64: 181 | 182 | call_id = uuid.uuid4().hex 183 | pre_output_items += [ 184 | { 185 | "type": "message", 186 | "role": "assistant", 187 | "content": [ 188 | { 189 | "type": "output_text", 190 | "text": "Taking a screenshot to see the current computer screen." 
191 | } 192 | ] 193 | }, 194 | { 195 | "action": { 196 | "type": "screenshot" 197 | }, 198 | "call_id": call_id, 199 | "status": "completed", 200 | "type": "computer_call" 201 | }, 202 | { 203 | "type": "computer_call_output", 204 | "call_id": call_id, 205 | "output": { 206 | "type": "input_image", 207 | "image_url": f"data:image/png;base64,{screenshot_b64}" 208 | } 209 | }, 210 | ] 211 | last_image_b64 = screenshot_b64 212 | 213 | # Call screenshot callback if provided 214 | if _on_screenshot: 215 | await _on_screenshot(screenshot_b64) 216 | 217 | tool_schemas = _prepare_tools_for_grounded(tools) # type: ignore 218 | 219 | # Step 1: Convert computer calls from xy to descriptions 220 | input_messages = messages + pre_output_items 221 | messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy) 222 | 223 | # Step 2: Convert responses items to completion messages 224 | completion_messages = convert_responses_items_to_completion_messages( 225 | messages_with_descriptions, 226 | allow_images_in_tool_results=False 227 | ) 228 | 229 | # Step 3: Call thinking model with litellm.acompletion 230 | api_kwargs = { 231 | "model": thinking_model, 232 | "messages": completion_messages, 233 | "tools": tool_schemas, 234 | "max_retries": max_retries, 235 | "stream": stream, 236 | **kwargs 237 | } 238 | 239 | if use_prompt_caching: 240 | api_kwargs["use_prompt_caching"] = use_prompt_caching 241 | 242 | # Call API start hook 243 | if _on_api_start: 244 | await _on_api_start(api_kwargs) 245 | 246 | # Make the completion call 247 | response = await litellm.acompletion(**api_kwargs) 248 | 249 | # Call API end hook 250 | if _on_api_end: 251 | await _on_api_end(api_kwargs, response) 252 | 253 | # Extract usage information 254 | usage = { 255 | **response.usage.model_dump(), # type: ignore 256 | "response_cost": response._hidden_params.get("response_cost", 0.0), 257 | } 258 | if _on_usage: 259 | await _on_usage(usage) 260 | 261 | # Step 4: Convert completion messages back to responses items format 262 | response_dict = response.model_dump() # type: ignore 263 | choice_messages = [choice["message"] for choice in response_dict["choices"]] 264 | thinking_output_items = [] 265 | 266 | for choice_message in choice_messages: 267 | thinking_output_items.extend(convert_completion_messages_to_responses_items([choice_message])) 268 | 269 | # Step 5: Get all element descriptions and populate desc2xy mapping 270 | element_descriptions = get_all_element_descriptions(thinking_output_items) 271 | 272 | if element_descriptions and last_image_b64: 273 | # Use grounding model to predict coordinates for each description 274 | grounding_agent_conf = find_agent_config(grounding_model) 275 | if grounding_agent_conf: 276 | grounding_agent = grounding_agent_conf.agent_class() 277 | 278 | for desc in element_descriptions: 279 | for _ in range(3): # try 3 times 280 | coords = await grounding_agent.predict_click( 281 | model=grounding_model, 282 | image_b64=last_image_b64, 283 | instruction=desc 284 | ) 285 | if coords: 286 | self.desc2xy[desc] = coords 287 | break 288 | 289 | # Step 6: Convert computer calls from descriptions back to xy coordinates 290 | final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy) 291 | 292 | # Step 7: Return output and usage 293 | return { 294 | "output": pre_output_items + final_output_items, 295 | "usage": usage 296 | } 297 | 298 | async def predict_click( 299 | self, 300 | model: str, 301 | image_b64: str, 302 | instruction: str, 303 | **kwargs 
304 | ) -> Optional[Tuple[int, int]]: 305 | """ 306 | Predict click coordinates using the grounding model. 307 | 308 | For composed models, uses only the grounding model part for click prediction. 309 | """ 310 | # Parse the composed model to get grounding model 311 | if "+" not in model: 312 | raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}") 313 | grounding_model, thinking_model = model.split("+", 1) 314 | 315 | # Find and use the grounding agent 316 | grounding_agent_conf = find_agent_config(grounding_model) 317 | if grounding_agent_conf: 318 | grounding_agent = grounding_agent_conf.agent_class() 319 | return await grounding_agent.predict_click( 320 | model=grounding_model, 321 | image_b64=image_b64, 322 | instruction=instruction, 323 | **kwargs 324 | ) 325 | 326 | return None 327 | 328 | def get_capabilities(self) -> List[AgentCapability]: 329 | """Return the capabilities supported by this agent.""" 330 | return ["click", "step"] 331 | ``` -------------------------------------------------------------------------------- /.github/scripts/tests/test_get_pyproject_version.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Comprehensive tests for get_pyproject_version.py script using unittest. 3 | 4 | This test suite covers: 5 | - Version matching validation 6 | - Error handling for missing versions 7 | - Invalid input handling 8 | - File not found scenarios 9 | - Malformed TOML handling 10 | """ 11 | 12 | import sys 13 | import unittest 14 | import tempfile 15 | from pathlib import Path 16 | from io import StringIO 17 | from unittest.mock import patch 18 | 19 | # Add parent directory to path to import the module 20 | sys.path.insert(0, str(Path(__file__).parent.parent)) 21 | 22 | # Import after path is modified 23 | import get_pyproject_version 24 | 25 | 26 | class TestGetPyprojectVersion(unittest.TestCase): 27 | """Test suite for get_pyproject_version.py functionality.""" 28 | 29 | def setUp(self): 30 | """Reset sys.argv before each test.""" 31 | self.original_argv = sys.argv.copy() 32 | 33 | def tearDown(self): 34 | """Restore sys.argv after each test.""" 35 | sys.argv = self.original_argv 36 | 37 | def create_pyproject_toml(self, version: str) -> Path: 38 | """Helper to create a temporary pyproject.toml file with a given version.""" 39 | temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False) 40 | temp_file.write(f""" 41 | [project] 42 | name = "test-project" 43 | version = "{version}" 44 | description = "A test project" 45 | """) 46 | temp_file.close() 47 | return Path(temp_file.name) 48 | 49 | def create_pyproject_toml_no_version(self) -> Path: 50 | """Helper to create a pyproject.toml without a version field.""" 51 | temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False) 52 | temp_file.write(""" 53 | [project] 54 | name = "test-project" 55 | description = "A test project without version" 56 | """) 57 | temp_file.close() 58 | return Path(temp_file.name) 59 | 60 | def create_pyproject_toml_no_project(self) -> Path: 61 | """Helper to create a pyproject.toml without a project section.""" 62 | temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False) 63 | temp_file.write(""" 64 | [tool.poetry] 65 | name = "test-project" 66 | version = "1.0.0" 67 | """) 68 | temp_file.close() 69 | return Path(temp_file.name) 70 | 71 | def create_malformed_toml(self) -> Path: 72 | """Helper to create a malformed TOML file.""" 73 | 
temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False) 74 | temp_file.write(""" 75 | [project 76 | name = "test-project 77 | version = "1.0.0" 78 | """) 79 | temp_file.close() 80 | return Path(temp_file.name) 81 | 82 | # Test: Successful version match 83 | def test_matching_versions(self): 84 | """Test that matching versions result in success.""" 85 | pyproject_file = self.create_pyproject_toml("1.2.3") 86 | 87 | try: 88 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3'] 89 | 90 | # Capture stdout 91 | captured_output = StringIO() 92 | with patch('sys.stdout', captured_output): 93 | with self.assertRaises(SystemExit) as cm: 94 | get_pyproject_version.main() 95 | 96 | self.assertEqual(cm.exception.code, 0) 97 | self.assertIn("✅ Version consistency check passed: 1.2.3", captured_output.getvalue()) 98 | finally: 99 | pyproject_file.unlink() 100 | 101 | # Test: Version mismatch 102 | def test_version_mismatch(self): 103 | """Test that mismatched versions result in failure with appropriate error message.""" 104 | pyproject_file = self.create_pyproject_toml("1.2.3") 105 | 106 | try: 107 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.4'] 108 | 109 | # Capture stderr 110 | captured_error = StringIO() 111 | with patch('sys.stderr', captured_error): 112 | with self.assertRaises(SystemExit) as cm: 113 | get_pyproject_version.main() 114 | 115 | self.assertEqual(cm.exception.code, 1) 116 | error_output = captured_error.getvalue() 117 | self.assertIn("❌ Version mismatch detected!", error_output) 118 | self.assertIn("pyproject.toml version: 1.2.3", error_output) 119 | self.assertIn("Expected version: 1.2.4", error_output) 120 | self.assertIn("Please update pyproject.toml to version 1.2.4", error_output) 121 | finally: 122 | pyproject_file.unlink() 123 | 124 | # Test: Missing version in pyproject.toml 125 | def test_missing_version_field(self): 126 | """Test handling of pyproject.toml without a version field.""" 127 | pyproject_file = self.create_pyproject_toml_no_version() 128 | 129 | try: 130 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0'] 131 | 132 | captured_error = StringIO() 133 | with patch('sys.stderr', captured_error): 134 | with self.assertRaises(SystemExit) as cm: 135 | get_pyproject_version.main() 136 | 137 | self.assertEqual(cm.exception.code, 1) 138 | self.assertIn("❌ ERROR: No version found in pyproject.toml", captured_error.getvalue()) 139 | finally: 140 | pyproject_file.unlink() 141 | 142 | # Test: Missing project section 143 | def test_missing_project_section(self): 144 | """Test handling of pyproject.toml without a project section.""" 145 | pyproject_file = self.create_pyproject_toml_no_project() 146 | 147 | try: 148 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0'] 149 | 150 | captured_error = StringIO() 151 | with patch('sys.stderr', captured_error): 152 | with self.assertRaises(SystemExit) as cm: 153 | get_pyproject_version.main() 154 | 155 | self.assertEqual(cm.exception.code, 1) 156 | self.assertIn("❌ ERROR: No version found in pyproject.toml", captured_error.getvalue()) 157 | finally: 158 | pyproject_file.unlink() 159 | 160 | # Test: File not found 161 | def test_file_not_found(self): 162 | """Test handling of non-existent pyproject.toml file.""" 163 | sys.argv = ['get_pyproject_version.py', '/nonexistent/pyproject.toml', '1.0.0'] 164 | 165 | with self.assertRaises(SystemExit) as cm: 166 | get_pyproject_version.main() 167 | 168 | self.assertEqual(cm.exception.code, 1) 
169 | 170 | # Test: Malformed TOML 171 | def test_malformed_toml(self): 172 | """Test handling of malformed TOML file.""" 173 | pyproject_file = self.create_malformed_toml() 174 | 175 | try: 176 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0'] 177 | 178 | with self.assertRaises(SystemExit) as cm: 179 | get_pyproject_version.main() 180 | 181 | self.assertEqual(cm.exception.code, 1) 182 | finally: 183 | pyproject_file.unlink() 184 | 185 | # Test: Incorrect number of arguments - too few 186 | def test_too_few_arguments(self): 187 | """Test that providing too few arguments results in usage error.""" 188 | sys.argv = ['get_pyproject_version.py', 'pyproject.toml'] 189 | 190 | captured_error = StringIO() 191 | with patch('sys.stderr', captured_error): 192 | with self.assertRaises(SystemExit) as cm: 193 | get_pyproject_version.main() 194 | 195 | self.assertEqual(cm.exception.code, 1) 196 | self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>", 197 | captured_error.getvalue()) 198 | 199 | # Test: Incorrect number of arguments - too many 200 | def test_too_many_arguments(self): 201 | """Test that providing too many arguments results in usage error.""" 202 | sys.argv = ['get_pyproject_version.py', 'pyproject.toml', '1.0.0', 'extra'] 203 | 204 | captured_error = StringIO() 205 | with patch('sys.stderr', captured_error): 206 | with self.assertRaises(SystemExit) as cm: 207 | get_pyproject_version.main() 208 | 209 | self.assertEqual(cm.exception.code, 1) 210 | self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>", 211 | captured_error.getvalue()) 212 | 213 | # Test: No arguments 214 | def test_no_arguments(self): 215 | """Test that providing no arguments results in usage error.""" 216 | sys.argv = ['get_pyproject_version.py'] 217 | 218 | captured_error = StringIO() 219 | with patch('sys.stderr', captured_error): 220 | with self.assertRaises(SystemExit) as cm: 221 | get_pyproject_version.main() 222 | 223 | self.assertEqual(cm.exception.code, 1) 224 | self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>", 225 | captured_error.getvalue()) 226 | 227 | # Test: Version with pre-release tags 228 | def test_version_with_prerelease_tags(self): 229 | """Test matching versions with pre-release tags like alpha, beta, rc.""" 230 | pyproject_file = self.create_pyproject_toml("1.2.3-rc.1") 231 | 232 | try: 233 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3-rc.1'] 234 | 235 | captured_output = StringIO() 236 | with patch('sys.stdout', captured_output): 237 | with self.assertRaises(SystemExit) as cm: 238 | get_pyproject_version.main() 239 | 240 | self.assertEqual(cm.exception.code, 0) 241 | self.assertIn("✅ Version consistency check passed: 1.2.3-rc.1", captured_output.getvalue()) 242 | finally: 243 | pyproject_file.unlink() 244 | 245 | # Test: Version with build metadata 246 | def test_version_with_build_metadata(self): 247 | """Test matching versions with build metadata.""" 248 | pyproject_file = self.create_pyproject_toml("1.2.3+build.123") 249 | 250 | try: 251 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3+build.123'] 252 | 253 | captured_output = StringIO() 254 | with patch('sys.stdout', captured_output): 255 | with self.assertRaises(SystemExit) as cm: 256 | get_pyproject_version.main() 257 | 258 | self.assertEqual(cm.exception.code, 0) 259 | self.assertIn("✅ Version consistency check passed: 1.2.3+build.123", captured_output.getvalue()) 
260 | finally: 261 | pyproject_file.unlink() 262 | 263 | # Test: Various semantic version formats 264 | def test_semantic_version_0_0_1(self): 265 | """Test semantic version 0.0.1.""" 266 | self._test_version_format("0.0.1") 267 | 268 | def test_semantic_version_1_0_0(self): 269 | """Test semantic version 1.0.0.""" 270 | self._test_version_format("1.0.0") 271 | 272 | def test_semantic_version_10_20_30(self): 273 | """Test semantic version 10.20.30.""" 274 | self._test_version_format("10.20.30") 275 | 276 | def test_semantic_version_alpha(self): 277 | """Test semantic version with alpha tag.""" 278 | self._test_version_format("1.2.3-alpha") 279 | 280 | def test_semantic_version_beta(self): 281 | """Test semantic version with beta tag.""" 282 | self._test_version_format("1.2.3-beta.1") 283 | 284 | def test_semantic_version_rc_with_build(self): 285 | """Test semantic version with rc and build metadata.""" 286 | self._test_version_format("1.2.3-rc.1+build.456") 287 | 288 | def _test_version_format(self, version: str): 289 | """Helper method to test various semantic version formats.""" 290 | pyproject_file = self.create_pyproject_toml(version) 291 | 292 | try: 293 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), version] 294 | 295 | captured_output = StringIO() 296 | with patch('sys.stdout', captured_output): 297 | with self.assertRaises(SystemExit) as cm: 298 | get_pyproject_version.main() 299 | 300 | self.assertEqual(cm.exception.code, 0) 301 | self.assertIn(f"✅ Version consistency check passed: {version}", captured_output.getvalue()) 302 | finally: 303 | pyproject_file.unlink() 304 | 305 | # Test: Empty version string 306 | def test_empty_version_string(self): 307 | """Test handling of empty version string.""" 308 | pyproject_file = self.create_pyproject_toml("") 309 | 310 | try: 311 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0'] 312 | 313 | captured_error = StringIO() 314 | with patch('sys.stderr', captured_error): 315 | with self.assertRaises(SystemExit) as cm: 316 | get_pyproject_version.main() 317 | 318 | self.assertEqual(cm.exception.code, 1) 319 | # Empty string is falsy, so it should trigger error 320 | self.assertIn("❌", captured_error.getvalue()) 321 | finally: 322 | pyproject_file.unlink() 323 | 324 | 325 | class TestSuiteInfo(unittest.TestCase): 326 | """Test suite metadata.""" 327 | 328 | def test_suite_info(self): 329 | """Display test suite information.""" 330 | print("\n" + "="*70) 331 | print("Test Suite: get_pyproject_version.py") 332 | print("Framework: unittest (Python built-in)") 333 | print("TOML Library: tomllib (Python 3.11+ built-in)") 334 | print("="*70) 335 | self.assertTrue(True) 336 | 337 | 338 | if __name__ == '__main__': 339 | # Run tests with verbose output 340 | unittest.main(verbosity=2) 341 | ``` -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/watchdog.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Watchdog module for monitoring the Computer API server health. 3 | Unix/Linux only - provides process management and restart capabilities. 
4 | """ 5 | 6 | import asyncio 7 | import fcntl 8 | import json 9 | import logging 10 | import os 11 | import platform 12 | import subprocess 13 | import sys 14 | import time 15 | import websockets 16 | from typing import Optional 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def instance_already_running(label="watchdog"): 22 | """ 23 | Detect if an an instance with the label is already running, globally 24 | at the operating system level. 25 | 26 | Using `os.open` ensures that the file pointer won't be closed 27 | by Python's garbage collector after the function's scope is exited. 28 | 29 | The lock will be released when the program exits, or could be 30 | released if the file pointer were closed. 31 | """ 32 | 33 | lock_file_pointer = os.open(f"/tmp/instance_{label}.lock", os.O_WRONLY | os.O_CREAT) 34 | 35 | try: 36 | fcntl.lockf(lock_file_pointer, fcntl.LOCK_EX | fcntl.LOCK_NB) 37 | already_running = False 38 | except IOError: 39 | already_running = True 40 | 41 | return already_running 42 | 43 | 44 | class Watchdog: 45 | """Watchdog class to monitor server health via WebSocket connection. 46 | Unix/Linux only - provides restart capabilities. 47 | """ 48 | 49 | def __init__(self, cli_args: Optional[dict] = None, ping_interval: int = 30): 50 | """ 51 | Initialize the watchdog. 52 | 53 | Args: 54 | cli_args: Dictionary of CLI arguments to replicate when restarting 55 | ping_interval: Interval between ping checks in seconds 56 | """ 57 | # Check if running on Unix/Linux 58 | if platform.system() not in ['Linux', 'Darwin']: 59 | raise RuntimeError("Watchdog is only supported on Unix/Linux systems") 60 | 61 | # Store CLI arguments for restart 62 | self.cli_args = cli_args or {} 63 | self.host = self.cli_args.get('host', 'localhost') 64 | self.port = self.cli_args.get('port', 8000) 65 | self.ping_interval = ping_interval 66 | self.container_name = os.environ.get("CONTAINER_NAME") 67 | self.running = False 68 | self.restart_enabled = True 69 | 70 | @property 71 | def ws_uri(self) -> str: 72 | """Get the WebSocket URI using the current IP address. 73 | 74 | Returns: 75 | WebSocket URI for the Computer API Server 76 | """ 77 | ip_address = "localhost" if not self.container_name else f"{self.container_name}.containers.cloud.trycua.com" 78 | protocol = "wss" if self.container_name else "ws" 79 | port = "8443" if self.container_name else "8000" 80 | return f"{protocol}://{ip_address}:{port}/ws" 81 | 82 | async def ping(self) -> bool: 83 | """ 84 | Test connection to the WebSocket endpoint. 85 | 86 | Returns: 87 | True if connection successful, False otherwise 88 | """ 89 | try: 90 | # Create a simple ping message 91 | ping_message = { 92 | "command": "get_screen_size", 93 | "params": {} 94 | } 95 | 96 | # Try to connect to the WebSocket 97 | async with websockets.connect( 98 | self.ws_uri, 99 | max_size=1024 * 1024 * 10 # 10MB limit to match server 100 | ) as websocket: 101 | # Send ping message 102 | await websocket.send(json.dumps(ping_message)) 103 | 104 | # Wait for any response or just close 105 | try: 106 | response = await asyncio.wait_for(websocket.recv(), timeout=5) 107 | logger.debug(f"Ping response received: {response[:100]}...") 108 | return True 109 | except asyncio.TimeoutError: 110 | return False 111 | except Exception as e: 112 | logger.warning(f"Ping failed: {e}") 113 | return False 114 | 115 | def kill_processes_on_port(self, port: int) -> bool: 116 | """ 117 | Kill any processes using the specified port. 
118 | 119 | Args: 120 | port: Port number to check and kill processes on 121 | 122 | Returns: 123 | True if processes were killed or none found, False on error 124 | """ 125 | try: 126 | # Find processes using the port 127 | result = subprocess.run( 128 | ["lsof", "-ti", f":{port}"], 129 | capture_output=True, 130 | text=True, 131 | timeout=10 132 | ) 133 | 134 | if result.returncode == 0 and result.stdout.strip(): 135 | pids = result.stdout.strip().split('\n') 136 | logger.info(f"Found {len(pids)} processes using port {port}: {pids}") 137 | 138 | # Kill each process 139 | for pid in pids: 140 | if pid.strip(): 141 | try: 142 | subprocess.run(["kill", "-9", pid.strip()], timeout=5) 143 | logger.info(f"Killed process {pid}") 144 | except subprocess.TimeoutExpired: 145 | logger.warning(f"Timeout killing process {pid}") 146 | except Exception as e: 147 | logger.warning(f"Error killing process {pid}: {e}") 148 | 149 | return True 150 | else: 151 | logger.debug(f"No processes found using port {port}") 152 | return True 153 | 154 | except subprocess.TimeoutExpired: 155 | logger.error(f"Timeout finding processes on port {port}") 156 | return False 157 | except Exception as e: 158 | logger.error(f"Error finding processes on port {port}: {e}") 159 | return False 160 | 161 | def restart_server(self) -> bool: 162 | """ 163 | Attempt to restart the server by killing existing processes and starting new one. 164 | 165 | Returns: 166 | True if restart was attempted, False on error 167 | """ 168 | if not self.restart_enabled: 169 | logger.info("Server restart is disabled") 170 | return False 171 | 172 | try: 173 | logger.info("Attempting to restart server...") 174 | 175 | # Kill processes on the port 176 | port_to_kill = 8443 if self.container_name else self.port 177 | if not self.kill_processes_on_port(port_to_kill): 178 | logger.error("Failed to kill processes on port, restart aborted") 179 | return False 180 | 181 | # Wait a moment for processes to die 182 | time.sleep(2) 183 | 184 | # Try to restart the server 185 | # In container mode, we can't easily restart, so just log 186 | if self.container_name: 187 | logger.warning("Container mode detected - cannot restart server automatically") 188 | logger.warning("Container orchestrator should handle restart") 189 | return False 190 | else: 191 | # For local mode, try to restart the CLI 192 | logger.info("Attempting to restart local server...") 193 | 194 | # Get the current Python executable and script 195 | python_exe = sys.executable 196 | 197 | # Try to find the CLI module 198 | try: 199 | # Build command with all original CLI arguments 200 | cmd = [python_exe, "-m", "computer_server.cli"] 201 | 202 | # Add all CLI arguments except watchdog-related ones 203 | for key, value in self.cli_args.items(): 204 | if key in ['watchdog', 'watchdog_interval', 'no_restart']: 205 | continue # Skip watchdog args to avoid recursive watchdog 206 | 207 | # Convert underscores to hyphens for CLI args 208 | arg_name = f"--{key.replace('_', '-')}" 209 | 210 | if isinstance(value, bool): 211 | if value: # Only add flag if True 212 | cmd.append(arg_name) 213 | else: 214 | cmd.extend([arg_name, str(value)]) 215 | 216 | logger.info(f"Starting server with command: {' '.join(cmd)}") 217 | 218 | # Start process in background 219 | subprocess.Popen( 220 | cmd, 221 | stdout=subprocess.DEVNULL, 222 | stderr=subprocess.DEVNULL, 223 | start_new_session=True 224 | ) 225 | 226 | logger.info("Server restart initiated") 227 | return True 228 | 229 | except Exception as e: 230 | 
logger.error(f"Failed to restart server: {e}") 231 | return False 232 | 233 | except Exception as e: 234 | logger.error(f"Error during server restart: {e}") 235 | return False 236 | 237 | async def start_monitoring(self) -> None: 238 | """Start the watchdog monitoring loop.""" 239 | self.running = True 240 | logger.info(f"Starting watchdog monitoring for {self.ws_uri}") 241 | logger.info(f"Ping interval: {self.ping_interval} seconds") 242 | if self.container_name: 243 | logger.info(f"Container mode detected: {self.container_name}") 244 | 245 | consecutive_failures = 0 246 | max_failures = 3 247 | 248 | while self.running: 249 | try: 250 | success = await self.ping() 251 | 252 | if success: 253 | if consecutive_failures > 0: 254 | logger.info("Server connection restored") 255 | consecutive_failures = 0 256 | logger.debug("Ping successful") 257 | else: 258 | consecutive_failures += 1 259 | logger.warning(f"Ping failed ({consecutive_failures}/{max_failures})") 260 | 261 | if consecutive_failures >= max_failures: 262 | logger.error(f"Server appears to be down after {max_failures} consecutive failures") 263 | 264 | # Attempt to restart the server 265 | if self.restart_enabled: 266 | logger.info("Attempting automatic server restart...") 267 | restart_success = self.restart_server() 268 | 269 | if restart_success: 270 | logger.info("Server restart initiated, waiting before next ping...") 271 | # Wait longer after restart attempt 272 | await asyncio.sleep(self.ping_interval * 2) 273 | consecutive_failures = 0 # Reset counter after restart attempt 274 | else: 275 | logger.error("Server restart failed") 276 | else: 277 | logger.warning("Automatic restart is disabled") 278 | 279 | # Wait for next ping interval 280 | await asyncio.sleep(self.ping_interval) 281 | 282 | except asyncio.CancelledError: 283 | logger.info("Watchdog monitoring cancelled") 284 | break 285 | except Exception as e: 286 | logger.error(f"Unexpected error in watchdog loop: {e}") 287 | await asyncio.sleep(self.ping_interval) 288 | 289 | def stop_monitoring(self) -> None: 290 | """Stop the watchdog monitoring.""" 291 | self.running = False 292 | logger.info("Stopping watchdog monitoring") 293 | 294 | 295 | async def run_watchdog(cli_args: Optional[dict] = None, ping_interval: int = 30) -> None: 296 | """ 297 | Run the watchdog monitoring. 
298 | 299 | Args: 300 | cli_args: Dictionary of CLI arguments to replicate when restarting 301 | ping_interval: Interval between ping checks in seconds 302 | """ 303 | watchdog = Watchdog(cli_args=cli_args, ping_interval=ping_interval) 304 | 305 | try: 306 | await watchdog.start_monitoring() 307 | except KeyboardInterrupt: 308 | logger.info("Watchdog stopped by user") 309 | finally: 310 | watchdog.stop_monitoring() 311 | 312 | 313 | if __name__ == "__main__": 314 | # For testing the watchdog standalone 315 | import argparse 316 | 317 | parser = argparse.ArgumentParser(description="Run Computer API server watchdog") 318 | parser.add_argument("--host", default="localhost", help="Server host to monitor") 319 | parser.add_argument("--port", type=int, default=8000, help="Server port to monitor") 320 | parser.add_argument("--ping-interval", type=int, default=30, help="Ping interval in seconds") 321 | 322 | args = parser.parse_args() 323 | 324 | logging.basicConfig( 325 | level=logging.INFO, 326 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 327 | ) 328 | 329 | cli_args = { 330 | 'host': args.host, 331 | 'port': args.port 332 | } 333 | asyncio.run(run_watchdog(cli_args, args.ping_interval)) 334 | ```
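The watchdog module above exposes `run_watchdog` for standalone monitoring. Below is a minimal sketch of embedding it in another process, assuming the package is importable as `computer_server` (per the path `libs/python/computer-server/computer_server/watchdog.py`); the host, port, and interval values are illustrative, and the watchdog only runs on Linux/macOS per the platform check in `Watchdog.__init__`.

```python
import asyncio
import logging

# Assumed import path, derived from the file location shown above
from computer_server.watchdog import run_watchdog

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# cli_args mirrors the arguments the watchdog would reuse if it has to
# restart the local server; ping_interval controls how often the /ws
# endpoint is probed.
asyncio.run(
    run_watchdog(
        cli_args={"host": "localhost", "port": 8000},
        ping_interval=15,
    )
)
```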
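For `composed_grounded.py`, the model string is split on `+` and only the grounding half is used for click prediction. A sketch of calling `predict_click` directly follows, assuming the `agent` package is installed and the grounding model named below is registered locally; the screenshot path and element description are placeholders.

```python
import asyncio
import base64

# Assumed import path based on libs/python/agent/agent/loops/composed_grounded.py
from agent.loops.composed_grounded import ComposedGroundedConfig


async def locate_element() -> None:
    # predict_click expects a base64-encoded screenshot
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    config = ComposedGroundedConfig()
    coords = await config.predict_click(
        # grounding model + thinking model; only the part before "+" is used here
        model="huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro",
        image_b64=image_b64,
        instruction="red submit button",
    )
    print(coords)  # (x, y) tuple, or None if no grounding agent config matched


asyncio.run(locate_element())
```

In a full `predict_step` run, the same grounding model fills the `desc2xy` cache so element descriptions emitted by the thinking model can be mapped back to screen coordinates.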
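The test suite for `.github/scripts/get_pyproject_version.py` pins down the script's observable behaviour: argument count, `tomllib` parsing, exit codes, and the success and error messages. The following is a minimal reconstruction consistent with those assertions, for readers who only have this page; the shipped script may differ in wording and structure.

```python
import sys
import tomllib  # Python 3.11+ built-in, as noted in the test suite metadata


def main() -> None:
    # Usage check: exactly two positional arguments are expected
    if len(sys.argv) != 3:
        print(
            "Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
            file=sys.stderr,
        )
        sys.exit(1)

    pyproject_path, expected_version = sys.argv[1], sys.argv[2]

    try:
        with open(pyproject_path, "rb") as f:
            data = tomllib.load(f)
    except (OSError, tomllib.TOMLDecodeError) as exc:
        # Covers both missing files and malformed TOML
        print(f"❌ ERROR: Could not read {pyproject_path}: {exc}", file=sys.stderr)
        sys.exit(1)

    version = data.get("project", {}).get("version")
    if not version:
        print("❌ ERROR: No version found in pyproject.toml", file=sys.stderr)
        sys.exit(1)

    if version != expected_version:
        print("❌ Version mismatch detected!", file=sys.stderr)
        print(f"  pyproject.toml version: {version}", file=sys.stderr)
        print(f"  Expected version: {expected_version}", file=sys.stderr)
        print(f"  Please update pyproject.toml to version {expected_version}", file=sys.stderr)
        sys.exit(1)

    print(f"✅ Version consistency check passed: {version}")
    sys.exit(0)


if __name__ == "__main__":
    main()
```

Invoked as `python get_pyproject_version.py path/to/pyproject.toml 1.2.3` (the usage string asserted in the tests), a non-zero exit fails the CI job on any version drift.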