This is page 7 of 16. Use http://codebase.md/trycua/cua?page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── windows-sandbox.md 
├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── 
installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── 
som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ │ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift 
│ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py 
│ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ ├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ 
│ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ 
├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py │ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── 
pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /libs/kasm/src/ubuntu/install/firefox/install_firefox.sh: -------------------------------------------------------------------------------- ```bash #!/usr/bin/env bash set -xe # Add icon if [ -f /dockerstartup/install/ubuntu/install/firefox/firefox.desktop ]; then mv /dockerstartup/install/ubuntu/install/firefox/firefox.desktop $HOME/Desktop/ fi ARCH=$(arch | sed 's/aarch64/arm64/g' | sed 's/x86_64/amd64/g') set_desktop_icon() { sed -i -e 
's!Icon=.\+!Icon=/usr/share/icons/hicolor/48x48/apps/firefox.png!' "$HOME/Desktop/firefox.desktop" } echo "Install Firefox" if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|fedora39|fedora40) ]]; then dnf install -y firefox p11-kit elif [ "${DISTRO}" == "opensuse" ]; then zypper install -yn p11-kit-tools MozillaFirefox elif grep -q Jammy /etc/os-release || grep -q Noble /etc/os-release; then if [ ! -f '/etc/apt/preferences.d/mozilla-firefox' ]; then add-apt-repository -y ppa:mozillateam/ppa echo ' Package: * Pin: release o=LP-PPA-mozillateam Pin-Priority: 1001 ' > /etc/apt/preferences.d/mozilla-firefox fi apt-get install -y firefox p11-kit-modules elif grep -q "ID=kali" /etc/os-release; then apt-get update apt-get install -y firefox-esr p11-kit-modules rm -f $HOME/Desktop/firefox.desktop cp \ /usr/share/applications/firefox-esr.desktop \ $HOME/Desktop/ chmod +x $HOME/Desktop/firefox-esr.desktop elif grep -q "ID=debian" /etc/os-release || grep -q "ID=parrot" /etc/os-release; then if [ "${ARCH}" == "amd64" ]; then install -d -m 0755 /etc/apt/keyrings wget -q https://packages.mozilla.org/apt/repo-signing-key.gpg -O- > /etc/apt/keyrings/packages.mozilla.org.asc echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" > /etc/apt/sources.list.d/mozilla.list echo ' Package: * Pin: origin packages.mozilla.org Pin-Priority: 1000 ' > /etc/apt/preferences.d/mozilla apt-get update apt-get install -y firefox p11-kit-modules else apt-get update apt-get install -y firefox-esr p11-kit-modules rm -f $HOME/Desktop/firefox.desktop cp \ /usr/share/applications/firefox-esr.desktop \ $HOME/Desktop/ chmod +x $HOME/Desktop/firefox-esr.desktop fi else apt-mark unhold firefox || : apt-get remove firefox apt-get update apt-get install -y firefox p11-kit-modules fi # Add Langpacks FIREFOX_VERSION=$(curl -sI https://download.mozilla.org/?product=firefox-latest | awk -F '(releases/|/win32)' 
'/Location/ {print $2}') RELEASE_URL="https://releases.mozilla.org/pub/firefox/releases/${FIREFOX_VERSION}/win64/xpi/" LANGS=$(curl -Ls ${RELEASE_URL} | awk -F '(xpi">|</a>)' '/href.*xpi/ {print $2}' | tr '\n' ' ') EXTENSION_DIR=/usr/lib/firefox-addons/distribution/extensions/ mkdir -p ${EXTENSION_DIR} for LANG in ${LANGS}; do LANGCODE=$(echo ${LANG} | sed 's/\.xpi//g') echo "Downloading ${LANG} Language pack" curl -o \ ${EXTENSION_DIR}langpack-${LANGCODE}@firefox.mozilla.org.xpi -Ls \ ${RELEASE_URL}${LANG} done # Cleanup and install flash if supported if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|fedora39|fedora40) ]]; then if [ -z ${SKIP_CLEAN+x} ]; then dnf clean all fi elif [ "${DISTRO}" == "opensuse" ]; then if [ -z ${SKIP_CLEAN+x} ]; then zypper clean --all fi else if [ "$ARCH" == "arm64" ] && [ "$(lsb_release -cs)" == "focal" ] ; then echo "Firefox flash player not supported on arm64 Ubuntu Focal Skipping" elif grep -q "ID=debian" /etc/os-release || grep -q "ID=kali" /etc/os-release || grep -q "ID=parrot" /etc/os-release; then echo "Firefox flash player not supported on Debian" elif grep -q Focal /etc/os-release; then # Plugin to support running flash videos for sites like vimeo apt-get update apt-get install -y browser-plugin-freshplayer-pepperflash apt-mark hold firefox if [ -z ${SKIP_CLEAN+x} ]; then apt-get autoclean rm -rf \ /var/lib/apt/lists/* \ /var/tmp/* fi fi fi if [[ "${DISTRO}" != @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then # Update firefox to utilize the system certificate store instead of the one that ships with firefox if grep -q "ID=debian" /etc/os-release || grep -q "ID=kali" /etc/os-release || grep -q "ID=parrot" /etc/os-release && [ "${ARCH}" == "arm64" ]; then rm -f /usr/lib/firefox-esr/libnssckbi.so ln /usr/lib/$(arch)-linux-gnu/pkcs11/p11-kit-trust.so /usr/lib/firefox-esr/libnssckbi.so elif grep -q "ID=kali" /etc/os-release && [ 
"${ARCH}" == "amd64" ]; then rm -f /usr/lib/firefox-esr/libnssckbi.so ln /usr/lib/$(arch)-linux-gnu/pkcs11/p11-kit-trust.so /usr/lib/firefox-esr/libnssckbi.so else rm -f /usr/lib/firefox/libnssckbi.so ln /usr/lib/$(arch)-linux-gnu/pkcs11/p11-kit-trust.so /usr/lib/firefox/libnssckbi.so fi fi if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|fedora39|fedora40) ]]; then if [[ "${DISTRO}" == @(fedora39|fedora40) ]]; then preferences_file=/usr/lib64/firefox/browser/defaults/preferences/firefox-redhat-default-prefs.js else preferences_file=/usr/lib64/firefox/browser/defaults/preferences/all-redhat.js fi sed -i -e '/homepage/d' "$preferences_file" elif [ "${DISTRO}" == "opensuse" ]; then preferences_file=/usr/lib64/firefox/browser/defaults/preferences/firefox.js elif grep -q "ID=kali" /etc/os-release; then preferences_file=/usr/lib/firefox-esr/defaults/pref/firefox.js elif grep -q "ID=debian" /etc/os-release || grep -q "ID=parrot" /etc/os-release; then if [ "${ARCH}" == "amd64" ]; then preferences_file=/usr/lib/firefox/defaults/pref/firefox.js else preferences_file=/usr/lib/firefox-esr/defaults/pref/firefox.js fi else preferences_file=/usr/lib/firefox/browser/defaults/preferences/firefox.js fi # Disabling default first run URL for Debian based images if [[ "${DISTRO}" != @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then cat >"$preferences_file" <<EOF pref("datareporting.policy.firstRunURL", ""); pref("datareporting.policy.dataSubmissionEnabled", false); pref("datareporting.healthreport.service.enabled", false); pref("datareporting.healthreport.uploadEnabled", false); pref("trailhead.firstrun.branches", "nofirstrun-empty"); pref("browser.aboutwelcome.enabled", false); EOF fi if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then # Creating a default profile chown -R root:root $HOME firefox -headless 
-CreateProfile "kasm $HOME/.mozilla/firefox/kasm" # Generate a certdb to be detected on squid start HOME=/root firefox --headless & mkdir -p /root/.mozilla CERTDB=$(find /root/.mozilla* -name "cert9.db") while [ -z "${CERTDB}" ] ; do sleep 1 echo "waiting for certdb" CERTDB=$(find /root/.mozilla* -name "cert9.db") done sleep 2 kill $(pgrep firefox) CERTDIR=$(dirname ${CERTDB}) mv ${CERTDB} $HOME/.mozilla/firefox/kasm/ rm -Rf /root/.mozilla else # Creating Default Profile chown -R 0:0 $HOME firefox -headless -CreateProfile "kasm $HOME/.mozilla/firefox/kasm" fi # Silence Firefox security nag "Some of Firefox's features may offer less protection on your current operating system". echo 'user_pref("security.sandbox.warn_unprivileged_namespaces", false);' > $HOME/.mozilla/firefox/kasm/user.js chown 1000:1000 $HOME/.mozilla/firefox/kasm/user.js if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then set_desktop_icon fi # Starting with version 67, Firefox creates a unique profile mapping per installation which is hash generated # based off the installation path. 
Because that path will be static for our deployments we can assume the hash # and thus assign our profile to the default for the installation if grep -q "ID=kali" /etc/os-release; then cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL [Install3B6073811A6ABF12] Default=kasm Locked=1 EOL elif grep -q "ID=debian" /etc/os-release || grep -q "ID=parrot" /etc/os-release; then if [ "${ARCH}" != "amd64" ]; then cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL [Install3B6073811A6ABF12] Default=kasm Locked=1 EOL else cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL [Install4F96D1932A9F858E] Default=kasm Locked=1 EOL fi elif [[ "${DISTRO}" != @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL [Install4F96D1932A9F858E] Default=kasm Locked=1 EOL elif [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL [Install11457493C5A56847] Default=kasm Locked=1 EOL fi # Desktop Icon FIxes if [[ "${DISTRO}" == @(rockylinux9|oracle9|rhel9|almalinux9|fedora39|fedora40) ]]; then sed -i 's#Icon=/usr/lib/firefox#Icon=/usr/lib64/firefox#g' $HOME/Desktop/firefox.desktop fi # Cleanup for app layer chown -R 1000:0 $HOME find /usr/share/ -name "icon-theme.cache" -exec rm -f {} \; if [ -f $HOME/Desktop/firefox.desktop ]; then chmod +x $HOME/Desktop/firefox.desktop fi chown -R 1000:1000 $HOME/.mozilla ``` -------------------------------------------------------------------------------- /docs/content/docs/quickstart-devs.mdx: -------------------------------------------------------------------------------- ```markdown --- title: Quickstart description: Get started with Cua in three steps icon: Rocket --- import { Step, Steps } from 'fumadocs-ui/components/steps'; import { Tab, Tabs } from 'fumadocs-ui/components/tabs'; This quickstart guides you through setting up your [computer 
environment](#set-up-your-computer-environment), programmatic control with a [Cua computer](#using-computer), and task automation with a [Cua agent](#using-agent): <Steps> <Step> ## Set Up Your Computer Environment Choose how you want to run your Cua computer. This will be the environment where your automated tasks will execute. You can run your Cua computer in the cloud (recommended for easiest setup), locally on macOS with Lume, locally on Windows with a Windows Sandbox, or in a Docker container on any platform. Choose the option that matches your system and needs. <Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox']}> <Tab value="☁️ Cloud"> Cua Cloud Sandbox provides virtual machines that run Ubuntu. 1. Go to [trycua.com/signin](https://www.trycua.com/signin) 2. Navigate to **Dashboard > Containers > Create Instance** 3. Create a **Medium, Ubuntu 22** sandbox 4. Note your sandbox name and API key Your Cloud Sandbox will be automatically configured and ready to use. </Tab> <Tab value="🍎 Lume"> Lume containers are macOS virtual machines that run on a macOS host machine. 1. Install the Lume CLI: ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` 2. Start a local Cua sandbox: ```bash lume run macos-sequoia-cua:latest ``` </Tab> <Tab value="🪟 Windows Sandbox"> Windows Sandbox provides Windows virtual environments that run on a Windows host machine. 1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install) (requires Windows 10 Pro/Enterprise or Windows 11) 2. Install the `pywinsandbox` dependency: ```bash pip install -U git+https://github.com/karkason/pywinsandbox.git ``` 3. Windows Sandbox will be automatically configured when you run the CLI </Tab> <Tab value="🐳 Docker"> Docker provides a way to run Ubuntu containers on any host machine. 1. 
Install Docker Desktop or Docker Engine: 2. Pull the CUA Ubuntu sandbox: ```bash docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest ``` </Tab> </Tabs> </Step> <Step> ## Using Computer Connect to your Cua computer and perform basic interactions, such as taking screenshots or simulating user input. <Tabs items={['Python', 'TypeScript']}> <Tab value="Python"> Install the Cua computer Python SDK: ```bash pip install cua-computer ``` Then, connect to your desired computer environment: <Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox', '🖥️ Host Desktop']}> <Tab value="☁️ Cloud"> ```python from computer import Computer computer = Computer( os_type="linux", provider_type="cloud", name="your-sandbox-name", api_key="your-api-key" ) await computer.run() # Connect to the sandbox ``` </Tab> <Tab value="🍎 Lume"> ```python from computer import Computer computer = Computer( os_type="macos", provider_type="lume", name="macos-sequoia-cua:latest" ) await computer.run() # Launch & connect to the container ``` </Tab> <Tab value="🪟 Windows Sandbox"> ```python from computer import Computer computer = Computer( os_type="windows", provider_type="windows_sandbox" ) await computer.run() # Launch & connect to the container ``` </Tab> <Tab value="🐳 Docker"> ```python from computer import Computer computer = Computer( os_type="linux", provider_type="docker", name="trycua/cua-ubuntu:latest" ) await computer.run() # Launch & connect to the container ``` </Tab> <Tab value="🖥️ Host Desktop"> Install and run `cua-computer-server`: ```bash pip install cua-computer-server python -m computer_server ``` Then, use the `Computer` object to connect: ```python from computer import Computer computer = Computer(use_host_computer_server=True) await computer.run() # Connect to the host desktop ``` </Tab> </Tabs> Once connected, you can perform interactions: ```python try: # Take a screenshot of the computer's current display screenshot = await computer.interface.screenshot() # 
Simulate a left-click at coordinates (100, 100) await computer.interface.left_click(100, 100) # Type "Hello!" into the active application await computer.interface.type("Hello!") finally: await computer.close() ``` </Tab> <Tab value="TypeScript"> Install the Cua computer TypeScript SDK: ```bash npm install @trycua/computer ``` Then, connect to your desired computer environment: <Tabs items={['☁️ Cloud','🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox', '🖥️ Host Desktop']}> <Tab value="☁️ Cloud"> ```typescript import { Computer, OSType } from '@trycua/computer'; const computer = new Computer({ osType: OSType.LINUX, name: "your-sandbox-name", apiKey: "your-api-key" }); await computer.run(); // Connect to the sandbox ``` </Tab> <Tab value="🍎 Lume"> ```typescript import { Computer, OSType, ProviderType } from '@trycua/computer'; const computer = new Computer({ osType: OSType.MACOS, providerType: ProviderType.LUME, name: "macos-sequoia-cua:latest" }); await computer.run(); // Launch & connect to the container ``` </Tab> <Tab value="🪟 Windows Sandbox"> ```typescript import { Computer, OSType, ProviderType } from '@trycua/computer'; const computer = new Computer({ osType: OSType.WINDOWS, providerType: ProviderType.WINDOWS_SANDBOX }); await computer.run(); // Launch & connect to the container ``` </Tab> <Tab value="🐳 Docker"> ```typescript import { Computer, OSType, ProviderType } from '@trycua/computer'; const computer = new Computer({ osType: OSType.LINUX, providerType: ProviderType.DOCKER, name: "trycua/cua-ubuntu:latest" }); await computer.run(); // Launch & connect to the container ``` </Tab> <Tab value="🖥️ Host Desktop"> First, install and run `cua-computer-server`: ```bash pip install cua-computer-server python -m computer_server ``` Then, use the `Computer` object to connect: ```typescript import { Computer } from '@trycua/computer'; const computer = new Computer({ useHostComputerServer: true }); await computer.run(); // Connect to the host desktop ``` </Tab> </Tabs> Once 
connected, you can perform interactions: ```typescript try { // Take a screenshot of the computer's current display const screenshot = await computer.interface.screenshot(); // Simulate a left-click at coordinates (100, 100) await computer.interface.leftClick(100, 100); // Type "Hello!" into the active application await computer.interface.typeText("Hello!"); } finally { await computer.close(); } ``` </Tab> </Tabs> Learn more about computers in the [Cua computers documentation](/computer-sdk/computers). You will see how to automate computers with agents in the next step. </Step> <Step> ## Using Agent Utilize an Agent to automate complex tasks by providing it with a goal and allowing it to interact with the computer environment. Install the Cua agent Python SDK: ```bash pip install "cua-agent[all]" ``` Then, use the `ComputerAgent` object: ```python from agent import ComputerAgent agent = ComputerAgent( model="anthropic/claude-3-5-sonnet-20241022", tools=[computer], max_trajectory_budget=5.0 ) messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}] async for result in agent.run(messages): for item in result["output"]: if item["type"] == "message": print(item["content"][0]["text"]) ``` Learn more about agents in [Agent Loops](/agent-sdk/agent-loops) and available models in [Supported Models](/agent-sdk/supported-model-providers/). 
</Step> </Steps> ## Next Steps - Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands) - Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/) - Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help ``` -------------------------------------------------------------------------------- /libs/python/computer-server/test_connection.py: -------------------------------------------------------------------------------- ```python #!/usr/bin/env python """ Connection test script for Computer Server. This script tests both WebSocket (/ws) and REST (/cmd) connections to the Computer Server and keeps it alive, allowing you to verify the server is running correctly. """ import asyncio import json import websockets import argparse import sys import aiohttp import os import dotenv dotenv.load_dotenv() async def test_websocket_connection(host="localhost", port=8000, keep_alive=False, container_name=None, api_key=None): """Test WebSocket connection to the Computer Server.""" if container_name: # Container mode: use WSS with container domain and port 8443 uri = f"wss://{container_name}.containers.cloud.trycua.com:8443/ws" print(f"Connecting to container {container_name} at {uri}...") else: # Local mode: use WS with specified host and port uri = f"ws://{host}:{port}/ws" print(f"Connecting to local server at {uri}...") try: async with websockets.connect(uri) as websocket: print("WebSocket connection established!") # If container connection, send authentication first if container_name: if not api_key: print("Error: API key required for container connections") return False print("Sending authentication...") auth_message = { "command": "authenticate", "params": { "api_key": api_key, "container_name": container_name } } await websocket.send(json.dumps(auth_message)) auth_response = await websocket.recv() 
print(f"Authentication response: {auth_response}") # Check if authentication was successful auth_data = json.loads(auth_response) if not auth_data.get("success", False): print("Authentication failed!") return False print("Authentication successful!") # Send a test command to get version await websocket.send(json.dumps({"command": "version", "params": {}})) response = await websocket.recv() print(f"Version response: {response}") # Send a test command to get screen size await websocket.send(json.dumps({"command": "get_screen_size", "params": {}})) response = await websocket.recv() print(f"Screen size response: {response}") if keep_alive: print("\nKeeping WebSocket connection alive. Press Ctrl+C to exit...") while True: # Send a command every 5 seconds to keep the connection alive await asyncio.sleep(5) await websocket.send( json.dumps({"command": "get_cursor_position", "params": {}}) ) response = await websocket.recv() print(f"Cursor position: {response}") except websockets.exceptions.ConnectionClosed as e: print(f"WebSocket connection closed: {e}") return False except ConnectionRefusedError: print(f"Connection refused. 
Is the server running at {host}:{port}?") return False except Exception as e: print(f"WebSocket error: {e}") return False return True async def test_rest_connection(host="localhost", port=8000, keep_alive=False, container_name=None, api_key=None): """Test REST connection to the Computer Server.""" if container_name: # Container mode: use HTTPS with container domain and port 8443 base_url = f"https://{container_name}.containers.cloud.trycua.com:8443" print(f"Connecting to container {container_name} at {base_url}...") else: # Local mode: use HTTP with specified host and port base_url = f"http://{host}:{port}" print(f"Connecting to local server at {base_url}...") try: async with aiohttp.ClientSession() as session: print("REST connection established!") # Prepare headers for container authentication headers = {} if container_name: if not api_key: print("Error: API key required for container connections") return False headers["X-Container-Name"] = container_name headers["X-API-Key"] = api_key print(f"Using container authentication headers") # Test screenshot endpoint async with session.post( f"{base_url}/cmd", json={"command": "screenshot", "params": {}}, headers=headers ) as response: if response.status == 200: text = await response.text() print(f"Screenshot response: {text}") else: print(f"Screenshot request failed with status: {response.status}") print(await response.text()) return False # Test screen size endpoint async with session.post( f"{base_url}/cmd", json={"command": "get_screen_size", "params": {}}, headers=headers ) as response: if response.status == 200: text = await response.text() print(f"Screen size response: {text}") else: print(f"Screen size request failed with status: {response.status}") print(await response.text()) return False if keep_alive: print("\nKeeping REST connection alive. 
Press Ctrl+C to exit...") while True: # Send a command every 5 seconds to keep testing await asyncio.sleep(5) async with session.post( f"{base_url}/cmd", json={"command": "get_cursor_position", "params": {}}, headers=headers ) as response: if response.status == 200: text = await response.text() print(f"Cursor position: {text}") else: print(f"Cursor position request failed with status: {response.status}") print(await response.text()) return False except aiohttp.ClientError as e: print(f"REST connection error: {e}") return False except Exception as e: print(f"REST error: {e}") return False return True async def test_connection(host="localhost", port=8000, keep_alive=False, container_name=None, use_rest=False, api_key=None): """Test connection to the Computer Server using WebSocket or REST.""" if use_rest: return await test_rest_connection(host, port, keep_alive, container_name, api_key) else: return await test_websocket_connection(host, port, keep_alive, container_name, api_key) def parse_args(): parser = argparse.ArgumentParser(description="Test connection to Computer Server") parser.add_argument("--host", default="localhost", help="Host address (default: localhost)") parser.add_argument("-p", "--port", type=int, default=8000, help="Port number (default: 8000)") parser.add_argument("-c", "--container-name", help="Container name for cloud connection (uses WSS/HTTPS and port 8443)") parser.add_argument("--api-key", help="API key for container authentication (can also use CUA_API_KEY env var)") parser.add_argument("--keep-alive", action="store_true", help="Keep connection alive") parser.add_argument("--rest", action="store_true", help="Use REST endpoint (/cmd) instead of WebSocket (/ws)") return parser.parse_args() async def main(): args = parse_args() # Convert hyphenated argument to underscore for function parameter container_name = getattr(args, 'container_name', None) # Get API key from argument or environment variable api_key = getattr(args, 'api_key', None) or 
os.environ.get('CUA_API_KEY') # Check if container name is provided but API key is missing if container_name and not api_key: print("Warning: Container name provided but no API key found.") print("Please provide --api-key argument or set CUA_API_KEY environment variable.") return 1 print(f"Testing {'REST' if args.rest else 'WebSocket'} connection...") if container_name: print(f"Container: {container_name}") print(f"API Key: {'***' + api_key[-4:] if api_key and len(api_key) > 4 else 'Not provided'}") success = await test_connection( host=args.host, port=args.port, keep_alive=args.keep_alive, container_name=container_name, use_rest=args.rest, api_key=api_key ) return 0 if success else 1 if __name__ == "__main__": try: sys.exit(asyncio.run(main())) except KeyboardInterrupt: print("\nExiting...") sys.exit(0) ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/proxy/handlers.py: -------------------------------------------------------------------------------- ```python """ Request handlers for the proxy endpoints. """ import asyncio import json import logging import os from contextlib import contextmanager from typing import Dict, Any, List, Union, Optional from ..agent import ComputerAgent from computer import Computer logger = logging.getLogger(__name__) class ResponsesHandler: """Handler for /responses endpoint that processes agent requests.""" def __init__(self): self.computer = None self.agent = None # Simple in-memory caches self._computer_cache: Dict[str, Any] = {} self._agent_cache: Dict[str, Any] = {} async def setup_computer_agent( self, model: str, agent_kwargs: Optional[Dict[str, Any]] = None, computer_kwargs: Optional[Dict[str, Any]] = None, ): """Set up (and cache) computer and agent instances. 
        Caching keys:
        - Computer cache key: computer_kwargs
        - Agent cache key: {"model": model, **agent_kwargs}
        """
        agent_kwargs = agent_kwargs or {}
        computer_kwargs = computer_kwargs or {}

        def _stable_key(obj: Dict[str, Any]) -> str:
            # Deterministic JSON rendering of a kwargs dict, used as a cache key.
            try:
                return json.dumps(obj, sort_keys=True, separators=(",", ":"))
            except Exception:
                # Fallback: stringify non-serializable values
                safe_obj = {}
                for k, v in obj.items():
                    try:
                        json.dumps(v)
                        safe_obj[k] = v
                    except Exception:
                        safe_obj[k] = str(v)
                return json.dumps(safe_obj, sort_keys=True, separators=(",", ":"))

        # Determine if custom tools are supplied; if so, skip computer setup entirely
        has_custom_tools = bool(agent_kwargs.get("tools"))

        computer = None
        if not has_custom_tools:
            # ---------- Computer setup (with cache) ----------
            comp_key = _stable_key(computer_kwargs)
            computer = self._computer_cache.get(comp_key)
            if computer is None:
                # Default computer configuration
                default_c_config = {
                    "os_type": "linux",
                    "provider_type": "cloud",
                    "name": os.getenv("CUA_CONTAINER_NAME"),
                    "api_key": os.getenv("CUA_API_KEY"),
                }
                default_c_config.update(computer_kwargs)
                computer = Computer(**default_c_config)
                # Enter the Computer's async context here; the matching
                # __aexit__ is issued later in cleanup().
                await computer.__aenter__()
                self._computer_cache[comp_key] = computer
                logger.info(f"Computer created and cached with key={comp_key} config={default_c_config}")
            else:
                logger.info(f"Reusing cached computer for key={comp_key}")

        # Bind current computer reference (None if custom tools supplied)
        self.computer = computer

        # ---------- Agent setup (with cache) ----------
        # Build agent cache key from {model} + agent_kwargs (excluding tools unless explicitly passed)
        agent_kwargs_for_key = dict(agent_kwargs)
        agent_key_payload = {"model": model, **agent_kwargs_for_key}
        agent_key = _stable_key(agent_key_payload)
        agent = self._agent_cache.get(agent_key)
        if agent is None:
            # Default agent configuration
            default_a_config: Dict[str, Any] = {"model": model}
            if not has_custom_tools:
                default_a_config["tools"] = [computer]
            # Apply user overrides, but keep tools unless user explicitly sets
            if agent_kwargs:
                if not has_custom_tools:
                    agent_kwargs.setdefault("tools", [computer])
                default_a_config.update(agent_kwargs)
            # JSON-derived kwargs may have loose types; ignore static arg typing here
            agent = ComputerAgent(**default_a_config)  # type: ignore[arg-type]
            self._agent_cache[agent_key] = agent
            logger.info(f"Agent created and cached with key={agent_key} model={model}")
        else:
            # Ensure cached agent uses the current computer tool (in case object differs)
            # Only update if tools not explicitly provided in agent_kwargs
            if not has_custom_tools:
                try:
                    agent.tools = [computer]
                except Exception:
                    pass
            logger.info(f"Reusing cached agent for key={agent_key}")

        # Bind current agent reference
        self.agent = agent

    async def process_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a /responses request and return the result.

        Args:
            request_data: Dictionary containing model, input, and optional kwargs

        Returns:
            Dictionary with the agent's response. On success: {"success": True,
            "result": <first agent result>, "model": model}; on failure the
            error string is returned instead of raising.
        """
        try:
            # Extract request parameters
            model = request_data.get("model")
            input_data = request_data.get("input")
            agent_kwargs = request_data.get("agent_kwargs", {})
            computer_kwargs = request_data.get("computer_kwargs", {})
            env_overrides = request_data.get("env", {}) or {}

            if not model:
                raise ValueError("Model is required")
            if not input_data:
                raise ValueError("Input is required")

            # Apply env overrides for the duration of this request
            with self._env_overrides(env_overrides):
                # Set up (and possibly reuse) computer and agent via caches
                await self.setup_computer_agent(model, agent_kwargs, computer_kwargs)

                # Defensive: ensure agent is initialized for type checkers
                agent = self.agent
                if agent is None:
                    raise RuntimeError("Agent failed to initialize")

                # Convert input to messages format
                messages = self._convert_input_to_messages(input_data)

                # Run agent and get first result.
                # Only the first yielded result is returned to the caller.
                async for result in agent.run(messages):
                    # Return the first result and break
                    return {
                        "success": True,
                        "result": result,
                        "model": model
                    }

                # If no results were yielded
                return {
                    "success": False,
                    "error": "No results from agent",
                    "model": model
                }
        except Exception as e:
            logger.error(f"Error processing request: {e}")
            return {
                "success": False,
                "error": str(e),
                "model": request_data.get("model", "unknown")
            }

    def _convert_input_to_messages(self, input_data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """Convert input data to messages format."""
        if isinstance(input_data, str):
            # Simple string input
            return [{"role": "user", "content": input_data}]
        elif isinstance(input_data, list):
            # Already in messages format
            messages = []
            for msg in input_data:
                # Convert content array format if needed
                if isinstance(msg.get("content"), list):
                    content_parts = []
                    for part in msg["content"]:
                        if part.get("type") == "input_text":
                            content_parts.append({"type": "text", "text": part["text"]})
                        elif part.get("type") == "input_image":
                            content_parts.append({
                                "type": "image_url",
                                "image_url": {"url": part["image_url"]}
                            })
                        else:
                            # Pass through any other part types unchanged
                            content_parts.append(part)
                    messages.append({
                        "role": msg["role"],
                        "content": content_parts
                    })
                else:
                    messages.append(msg)
            return messages
        else:
            raise ValueError("Input must be string or list of messages")

    async def cleanup(self):
        """Clean up resources."""
        if self.computer:
            try:
                # Exit the async context entered in setup_computer_agent.
                await self.computer.__aexit__(None, None, None)
            except Exception as e:
                logger.error(f"Error cleaning up computer: {e}")
            finally:
                self.computer = None
                self.agent = None

    @staticmethod
    @contextmanager
    def _env_overrides(env: Dict[str, str]):
        """Temporarily apply environment variable overrides for the current process.

        Restores previous values after the context exits.

        Args:
            env: Mapping of env var names to override for this request.
""" if not env: # No-op context yield return original: Dict[str, Optional[str]] = {} try: for k, v in env.items(): original[k] = os.environ.get(k) os.environ[k] = str(v) yield finally: for k, old in original.items(): if old is None: # Was not set before os.environ.pop(k, None) else: os.environ[k] = old ``` -------------------------------------------------------------------------------- /.github/workflows/publish-lume.yml: -------------------------------------------------------------------------------- ```yaml name: Publish Notarized Lume on: push: tags: - "lume-v*" workflow_dispatch: inputs: version: description: "Version to notarize (without v prefix)" required: true default: "0.1.0" workflow_call: inputs: version: description: "Version to notarize" required: true type: string secrets: APPLICATION_CERT_BASE64: required: true INSTALLER_CERT_BASE64: required: true CERT_PASSWORD: required: true APPLE_ID: required: true TEAM_ID: required: true APP_SPECIFIC_PASSWORD: required: true DEVELOPER_NAME: required: true permissions: contents: write env: APPLICATION_CERT_BASE64: ${{ secrets.APPLICATION_CERT_BASE64 }} INSTALLER_CERT_BASE64: ${{ secrets.INSTALLER_CERT_BASE64 }} CERT_PASSWORD: ${{ secrets.CERT_PASSWORD }} APPLE_ID: ${{ secrets.APPLE_ID }} TEAM_ID: ${{ secrets.TEAM_ID }} APP_SPECIFIC_PASSWORD: ${{ secrets.APP_SPECIFIC_PASSWORD }} DEVELOPER_NAME: ${{ secrets.DEVELOPER_NAME }} jobs: notarize: runs-on: macos-15 outputs: sha256_checksums: ${{ steps.generate_checksums.outputs.checksums }} version: ${{ steps.set_version.outputs.version }} steps: - uses: actions/checkout@v4 - name: Select Xcode 16 run: | sudo xcode-select -s /Applications/Xcode_16.app xcodebuild -version - name: Install dependencies run: | brew install cpio - name: Create .release directory run: mkdir -p .release - name: Set version id: set_version run: | # Determine version from tag or input if [[ "$GITHUB_REF" == refs/tags/lume-v* ]]; then VERSION="${GITHUB_REF#refs/tags/lume-v}" echo "Using version 
from tag: $VERSION" elif [[ -n "${{ inputs.version }}" ]]; then VERSION="${{ inputs.version }}" echo "Using version from input: $VERSION" elif [[ -n "${{ inputs.version }}" ]]; then VERSION="${{ inputs.version }}" echo "Using version from workflow_call input: $VERSION" else echo "Error: No version found in tag or input" exit 1 fi # Update version in Main.swift echo "Updating version in Main.swift to $VERSION" sed -i '' "s/static let current: String = \".*\"/static let current: String = \"$VERSION\"/" libs/lume/src/Main.swift # Set output for later steps echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Import Certificates env: APPLICATION_CERT_BASE64: ${{ secrets.APPLICATION_CERT_BASE64 }} INSTALLER_CERT_BASE64: ${{ secrets.INSTALLER_CERT_BASE64 }} CERT_PASSWORD: ${{ secrets.CERT_PASSWORD }} KEYCHAIN_PASSWORD: "temp_password" run: | # Create a temporary keychain security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain security default-keychain -s build.keychain security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain security set-keychain-settings -t 3600 -l build.keychain # Import certificates echo $APPLICATION_CERT_BASE64 | base64 --decode > application.p12 echo $INSTALLER_CERT_BASE64 | base64 --decode > installer.p12 # Import certificates silently (minimize output) security import application.p12 -k build.keychain -P "$CERT_PASSWORD" -T /usr/bin/codesign -T /usr/bin/pkgbuild > /dev/null 2>&1 security import installer.p12 -k build.keychain -P "$CERT_PASSWORD" -T /usr/bin/codesign -T /usr/bin/pkgbuild > /dev/null 2>&1 # Allow codesign to access the certificates (minimal output) security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k "$KEYCHAIN_PASSWORD" build.keychain > /dev/null 2>&1 # Verify certificates were imported echo "Verifying signing identities..." 
CERT_COUNT=$(security find-identity -v -p codesigning build.keychain | grep -c "Developer ID Application" || echo "0") INSTALLER_COUNT=$(security find-identity -v build.keychain | grep -c "Developer ID Installer" || echo "0") if [ "$CERT_COUNT" -eq 0 ]; then echo "Error: No Developer ID Application certificate found" security find-identity -v -p codesigning build.keychain exit 1 fi if [ "$INSTALLER_COUNT" -eq 0 ]; then echo "Error: No Developer ID Installer certificate found" security find-identity -v build.keychain exit 1 fi echo "Found $CERT_COUNT Developer ID Application certificate(s) and $INSTALLER_COUNT Developer ID Installer certificate(s)" echo "All required certificates verified successfully" # Clean up certificate files rm application.p12 installer.p12 - name: Build and Notarize id: build_notarize env: APPLE_ID: ${{ secrets.APPLE_ID }} TEAM_ID: ${{ secrets.TEAM_ID }} APP_SPECIFIC_PASSWORD: ${{ secrets.APP_SPECIFIC_PASSWORD }} # These will now reference the imported certificates CERT_APPLICATION_NAME: "Developer ID Application: ${{ secrets.DEVELOPER_NAME }} (${{ secrets.TEAM_ID }})" CERT_INSTALLER_NAME: "Developer ID Installer: ${{ secrets.DEVELOPER_NAME }} (${{ secrets.TEAM_ID }})" VERSION: ${{ steps.set_version.outputs.version }} working-directory: ./libs/lume run: | # Minimal debug information echo "Starting build process..." echo "Swift version: $(swift --version | head -n 1)" echo "Building version: $VERSION" # Ensure .release directory exists mkdir -p .release chmod 755 .release # Build the project first (redirect verbose output) echo "Building project..." swift build --configuration release > build.log 2>&1 echo "Build completed." # Run the notarization script with LOG_LEVEL env var chmod +x scripts/build/build-release-notarized.sh cd scripts/build LOG_LEVEL=minimal ./build-release-notarized.sh # Return to the lume directory cd ../.. 
# Debug: List what files were actually created echo "Files in .release directory:" find .release -type f -name "*.tar.gz" -o -name "*.pkg.tar.gz" # Get architecture for output filename ARCH=$(uname -m) OS_IDENTIFIER="darwin-${ARCH}" # Output paths for later use echo "tarball_path=.release/lume-${VERSION}-${OS_IDENTIFIER}.tar.gz" >> $GITHUB_OUTPUT echo "pkg_path=.release/lume-${VERSION}-${OS_IDENTIFIER}.pkg.tar.gz" >> $GITHUB_OUTPUT - name: Generate SHA256 Checksums id: generate_checksums working-directory: ./libs/lume/.release run: | # Use existing checksums file if it exists, otherwise generate one if [ -f "checksums.txt" ]; then echo "Using existing checksums file" cat checksums.txt else echo "## SHA256 Checksums" > checksums.txt echo '```' >> checksums.txt shasum -a 256 lume-*.tar.gz >> checksums.txt echo '```' >> checksums.txt fi checksums=$(cat checksums.txt) echo "checksums<<EOF" >> $GITHUB_OUTPUT echo "$checksums" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT # Debug: Show all files in the release directory echo "All files in release directory:" ls -la - name: Create Standard Version Releases working-directory: ./libs/lume/.release run: | VERSION=${{ steps.set_version.outputs.version }} ARCH=$(uname -m) OS_IDENTIFIER="darwin-${ARCH}" # Create OS-tagged symlinks ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.tar.gz" "lume-darwin.tar.gz" ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.pkg.tar.gz" "lume-darwin.pkg.tar.gz" # Create simple symlinks ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.tar.gz" "lume.tar.gz" ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.pkg.tar.gz" "lume.pkg.tar.gz" # List all files (including symlinks) echo "Files with symlinks in release directory:" ls -la - name: Upload Notarized Package (Tarball) uses: actions/upload-artifact@v4 with: name: lume-notarized-tarball path: ./libs/lume/${{ steps.build_notarize.outputs.tarball_path }} if-no-files-found: error - name: Upload Notarized Package (Installer) uses: actions/upload-artifact@v4 with: name: 
lume-notarized-installer path: ./libs/lume/${{ steps.build_notarize.outputs.pkg_path }} if-no-files-found: error - name: Create Release if: startsWith(github.ref, 'refs/tags/lume-v') uses: softprops/action-gh-release@v1 with: files: | ./libs/lume/${{ steps.build_notarize.outputs.tarball_path }} ./libs/lume/${{ steps.build_notarize.outputs.pkg_path }} ./libs/lume/.release/lume-darwin.tar.gz ./libs/lume/.release/lume-darwin.pkg.tar.gz ./libs/lume/.release/lume.tar.gz ./libs/lume/.release/lume.pkg.tar.gz body: | ${{ steps.generate_checksums.outputs.checksums }} ### Installation with script /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` generate_release_notes: true make_latest: true ``` -------------------------------------------------------------------------------- /scripts/playground-docker.sh: -------------------------------------------------------------------------------- ```bash #!/bin/bash set -e # Colors for output GREEN='\033[0;32m' BLUE='\033[0;34m' RED='\033[0;31m' YELLOW='\033[1;33m' NC='\033[0m' # No Color # Print with color print_info() { echo -e "${BLUE}==> $1${NC}" } print_success() { echo -e "${GREEN}==> $1${NC}" } print_error() { echo -e "${RED}==> $1${NC}" } print_warning() { echo -e "${YELLOW}==> $1${NC}" } echo "🚀 Launching Cua Computer-Use Agent UI..." # Check if Docker is installed if ! command -v docker &> /dev/null; then print_error "Docker is not installed!" echo "" echo "To use Cua with Docker containers, you need to install Docker first:" echo "" echo "📦 Install Docker:" echo " • macOS: Download Docker Desktop from https://docker.com/products/docker-desktop" echo " • Windows: Download Docker Desktop from https://docker.com/products/docker-desktop" echo " • Linux: Follow instructions at https://docs.docker.com/engine/install/" echo "" echo "After installing Docker, run this script again." exit 1 fi # Check if Docker daemon is running if ! 
docker info &> /dev/null; then print_error "Docker is installed but not running!" echo "" echo "Please start Docker Desktop and try again." exit 1 fi print_success "Docker is installed and running!" # Save the original working directory ORIGINAL_DIR="$(pwd)" DEMO_DIR="$HOME/.cua" mkdir -p "$DEMO_DIR" # Check if we're already in the cua repository # Look for the specific trycua identifier in pyproject.toml if [[ -f "pyproject.toml" ]] && grep -q "[email protected]" "pyproject.toml"; then print_success "Already in Cua repository - using current directory" REPO_DIR="$ORIGINAL_DIR" USE_EXISTING_REPO=true else # Directories used by the script when not in repo REPO_DIR="$DEMO_DIR/cua" USE_EXISTING_REPO=false fi # Function to clean up on exit cleanup() { cd "$ORIGINAL_DIR" 2>/dev/null || true } trap cleanup EXIT echo "" echo "Choose your Cua setup:" echo "1) ☁️ Cua Cloud Sandbox (works on any system)" echo "2) 🖥️ Local macOS VMs (requires Apple Silicon Mac + macOS 15+)" echo "3) 🖥️ Local Windows VMs (requires Windows 10 / 11)" echo "" read -p "Enter your choice (1, 2, or 3): " CHOICE if [[ "$CHOICE" == "1" ]]; then # Cua Cloud Sandbox setup echo "" print_info "Setting up Cua Cloud Sandbox..." 
echo ""

# Check if existing .env.local already has CUA_API_KEY
REPO_ENV_FILE="$REPO_DIR/.env.local"
CURRENT_ENV_FILE="$ORIGINAL_DIR/.env.local"
CUA_API_KEY=""

# First check current directory
# (xargs trims surrounding whitespace from the extracted value)
if [[ -f "$CURRENT_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$CURRENT_ENV_FILE"; then
    EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$CURRENT_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs)
    # Ignore empty values and the unfilled placeholder
    if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then
        CUA_API_KEY="$EXISTING_CUA_KEY"
    fi
fi

# Then check repo directory if not found in current dir
if [[ -z "$CUA_API_KEY" ]] && [[ -f "$REPO_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$REPO_ENV_FILE"; then
    EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$REPO_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs)
    if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then
        CUA_API_KEY="$EXISTING_CUA_KEY"
    fi
fi

# If no valid API key found, prompt for one
if [[ -z "$CUA_API_KEY" ]]; then
    echo "To use Cua Cloud Sandbox, you need to:"
    echo "1. Sign up at https://trycua.com"
    echo "2. Create a Cloud Sandbox"
    echo "3. Generate an Api Key"
    echo ""
    read -p "Enter your Cua Api Key: " CUA_API_KEY
    if [[ -z "$CUA_API_KEY" ]]; then
        print_error "Cua Api Key is required for Cloud Sandbox."
        exit 1
    fi
else
    print_success "Found existing CUA API key"
fi

USE_CLOUD=true
COMPUTER_TYPE="cloud"
elif [[ "$CHOICE" == "2" ]]; then
    # Local macOS VM setup
    echo ""
    print_info "Setting up local macOS VMs..."

    # Check for Apple Silicon Mac
    if [[ $(uname -s) != "Darwin" || $(uname -m) != "arm64" ]]; then
        print_error "Local macOS VMs require an Apple Silicon Mac (M1/M2/M3/M4)."
        echo "💡 Consider using Cua Cloud Sandbox instead (option 1)."
exit 1 fi # Check for macOS 15 (Sequoia) or newer OSVERSION=$(sw_vers -productVersion) if [[ $(echo "$OSVERSION 15.0" | tr " " "\n" | sort -V | head -n 1) != "15.0" ]]; then print_error "Local macOS VMs require macOS 15 (Sequoia) or newer. You have $OSVERSION." echo "💡 Consider using Cua Cloud Sandbox instead (option 1)." exit 1 fi USE_CLOUD=false COMPUTER_TYPE="macos" elif [[ "$CHOICE" == "3" ]]; then # Local Windows VM setup echo "" print_info "Setting up local Windows VMs..." # Check if we're on Windows if [[ $(uname -s) != MINGW* && $(uname -s) != CYGWIN* && $(uname -s) != MSYS* ]]; then print_error "Local Windows VMs require Windows 10 or 11." echo "💡 Consider using Cua Cloud Sandbox instead (option 1)." echo "" echo "🔗 If you are using WSL, refer to the blog post to get started: https://www.trycua.com/blog/windows-sandbox" exit 1 fi USE_CLOUD=false COMPUTER_TYPE="windows" else print_error "Invalid choice. Please run the script again and choose 1, 2, or 3." exit 1 fi print_success "All checks passed! 🎉" # Create demo directory and handle repository if [[ "$USE_EXISTING_REPO" == "true" ]]; then print_info "Using existing repository in current directory" cd "$REPO_DIR" else # Clone or update the repository if [[ ! -d "$REPO_DIR" ]]; then print_info "Cloning Cua repository..." cd "$DEMO_DIR" git clone https://github.com/trycua/cua.git else print_info "Updating Cua repository..." cd "$REPO_DIR" git pull origin main fi cd "$REPO_DIR" fi # Create .env.local file with API keys ENV_FILE="$REPO_DIR/.env.local" if [[ ! 
-f "$ENV_FILE" ]]; then cat > "$ENV_FILE" << EOF # Uncomment and add your API keys here # OPENAI_API_KEY=your_openai_api_key_here # ANTHROPIC_API_KEY=your_anthropic_api_key_here CUA_API_KEY=your_cua_api_key_here EOF print_success "Created .env.local file with API key placeholders" else print_success "Found existing .env.local file - keeping your current settings" fi if [[ "$USE_CLOUD" == "true" ]]; then # Add CUA API key to .env.local if not already present if ! grep -q "CUA_API_KEY" "$ENV_FILE"; then echo "CUA_API_KEY=$CUA_API_KEY" >> "$ENV_FILE" print_success "Added CUA_API_KEY to .env.local" elif grep -q "CUA_API_KEY=your_cua_api_key_here" "$ENV_FILE"; then # Update placeholder with actual key sed -i.bak "s/CUA_API_KEY=your_cua_api_key_here/CUA_API_KEY=$CUA_API_KEY/" "$ENV_FILE" print_success "Updated CUA_API_KEY in .env.local" fi fi # Build the Docker image if it doesn't exist print_info "Checking Docker image..." if ! docker image inspect cua-dev-image &> /dev/null; then print_info "Building Docker image (this may take a while)..." ./scripts/run-docker-dev.sh build else print_success "Docker image already exists" fi # Install Lume if needed for local VMs if [[ "$USE_CLOUD" == "false" && "$COMPUTER_TYPE" == "macos" ]]; then if ! command -v lume &> /dev/null; then print_info "Installing Lume CLI..." curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh | bash # Add lume to PATH for this session if it's not already there if ! command -v lume &> /dev/null; then export PATH="$PATH:$HOME/.local/bin" fi fi # Pull the macOS CUA image if not already present if ! lume ls | grep -q "macos-sequoia-cua"; then # Check available disk space IMAGE_SIZE_GB=30 AVAILABLE_SPACE_KB=$(df -k $HOME | tail -1 | awk '{print $4}') AVAILABLE_SPACE_GB=$(($AVAILABLE_SPACE_KB / 1024 / 1024)) echo "📊 The macOS CUA image will use approximately ${IMAGE_SIZE_GB}GB of disk space." echo " You currently have ${AVAILABLE_SPACE_GB}GB available on your system." 
# Prompt for confirmation read -p " Continue? [y]/n: " CONTINUE CONTINUE=${CONTINUE:-y} if [[ $CONTINUE =~ ^[Yy]$ ]]; then print_info "Pulling macOS CUA image (this may take a while)..." # Use caffeinate on macOS to prevent system sleep during the pull if command -v caffeinate &> /dev/null; then print_info "Using caffeinate to prevent system sleep during download..." caffeinate -i lume pull macos-sequoia-cua:latest else lume pull macos-sequoia-cua:latest fi else print_error "Installation cancelled." exit 1 fi fi # Check if the VM is running print_info "Checking if the macOS CUA VM is running..." VM_RUNNING=$(lume ls | grep "macos-sequoia-cua" | grep "running" || echo "") if [ -z "$VM_RUNNING" ]; then print_info "Starting the macOS CUA VM in the background..." lume run macos-sequoia-cua:latest & # Wait a moment for the VM to initialize sleep 5 print_success "VM started successfully." else print_success "macOS CUA VM is already running." fi fi # Create a convenience script to run the demo cat > "$DEMO_DIR/start_ui.sh" << EOF #!/bin/bash cd "$REPO_DIR" ./scripts/run-docker-dev.sh run agent_ui_examples.py EOF chmod +x "$DEMO_DIR/start_ui.sh" print_success "Setup complete!" if [[ "$USE_CLOUD" == "true" ]]; then echo "☁️ Cua Cloud Sandbox setup complete!" else echo "🖥️ Cua Local VM setup complete!" fi echo "📝 Edit $ENV_FILE to update your API keys" echo "🖥️ Start the playground by running: $DEMO_DIR/start_ui.sh" # Start the demo automatically echo print_info "Starting the Cua Computer-Use Agent UI..." 
echo "" print_success "Cua Computer-Use Agent UI is now running at http://localhost:7860/" echo echo "🌐 Open your browser and go to: http://localhost:7860/" echo "$DEMO_DIR/start_ui.sh" ``` -------------------------------------------------------------------------------- /tests/test_mcp_server_streaming.py: -------------------------------------------------------------------------------- ```python import asyncio import importlib.util import sys import types from pathlib import Path import pytest def _install_stub_module(name: str, module: types.ModuleType, registry: dict[str, types.ModuleType | None]) -> None: registry[name] = sys.modules.get(name) sys.modules[name] = module @pytest.fixture def server_module(): stubbed_modules: dict[str, types.ModuleType | None] = {} # Stub MCP Context primitives mcp_module = types.ModuleType("mcp") mcp_module.__path__ = [] # mark as package mcp_server_module = types.ModuleType("mcp.server") mcp_server_module.__path__ = [] fastmcp_module = types.ModuleType("mcp.server.fastmcp") class _StubContext: async def yield_message(self, *args, **kwargs): return None async def yield_tool_call(self, *args, **kwargs): return None async def yield_tool_output(self, *args, **kwargs): return None def report_progress(self, *_args, **_kwargs): return None def info(self, *_args, **_kwargs): return None def error(self, *_args, **_kwargs): return None class _StubImage: def __init__(self, format: str, data: bytes): self.format = format self.data = data class _StubFastMCP: def __init__(self, name: str): self.name = name self._tools: dict[str, types.FunctionType] = {} def tool(self, *args, **kwargs): def decorator(func): self._tools[func.__name__] = func return func return decorator def run(self): return None fastmcp_module.Context = _StubContext fastmcp_module.FastMCP = _StubFastMCP fastmcp_module.Image = _StubImage _install_stub_module("mcp", mcp_module, stubbed_modules) _install_stub_module("mcp.server", mcp_server_module, stubbed_modules) 
_install_stub_module("mcp.server.fastmcp", fastmcp_module, stubbed_modules) # Stub Computer module to avoid heavy dependencies computer_module = types.ModuleType("computer") class _StubInterface: async def screenshot(self) -> bytes: # pragma: no cover - default stub return b"" class _StubComputer: def __init__(self, *args, **kwargs): self.interface = _StubInterface() async def run(self): # pragma: no cover - default stub return None class _StubVMProviderType: CLOUD = "cloud" LOCAL = "local" computer_module.Computer = _StubComputer computer_module.VMProviderType = _StubVMProviderType _install_stub_module("computer", computer_module, stubbed_modules) # Stub agent module so server can import ComputerAgent agent_module = types.ModuleType("agent") class _StubComputerAgent: def __init__(self, *args, **kwargs): pass async def run(self, *_args, **_kwargs): # pragma: no cover - default stub if False: # pragma: no cover yield {} return agent_module.ComputerAgent = _StubComputerAgent _install_stub_module("agent", agent_module, stubbed_modules) module_name = "mcp_server_server_under_test" module_path = Path("libs/python/mcp-server/mcp_server/server.py").resolve() spec = importlib.util.spec_from_file_location(module_name, module_path) server_module = importlib.util.module_from_spec(spec) assert spec and spec.loader spec.loader.exec_module(server_module) server_instance = getattr(server_module, "server", None) if server_instance is not None and hasattr(server_instance, "_tools"): for name, func in server_instance._tools.items(): setattr(server_module, name, func) try: yield server_module finally: sys.modules.pop(module_name, None) for name, original in stubbed_modules.items(): if original is None: sys.modules.pop(name, None) else: sys.modules[name] = original class FakeContext: def __init__(self) -> None: self.events: list[tuple] = [] self.progress_updates: list[float] = [] def info(self, message: str) -> None: self.events.append(("info", message)) def error(self, message: str) 
-> None: self.events.append(("error", message)) def report_progress(self, value: float) -> None: self.progress_updates.append(value) async def yield_message(self, *, role: str, content): timestamp = asyncio.get_running_loop().time() self.events.append(("message", role, content, timestamp)) async def yield_tool_call(self, *, name: str | None, call_id: str, input): timestamp = asyncio.get_running_loop().time() self.events.append(("tool_call", name, call_id, input, timestamp)) async def yield_tool_output(self, *, call_id: str, output, is_error: bool = False): timestamp = asyncio.get_running_loop().time() self.events.append(("tool_output", call_id, output, is_error, timestamp)) def test_run_cua_task_streams_partial_results(server_module): async def _run_test(): class FakeAgent: script = [] def __init__(self, *args, **kwargs): pass async def run(self, messages): # type: ignore[override] for factory, delay in type(self).script: yield factory(messages) if delay: await asyncio.sleep(delay) FakeAgent.script = [ ( lambda _messages: { "output": [ { "type": "message", "role": "assistant", "content": [ {"type": "output_text", "text": "First chunk"} ], } ] }, 0.0, ), ( lambda _messages: { "output": [ { "type": "tool_use", "id": "call_1", "name": "computer", "input": {"action": "click"}, }, { "type": "computer_call_output", "call_id": "call_1", "output": [ {"type": "text", "text": "Tool completed"} ], }, ] }, 0.05, ), ] class FakeInterface: def __init__(self) -> None: self.calls = 0 async def screenshot(self) -> bytes: self.calls += 1 return b"final-image" fake_interface = FakeInterface() server_module.global_computer = types.SimpleNamespace(interface=fake_interface) server_module.ComputerAgent = FakeAgent # type: ignore[assignment] ctx = FakeContext() task = asyncio.create_task(server_module.run_cua_task(ctx, "open settings")) await asyncio.sleep(0.01) assert not task.done(), "Task should still be running to simulate long operation" message_events = [event for event in 
ctx.events if event[0] == "message"] assert message_events, "Expected message event before task completion" text_result, image = await task assert "First chunk" in text_result assert "Tool completed" in text_result assert image.data == b"final-image" assert fake_interface.calls == 1 tool_call_events = [event for event in ctx.events if event[0] == "tool_call"] tool_output_events = [event for event in ctx.events if event[0] == "tool_output"] assert tool_call_events and tool_output_events assert tool_call_events[0][2] == "call_1" assert tool_output_events[0][1] == "call_1" asyncio.run(_run_test()) def test_run_multi_cua_tasks_reports_progress(server_module, monkeypatch): async def _run_test(): class FakeAgent: script = [] def __init__(self, *args, **kwargs): pass async def run(self, messages): # type: ignore[override] for factory, delay in type(self).script: yield factory(messages) if delay: await asyncio.sleep(delay) FakeAgent.script = [ ( lambda messages: { "output": [ { "type": "message", "role": "assistant", "content": [ { "type": "output_text", "text": f"Result for {messages[0].get('content')}", } ], } ] }, 0.0, ) ] server_module.ComputerAgent = FakeAgent # type: ignore[assignment] class FakeInterface: async def screenshot(self) -> bytes: return b"progress-image" server_module.global_computer = types.SimpleNamespace(interface=FakeInterface()) ctx = FakeContext() results = await server_module.run_multi_cua_tasks(ctx, ["a", "b", "c"]) assert len(results) == 3 assert results[0][0] == "Result for a" assert ctx.progress_updates[0] == pytest.approx(0.0) assert ctx.progress_updates[-1] == pytest.approx(1.0) assert len(ctx.progress_updates) == 6 asyncio.run(_run_test()) ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/cloud/provider.py: -------------------------------------------------------------------------------- ```python """Cloud VM provider implementation using CUA Public API. 
Implements the following public API endpoints: - GET /v1/vms - POST /v1/vms/:name/start - POST /v1/vms/:name/stop - POST /v1/vms/:name/restart """ import logging from typing import Dict, List, Optional, Any from ..base import BaseVMProvider, VMProviderType from ..types import ListVMsResponse, MinimalVM # Setup logging logger = logging.getLogger(__name__) import asyncio import aiohttp from urllib.parse import urlparse import os DEFAULT_API_BASE = os.getenv("CUA_API_BASE", "https://api.cua.ai") class CloudProvider(BaseVMProvider): """Cloud VM Provider implementation.""" def __init__( self, api_key: str, verbose: bool = False, api_base: Optional[str] = None, **kwargs, ): """ Args: api_key: API key for authentication name: Name of the VM verbose: Enable verbose logging """ assert api_key, "api_key required for CloudProvider" self.api_key = api_key self.verbose = verbose self.api_base = (api_base or DEFAULT_API_BASE).rstrip("/") @property def provider_type(self) -> VMProviderType: return VMProviderType.CLOUD async def __aenter__(self): return self async def __aexit__(self, exc_type, exc_val, exc_tb): pass async def get_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: """Get VM information by querying the VM status endpoint. 
- Build hostname via get_ip(name) → "{name}.containers.cloud.trycua.com" - Probe https://{hostname}:8443/status with a short timeout - If JSON contains a "status" field, return it; otherwise infer - Fallback to DNS resolve check to distinguish unknown vs not_found """ hostname = await self.get_ip(name=name) # Try HTTPS probe to the computer-server status endpoint (8443) try: timeout = aiohttp.ClientTimeout(total=3) async with aiohttp.ClientSession(timeout=timeout) as session: url = f"https://{hostname}:8443/status" async with session.get(url, allow_redirects=False) as resp: status_code = resp.status vm_status: str vm_os_type: Optional[str] = None if status_code == 200: try: data = await resp.json(content_type=None) vm_status = str(data.get("status", "ok")) vm_os_type = str(data.get("os_type")) except Exception: vm_status = "unknown" elif status_code < 500: vm_status = "unknown" else: vm_status = "unknown" return { "name": name, "status": "running" if vm_status == "ok" else vm_status, "api_url": f"https://{hostname}:8443", "os_type": vm_os_type, } except Exception: return {"name": name, "status": "not_found", "api_url": f"https://{hostname}:8443"} async def list_vms(self) -> ListVMsResponse: url = f"{self.api_base}/v1/vms" headers = { "Authorization": f"Bearer {self.api_key}", "Accept": "application/json", } async with aiohttp.ClientSession() as session: async with session.get(url, headers=headers) as resp: if resp.status == 200: try: data = await resp.json(content_type=None) except Exception: text = await resp.text() logger.error(f"Failed to parse list_vms JSON: {text}") return [] if isinstance(data, list): # Enrich with convenience URLs when possible. 
enriched: List[Dict[str, Any]] = [] for item in data: vm = dict(item) if isinstance(item, dict) else {} name = vm.get("name") password = vm.get("password") if isinstance(name, str) and name: host = f"{name}.containers.cloud.trycua.com" # api_url: always set if missing if not vm.get("api_url"): vm["api_url"] = f"https://{host}:8443" # vnc_url: only when password available if not vm.get("vnc_url") and isinstance(password, str) and password: vm[ "vnc_url" ] = f"https://{host}/vnc.html?autoconnect=true&password={password}" enriched.append(vm) return enriched # type: ignore[return-value] logger.warning("Unexpected response for list_vms; expected list") return [] elif resp.status == 401: logger.error("Unauthorized: invalid CUA API key for list_vms") return [] else: text = await resp.text() logger.error(f"list_vms failed: HTTP {resp.status} - {text}") return [] async def run_vm(self, name: str, image: Optional[str] = None, run_opts: Optional[Dict[str, Any]] = None, storage: Optional[str] = None) -> Dict[str, Any]: """Start a VM via public API. 
Returns a minimal status.""" url = f"{self.api_base}/v1/vms/{name}/start" headers = { "Authorization": f"Bearer {self.api_key}", "Accept": "application/json", } async with aiohttp.ClientSession() as session: async with session.post(url, headers=headers) as resp: if resp.status in (200, 201, 202, 204): return {"name": name, "status": "starting"} elif resp.status == 404: return {"name": name, "status": "not_found"} elif resp.status == 401: return {"name": name, "status": "unauthorized"} else: text = await resp.text() return {"name": name, "status": "error", "message": text} async def stop_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: """Stop a VM via public API.""" url = f"{self.api_base}/v1/vms/{name}/stop" headers = { "Authorization": f"Bearer {self.api_key}", "Accept": "application/json", } async with aiohttp.ClientSession() as session: async with session.post(url, headers=headers) as resp: if resp.status in (200, 202): # Spec says 202 with {"status":"stopping"} body_status: Optional[str] = None try: data = await resp.json(content_type=None) body_status = data.get("status") if isinstance(data, dict) else None except Exception: body_status = None return {"name": name, "status": body_status or "stopping"} elif resp.status == 404: return {"name": name, "status": "not_found"} elif resp.status == 401: return {"name": name, "status": "unauthorized"} else: text = await resp.text() return {"name": name, "status": "error", "message": text} async def restart_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: """Restart a VM via public API.""" url = f"{self.api_base}/v1/vms/{name}/restart" headers = { "Authorization": f"Bearer {self.api_key}", "Accept": "application/json", } async with aiohttp.ClientSession() as session: async with session.post(url, headers=headers) as resp: if resp.status in (200, 202): # Spec says 202 with {"status":"restarting"} body_status: Optional[str] = None try: data = await resp.json(content_type=None) 
body_status = data.get("status") if isinstance(data, dict) else None except Exception: body_status = None return {"name": name, "status": body_status or "restarting"} elif resp.status == 404: return {"name": name, "status": "not_found"} elif resp.status == 401: return {"name": name, "status": "unauthorized"} else: text = await resp.text() return {"name": name, "status": "error", "message": text} async def update_vm(self, name: str, update_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]: logger.warning("CloudProvider.update_vm is not implemented via public API") return {"name": name, "status": "unchanged", "message": "update_vm not supported by public API"} async def get_ip(self, name: Optional[str] = None, storage: Optional[str] = None, retry_delay: int = 2) -> str: """ Return the VM's IP address as '{container_name}.containers.cloud.trycua.com'. Uses the provided 'name' argument (the VM name requested by the caller), falling back to self.name only if 'name' is None. Retries up to 3 times with retry_delay seconds if hostname is not available. 
""" if name is None: raise ValueError("VM name is required for CloudProvider.get_ip") return f"{name}.containers.cloud.trycua.com" ``` -------------------------------------------------------------------------------- /libs/lume/scripts/install.sh: -------------------------------------------------------------------------------- ```bash #!/bin/bash set -e # Lume Installer # This script installs Lume to your system # Define colors for output BOLD=$(tput bold) NORMAL=$(tput sgr0) RED=$(tput setaf 1) GREEN=$(tput setaf 2) BLUE=$(tput setaf 4) YELLOW=$(tput setaf 3) # Check if running as root or with sudo if [ "$(id -u)" -eq 0 ] || [ -n "$SUDO_USER" ]; then echo "${RED}Error: Do not run this script with sudo or as root.${NORMAL}" echo "If you need to install to a system directory, create it first with proper permissions:" echo " sudo mkdir -p /desired/directory && sudo chown $(whoami) /desired/directory" echo "Then run the installer normally:" echo " ./install.sh --install-dir=/desired/directory" exit 1 fi # Default installation directory (user-specific, doesn't require sudo) DEFAULT_INSTALL_DIR="$HOME/.local/bin" INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" # GitHub info GITHUB_REPO="trycua/cua" LATEST_RELEASE_URL="https://api.github.com/repos/$GITHUB_REPO/releases/latest" # Option to skip background service setup (default: install it) INSTALL_BACKGROUND_SERVICE=true # Default port for lume serve (default: 7777) LUME_PORT=7777 # Parse command line arguments while [ "$#" -gt 0 ]; do case "$1" in --install-dir) INSTALL_DIR="$2" shift ;; --port) LUME_PORT="$2" shift ;; --no-background-service) INSTALL_BACKGROUND_SERVICE=false ;; --help) echo "${BOLD}${BLUE}Lume Installer${NORMAL}" echo "Usage: $0 [OPTIONS]" echo "" echo "Options:" echo " --install-dir DIR Install to the specified directory (default: $DEFAULT_INSTALL_DIR)" echo " --port PORT Specify the port for lume serve (default: 7777)" echo " --no-background-service Do not setup the Lume background service 
(LaunchAgent)" echo " --help Display this help message" echo "" echo "Examples:" echo " $0 # Install to $DEFAULT_INSTALL_DIR and setup background service" echo " $0 --install-dir=/usr/local/bin # Install to system directory (may require root privileges)" echo " $0 --port 7778 # Use port 7778 instead of the default 7777" echo " $0 --no-background-service # Install without setting up the background service" echo " INSTALL_DIR=/opt/lume $0 # Install to /opt/lume (legacy env var support)" exit 0 ;; *) echo "${RED}Unknown option: $1${NORMAL}" echo "Use --help for usage information" exit 1 ;; esac shift done echo "${BOLD}${BLUE}Lume Installer${NORMAL}" echo "This script will install Lume to your system." # Check if we're running with appropriate permissions check_permissions() { # System directories that typically require root privileges SYSTEM_DIRS=("/usr/local/bin" "/usr/bin" "/bin" "/opt") NEEDS_ROOT=false for DIR in "${SYSTEM_DIRS[@]}"; do if [[ "$INSTALL_DIR" == "$DIR"* ]] && [ ! -w "$INSTALL_DIR" ]; then NEEDS_ROOT=true break fi done if [ "$NEEDS_ROOT" = true ]; then echo "${YELLOW}Warning: Installing to $INSTALL_DIR may require root privileges.${NORMAL}" echo "Consider these alternatives:" echo " • Install to a user-writable location: $0 --install-dir=$HOME/.local/bin" echo " • Create the directory with correct permissions first:" echo " sudo mkdir -p $INSTALL_DIR && sudo chown $(whoami) $INSTALL_DIR" echo "" # Check if we already have write permission (might have been set up previously) if [ ! -w "$INSTALL_DIR" ] && [ ! -w "$(dirname "$INSTALL_DIR")" ]; then echo "${RED}Error: You don't have write permission to $INSTALL_DIR${NORMAL}" echo "Please choose a different installation directory or ensure you have the proper permissions." 
exit 1 fi fi } # Detect OS and architecture detect_platform() { OS=$(uname -s | tr '[:upper:]' '[:lower:]') ARCH=$(uname -m) if [ "$OS" != "darwin" ]; then echo "${RED}Error: Currently only macOS is supported.${NORMAL}" exit 1 fi if [ "$ARCH" != "arm64" ]; then echo "${RED}Error: Lume only supports macOS on Apple Silicon (ARM64).${NORMAL}" exit 1 fi PLATFORM="darwin-arm64" echo "Detected platform: ${BOLD}$PLATFORM${NORMAL}" } # Create temporary directory create_temp_dir() { TEMP_DIR=$(mktemp -d) echo "Using temporary directory: $TEMP_DIR" # Make sure we clean up on exit trap 'rm -rf "$TEMP_DIR"' EXIT } # Download the latest release download_release() { echo "Downloading latest Lume release..." # Use the direct download link with the non-versioned symlink DOWNLOAD_URL="https://github.com/$GITHUB_REPO/releases/latest/download/lume.tar.gz" echo "Downloading from: $DOWNLOAD_URL" # Download the tarball if command -v curl &> /dev/null; then curl -L --progress-bar "$DOWNLOAD_URL" -o "$TEMP_DIR/lume.tar.gz" # Verify the download was successful if [ ! -s "$TEMP_DIR/lume.tar.gz" ]; then echo "${RED}Error: Failed to download Lume.${NORMAL}" echo "The download URL may be incorrect or the file may not exist." exit 1 fi # Verify the file is a valid archive if ! tar -tzf "$TEMP_DIR/lume.tar.gz" > /dev/null 2>&1; then echo "${RED}Error: The downloaded file is not a valid tar.gz archive.${NORMAL}" echo "Let's try the alternative URL..." # Try alternative URL ALT_DOWNLOAD_URL="https://github.com/$GITHUB_REPO/releases/latest/download/lume-$PLATFORM.tar.gz" echo "Downloading from alternative URL: $ALT_DOWNLOAD_URL" curl -L --progress-bar "$ALT_DOWNLOAD_URL" -o "$TEMP_DIR/lume.tar.gz" # Check again if ! 
tar -tzf "$TEMP_DIR/lume.tar.gz" > /dev/null 2>&1; then echo "${RED}Error: Could not download a valid Lume archive.${NORMAL}" echo "Please try installing Lume manually from: https://github.com/$GITHUB_REPO/releases/latest" exit 1 fi fi else echo "${RED}Error: curl is required but not installed.${NORMAL}" exit 1 fi } # Extract and install install_binary() { echo "Extracting archive..." tar -xzf "$TEMP_DIR/lume.tar.gz" -C "$TEMP_DIR" echo "Installing to $INSTALL_DIR..." # Create install directory if it doesn't exist mkdir -p "$INSTALL_DIR" # Move the binary to the installation directory mv "$TEMP_DIR/lume" "$INSTALL_DIR/" # Make the binary executable chmod +x "$INSTALL_DIR/lume" echo "${GREEN}Installation complete!${NORMAL}" echo "Lume has been installed to ${BOLD}$INSTALL_DIR/lume${NORMAL}" # Check if the installation directory is in PATH if [ -n "${PATH##*$INSTALL_DIR*}" ]; then SHELL_NAME=$(basename "$SHELL") echo "${YELLOW}Warning: $INSTALL_DIR is not in your PATH.${NORMAL}" case "$SHELL_NAME" in zsh) echo "To add it, run:" echo " echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.zprofile" ;; bash) echo "To add it, run:" echo " echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.bash_profile" ;; fish) echo "To add it, run:" echo " echo 'fish_add_path $INSTALL_DIR' >> ~/.config/fish/config.fish" ;; *) echo "Add $INSTALL_DIR to your PATH in your shell profile file." ;; esac fi } # Main installation flow main() { check_permissions detect_platform create_temp_dir download_release install_binary echo "" echo "${GREEN}${BOLD}Lume has been successfully installed!${NORMAL}" echo "Run ${BOLD}lume${NORMAL} to get started." if [ "$INSTALL_BACKGROUND_SERVICE" = true ]; then # --- Setup background service (LaunchAgent) for Lume --- SERVICE_NAME="com.trycua.lume_daemon" PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" LUME_BIN="$INSTALL_DIR/lume" echo "" echo "Setting up LaunchAgent to run lume daemon on login..." 
# Create LaunchAgents directory if it doesn't exist mkdir -p "$HOME/Library/LaunchAgents" # Unload existing service if present if [ -f "$PLIST_PATH" ]; then echo "Existing LaunchAgent found. Unloading..." launchctl unload "$PLIST_PATH" 2>/dev/null || true fi # Create the plist file cat <<EOF > "$PLIST_PATH" <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> <plist version="1.0"> <dict> <key>Label</key> <string>$SERVICE_NAME</string> <key>ProgramArguments</key> <array> <string>$LUME_BIN</string> <string>serve</string> <string>--port</string> <string>$LUME_PORT</string> </array> <key>RunAtLoad</key> <true/> <key>KeepAlive</key> <true/> <key>WorkingDirectory</key> <string>$HOME</string> <key>EnvironmentVariables</key> <dict> <key>PATH</key> <string>/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$HOME/.local/bin</string> <key>HOME</key> <string>$HOME</string> </dict> <key>StandardOutPath</key> <string>/tmp/lume_daemon.log</string> <key>StandardErrorPath</key> <string>/tmp/lume_daemon.error.log</string> <key>ProcessType</key> <string>Interactive</string> <key>SessionType</key> <string>Aqua</string> </dict> </plist> EOF # Set permissions chmod 644 "$PLIST_PATH" touch /tmp/lume_daemon.log /tmp/lume_daemon.error.log chmod 644 /tmp/lume_daemon.log /tmp/lume_daemon.error.log # Load the LaunchAgent echo "Loading LaunchAgent..." launchctl unload "$PLIST_PATH" 2>/dev/null || true launchctl load "$PLIST_PATH" echo "${GREEN}Lume daemon LaunchAgent installed and loaded. 
It will start automatically on login!${NORMAL}" echo "To check status: launchctl list | grep $SERVICE_NAME" echo "To view logs: tail -f /tmp/lume_daemon.log" echo "" echo "To remove the lume daemon service, run:" echo " launchctl unload \"$PLIST_PATH\"" echo " rm \"$PLIST_PATH\"" else SERVICE_NAME="com.trycua.lume_daemon" PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" if [ -f "$PLIST_PATH" ]; then echo "Removing existing Lume background service (LaunchAgent)..." launchctl unload "$PLIST_PATH" 2>/dev/null || true rm "$PLIST_PATH" echo "Lume background service (LaunchAgent) removed." else echo "Skipping Lume background service (LaunchAgent) setup as requested (use --no-background-service)." fi fi } # Run the installation main ``` -------------------------------------------------------------------------------- /blog/hack-the-north.md: -------------------------------------------------------------------------------- ```markdown # What happens when hackathon judging is a public benchmark (Hack the North edition) *Written by Francesco Bonacci — Reviewed by Parth Patel (HUD W25) — Sept 25, 2025* ## Prologue Hack the North ran Sept 12–14 at the University of Waterloo. Official count this year: **1,778 hackers**, and a [Guinness World Record for the most people building interlocking plastic brick sculptures simultaneously](https://uwaterloo.ca/news/eweal-making-hackathons-fun-again-breaking-guinness-world-record). Our team arrived from Europe and the US one day before the hackathon, after a summer scattered post–YC X25, waiting for our O-1 visas. **HUD**’s founders Parth and Jay flew in from SF to help us run evaluations, and Michael and Parth from **Ollama** joined as co-sponsors. Our plan was ambitious: run the **first state-of-the-art Computer-Use Agents track**, score it on a public benchmark, and give the top performer a guaranteed YC interview. (Interview ≠ offer. YC didn’t judge.) 
The rest, as they say, was a 36-hour story worth telling—and a playbook worth sharing for anyone thinking about running or sponsoring this type of hackathon track.
In our case, Hack the North didn’t provide Devpost access until the very end, so our form was the only way to build a working roster. Only a small trickle of sign-ups came through by the time the event kicked off—too few to plan around, but clearly the right kind of crowd. Several were already familiar with computer-use agents; one was even interning at Shopify, working on this space. ## At the Sponsor Booth Day 0 on campus made the difference. We arrived a couple of hours early to collect swag shipments (around 1,200 stickers of our new **Cua-la** mascot, plus t-shirts and hats—always plan ~1.5× the estimated number of hackers!). After walking the sponsor floor and explaining the track at our booth, ~40 hackers signed up. **Moral:** sponsor booths are still the most effective way to recruit for a track. **Suggestions to maximize booth time (for HTN this is only ~24 of the total 36 hours):** - **Be unmistakable.** Run a mini-challenge and a visible giveaway. We offered 5 × $200 Anthropic credits as a lightning raffle and constantly advertised in HTN Slack. Shout-out to our neighbors at **Mintlify**, who dressed their teammate as a mint plant - memorable and effective. - **Create multiple touchpoints.** Hand out flyers and QR codes, and ask nearby booths to cross-refer. Big thanks to the YC team for flyer space and student connections - and to Michael (Ollama) for pointing visitors our way. - **Never leave the booth empty.** Keep someone at the booth at all times and rotate shifts. With four founding engineers on-site, coverage was easy. Even after hacking kicked off, the booth stayed a point of reference - and even then multiple participants DM’d us asking where to meet up. - **Students are organic DevRel.** Our runner-up, Adam, hung out with us at the booth, pulling more people in. Peer-to-peer energy creates the network effect you need!  
*(Our Founding Engineer, Morgan, hangs out with students at the stand, while Adam (runner-up) hacks on the side.)* ## 02:30 a.m. is still prime time at a hackathon Hack the North gives sponsors a 30-minute API Workshop during the early hours of the event—a perfect moment to shift from talking to building. Our slot landed at **2:30 a.m.** (*perks of the cheapest sponsor tier*). Thirty students showed up, energy surprisingly high. James, our new Founding DevRel Engineer, led the session and nailed it. **Our track rules were simple:** 1. Build a Computer-Use Agent with the [Cua framework](https://github.com/trycua/cua) 2. Benchmark the agent on [HUD](https://www.hud.so) 3. Use [OSWorld-Tiny](https://huggingface.co/datasets/ddupont/OSWorld-Tiny-Public): a 14-task distillation of the full benchmark (~360 tasks, >1h) **Suggestions:** - **Leave something tangible.** We provided a Jupyter Notebook teams could run immediately. - **Narrow scope, strong starts.** The more focused the challenge, the more **robust starting points** you should provide. - **Want the details?** [Here’s the notebook we left participants](https://github.com/trycua/cua/blob/main/notebooks/sota_hackathon.ipynb).  *(Our CUA Workshop at 2:30 AM.)* ## Making it possible to focus on the work If you’re an OSS framework, it’s tempting to have hackers self-host on laptops. **Don’t.** You’ll spend the workshop debugging setups instead of reviewing ideas. **Lesson learned:** within hours, we shifted to **cloud-only Sandboxes**. Payoff: consistent environments, faster starts, far less tech support. We provided: - **Credits:** $200 Cua Cloud + $200 HUD per team (manual top-ups for visible progress) - **LLMs/VLMs:** Anthropic assigned $50 per participant—tight for VLM iteration—so we added capped access under our org - **Pre-kickoff provisioning:** Platform sign-up auto-created projects, keys, and sandboxes **Takeaway:** every minute not spent on setup is a minute gained for iterating. 
## 12 Hours in the Hackathon **After the workshop buzz.** Morning interest was high, but Docker setup + requiring focus on a single track thinned the crowd. Most sponsor prizes are broad (“use our product and you qualify”), letting students stack tracks. Ours required commitment. Upside: those who stayed shipped sharper, higher-quality submissions. **The bell curve of submissions.** Most entries used *claude-sonnet-4-20250514*—proof that docs and public leaderboards ([OSWorld](https://os-world.github.io/#benchmark)) guide choices. Results clustered around the safe pick, with fewer pushing boundaries. **Who went beyond the baseline.** A few tried multi-agent/tool graphs. One standout—[**cuala**](https://github.com/YeIIcw/cuala)—was a clean reference: deterministic actions, verifiable state changes, callbacks for saving images and trajectories. **Bottom line:** Early excitement is easy; keeping teams engaged requires reducing friction and offering multiple entry points. ### What broke (and why) We skipped a full end-to-end **Cua × HUD** dry-run. It showed. - Hackers ran out of inference credits. Desktop tasks are token-heavy. A full OSWorld run (200 max steps) for *computer-use-preview* (OpenAI Operator API) can cost >$600. Serious attempts: ~400k tokens × 14 tasks. - Python version/build mismatches surfaced, requiring debug time across both OSS repos. - Our Cua framework lacked a **Response Agent** to complete evaluation loops. Some runs stalled until patched. 
## Scoring and Results ### Participation & Outcomes - ~**30** hackers gave the track a serious try; **5** crossed the finish line - All submissions were **solo**, mostly undergrads - Judging: OSWorld-Tiny on HUD, with Cua + HUD reruns to verify scores - Final leaderboard: [HUD Leaderboard](https://www.hud.so/leaderboards/ddupont/OSWorld-Tiny-Public)  *(Leaderboard on HUD)* ### Winners **🥇 Winner — Ram** - Devpost: https://devpost.com/software/sota-computer-use-agent-challenge - Code: https://github.com/Ram-Raghav-S/cua/tree/ram - Score: 68.3% **🥈 Runner-up — Aryan** - Devpost: https://devpost.com/software/loopdeloop-computer-use-agent-sota-attempt - Code: https://github.com/Tumph/cua - Score: 55.9% **🥉 Special Mention — Adam** - Devpost: https://devpost.com/software/cuala - Code: https://github.com/YeIIcw/cuala - Score: 42.1%  *(Our finalists before the award ceremony)* ## What We’d Keep - **Sponsor Hack the North again** - **Keep a visible, staffed booth** - **Publish a compact FAQ** - **Simple, transparent scoring** ## What We’d Change - **Run a full Cua × HUD dry-run under load** - **Offer multiple on-ramps (evals, creative, RL)** - **Keep a private eval set for judging** - **Default to cloud sandboxes** - **Handle ops earlier (swag, signage, QR codes)** - **Reward generalization, not lucky runs** ## Closing Thoughts Our first outing as sponsors wasn’t perfect, but it gave us a working playbook: **provision cloud early, keep scoring simple, always dry-run infra, and make the booth unforgettable**. If more hackathon tracks leaned on **public benchmarks**, weekends like this would produce fewer demos-for-show and more measurable progress. **P.S.** Huge thanks to the Ollama and HUD teams for co-sponsoring the track, and to our YC Partner Diana for offering a **guaranteed YC interview** as first prize. 
Whether you’re a hacker who wants to participate, or a company looking to sponsor, let’s talk — we’re especially excited to support benchmark-first hackathon tracks in the Bay Area this year.  *(HTN Closing Ceremony — Cua Track Winner Announcement)* ``` -------------------------------------------------------------------------------- /libs/typescript/computer/src/interface/base.ts: -------------------------------------------------------------------------------- ```typescript /** * Base interface for computer control. */ import pino from 'pino'; import WebSocket from 'ws'; import type { ScreenSize } from '../types'; export type MouseButton = 'left' | 'middle' | 'right'; export interface CursorPosition { x: number; y: number; } export interface AccessibilityNode { role: string; title?: string; value?: string; description?: string; bounds?: { x: number; y: number; width: number; height: number; }; children?: AccessibilityNode[]; } /** * Base class for computer control interfaces. */ export abstract class BaseComputerInterface { protected ipAddress: string; protected username: string; protected password: string; protected closed = false; protected commandLock: Promise<unknown> = Promise.resolve(); protected ws: WebSocket; protected apiKey?: string; protected vmName?: string; protected logger = pino({ name: 'computer.interface-base' }); constructor( ipAddress: string, username = 'lume', password = 'lume', apiKey?: string, vmName?: string ) { this.ipAddress = ipAddress; this.username = username; this.password = password; this.apiKey = apiKey; this.vmName = vmName; // Initialize WebSocket with headers if needed const headers: { [key: string]: string } = {}; if (this.apiKey && this.vmName) { headers['X-API-Key'] = this.apiKey; headers['X-VM-Name'] = this.vmName; } // Create the WebSocket instance this.ws = new WebSocket(this.wsUri, { headers }); } /** * Get the WebSocket URI for connection. * Subclasses can override this to customize the URI. 
*/ protected get wsUri(): string { const protocol = this.apiKey ? 'wss' : 'ws'; // Check if ipAddress already includes a port if (this.ipAddress.includes(':')) { return `${protocol}://${this.ipAddress}/ws`; } // Otherwise, append the default port const port = this.apiKey ? '8443' : '8000'; return `${protocol}://${this.ipAddress}:${port}/ws`; } /** * Wait for interface to be ready. * @param timeout Maximum time to wait in seconds * @throws Error if interface is not ready within timeout */ async waitForReady(timeout = 60): Promise<void> { const startTime = Date.now(); while (Date.now() - startTime < timeout * 1000) { try { await this.connect(); return; } catch (error) { console.log(error); // Wait a bit before retrying this.logger.error( `Error connecting to websocket: ${JSON.stringify(error)}` ); await new Promise((resolve) => setTimeout(resolve, 1000)); } } throw new Error(`Interface not ready after ${timeout} seconds`); } /** * Authenticate with the WebSocket server. * This should be called immediately after the WebSocket connection is established. 
*/ private async authenticate(): Promise<void> { if (!this.apiKey || !this.vmName) { // No authentication needed return; } this.logger.info('Performing authentication handshake...'); const authMessage = { command: 'authenticate', params: { api_key: this.apiKey, container_name: this.vmName, }, }; return new Promise<void>((resolve, reject) => { const authHandler = (data: WebSocket.RawData) => { try { const authResult = JSON.parse(data.toString()); if (!authResult.success) { const errorMsg = authResult.error || 'Authentication failed'; this.logger.error(`Authentication failed: ${errorMsg}`); this.ws.close(); reject(new Error(`Authentication failed: ${errorMsg}`)); } else { this.logger.info('Authentication successful'); this.ws.off('message', authHandler); resolve(); } } catch (error) { this.ws.off('message', authHandler); reject(error); } }; this.ws.on('message', authHandler); this.ws.send(JSON.stringify(authMessage)); }); } /** * Connect to the WebSocket server. */ public async connect(): Promise<void> { // If the WebSocket is already open, check if we need to authenticate if (this.ws.readyState === WebSocket.OPEN) { this.logger.info( 'Websocket is open, ensuring authentication is complete.' ); return this.authenticate(); } // If the WebSocket is closed or closing, reinitialize it if ( this.ws.readyState === WebSocket.CLOSED || this.ws.readyState === WebSocket.CLOSING ) { this.logger.info('Websocket is closed. 
Reinitializing connection.'); const headers: { [key: string]: string } = {}; if (this.apiKey && this.vmName) { headers['X-API-Key'] = this.apiKey; headers['X-VM-Name'] = this.vmName; } this.ws = new WebSocket(this.wsUri, { headers }); return this.authenticate(); } // Connect and authenticate return new Promise((resolve, reject) => { const onOpen = async () => { try { // Always authenticate immediately after connection await this.authenticate(); resolve(); } catch (error) { reject(error); } }; // If already connecting, wait for it to complete then authenticate if (this.ws.readyState === WebSocket.CONNECTING) { this.ws.addEventListener('open', onOpen, { once: true }); this.ws.addEventListener('error', (error) => reject(error), { once: true, }); return; } // Set up event handlers this.ws.on('open', onOpen); this.ws.on('error', (error: Error) => { reject(error); }); this.ws.on('close', () => { if (!this.closed) { // Attempt to reconnect setTimeout(() => this.connect(), 1000); } }); }); } /** * Send a command to the WebSocket server. 
*/ public async sendCommand( command: string, params: { [key: string]: unknown } = {} ): Promise<{ [key: string]: unknown }> { // Create a new promise for this specific command const commandPromise = new Promise<{ [key: string]: unknown }>( (resolve, reject) => { // Chain it to the previous commands const executeCommand = async (): Promise<{ [key: string]: unknown; }> => { if (!this.ws || this.ws.readyState !== WebSocket.OPEN) { await this.connect(); } return new Promise<{ [key: string]: unknown }>( (innerResolve, innerReject) => { const messageHandler = (data: WebSocket.RawData) => { try { const response = JSON.parse(data.toString()); if (response.error) { innerReject(new Error(response.error)); } else { innerResolve(response); } } catch (error) { innerReject(error); } this.ws.off('message', messageHandler); }; this.ws.on('message', messageHandler); const wsCommand = { command, params }; this.ws.send(JSON.stringify(wsCommand)); } ); }; // Add this command to the lock chain this.commandLock = this.commandLock.then(() => executeCommand().then(resolve, reject) ); } ); return commandPromise; } /** * Check if the WebSocket is connected. */ public isConnected(): boolean { return this.ws && this.ws.readyState === WebSocket.OPEN; } /** * Close the interface connection. */ disconnect(): void { this.closed = true; if (this.ws && this.ws.readyState === WebSocket.OPEN) { this.ws.close(); } else if (this.ws && this.ws.readyState === WebSocket.CONNECTING) { // If still connecting, terminate the connection attempt this.ws.terminate(); } } /** * Force close the interface connection. * By default, this just calls close(), but subclasses can override * to provide more forceful cleanup. 
*/ forceClose(): void { this.disconnect(); } // Mouse Actions abstract mouseDown( x?: number, y?: number, button?: MouseButton ): Promise<void>; abstract mouseUp(x?: number, y?: number, button?: MouseButton): Promise<void>; abstract leftClick(x?: number, y?: number): Promise<void>; abstract rightClick(x?: number, y?: number): Promise<void>; abstract doubleClick(x?: number, y?: number): Promise<void>; abstract moveCursor(x: number, y: number): Promise<void>; abstract dragTo( x: number, y: number, button?: MouseButton, duration?: number ): Promise<void>; abstract drag( path: Array<[number, number]>, button?: MouseButton, duration?: number ): Promise<void>; // Keyboard Actions abstract keyDown(key: string): Promise<void>; abstract keyUp(key: string): Promise<void>; abstract typeText(text: string): Promise<void>; abstract pressKey(key: string): Promise<void>; abstract hotkey(...keys: string[]): Promise<void>; // Scrolling Actions abstract scroll(x: number, y: number): Promise<void>; abstract scrollDown(clicks?: number): Promise<void>; abstract scrollUp(clicks?: number): Promise<void>; // Screen Actions abstract screenshot(): Promise<Buffer>; abstract getScreenSize(): Promise<ScreenSize>; abstract getCursorPosition(): Promise<CursorPosition>; // Clipboard Actions abstract copyToClipboard(): Promise<string>; abstract setClipboard(text: string): Promise<void>; // File System Actions abstract fileExists(path: string): Promise<boolean>; abstract directoryExists(path: string): Promise<boolean>; abstract listDir(path: string): Promise<string[]>; abstract readText(path: string): Promise<string>; abstract writeText(path: string, content: string): Promise<void>; abstract readBytes(path: string): Promise<Buffer>; abstract writeBytes(path: string, content: Buffer): Promise<void>; abstract deleteFile(path: string): Promise<void>; abstract createDir(path: string): Promise<void>; abstract deleteDir(path: string): Promise<void>; abstract runCommand(command: string): Promise<[string, 
string]>; // Accessibility Actions abstract getAccessibilityTree(): Promise<AccessibilityNode>; abstract toScreenCoordinates(x: number, y: number): Promise<[number, number]>; abstract toScreenshotCoordinates( x: number, y: number ): Promise<[number, number]>; } ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/integrations/hud/proxy.py: -------------------------------------------------------------------------------- ```python """HUD ComputerAgent wrapper and Fake AsyncOpenAI client. Provides FakeAsyncOpenAI that adapts our ComputerAgent to the OpenAI Responses interface needed by HUD's OperatorAgent. It implements only `responses.create` and returns an OpenAI Response object with `id` and `output` fields, where `output` is a list of OpenAI-like response blocks. We intentionally only support a single-step call by consuming the first yielded result from `ComputerAgent.run()`. """ import traceback import time import uuid from typing import Any, Dict, List, Optional from agent.agent import ComputerAgent as BaseComputerAgent from agent.callbacks import PromptInstructionsCallback from hud.tools.computer.settings import computer_settings from PIL import Image from hud.agents import OperatorAgent # OpenAI Responses typed models (required) from openai.types.responses import ( Response, ResponseInputParam, ResponseOutputItem, ResponseComputerToolCall, ResponseOutputMessage, ResponseOutputText, ResponseReasoningItem, ResponseUsage, ) def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]: """Map our agent output items to OpenAI ResponseOutputItem typed models. Only a subset is supported: computer_call, assistant message (text), and reasoning. Unknown types are ignored. 
""" blocks: List[ResponseOutputItem] = [] for item in output_items or []: t = item.get("type") if t == "computer_call": comp = ResponseComputerToolCall.model_validate({ "id": item.get("id") or f"cu_{uuid.uuid4().hex}", "type": "computer_call", "call_id": item["call_id"], "action": item["action"], "pending_safety_checks": item.get("pending_safety_checks", []), "status": "completed", }) blocks.append(comp) # we will exit early here as the responses api only supports a single step break elif t == "message" and item.get("role") == "assistant": content_blocks: List[ResponseOutputText] = [] for c in item.get("content", []) or []: content_blocks.append( ResponseOutputText.model_validate({ "type": "output_text", "text": c["text"], "annotations": [], }) ) if content_blocks: msg = ResponseOutputMessage.model_validate({ "id": item.get("id") or f"msg_{uuid.uuid4()}", "type": "message", "role": "assistant", "status": "completed", "content": [ct.model_dump() for ct in content_blocks], }) blocks.append(msg) elif t == "reasoning": reasoning = ResponseReasoningItem.model_validate({ "id": item.get("id") or f"rsn_{uuid.uuid4()}", "type": "reasoning", "summary": item["summary"], }) blocks.append(reasoning) # Unhandled types are ignored return blocks def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]: out: List[Dict[str, Any]] = [] for it in list(items): if hasattr(it, "model_dump"): out.append(it.model_dump()) # type: ignore[attr-defined] elif isinstance(it, dict): out.append(it) else: # Strict: rely on default __dict__ if present out.append(dict(it)) # may raise if not mapping return out class FakeAsyncOpenAI: """Minimal fake OpenAI client with only `responses.create` implemented. It uses a provided `ComputerAgent` instance to produce a single-step response compatible with HUD's OperatorAgent loop. 
""" def __init__(self, computer_agent: BaseComputerAgent) -> None: self._agent = computer_agent self.responses = self._Responses(self) class _Responses: def __init__(self, parent: "FakeAsyncOpenAI") -> None: # Caches for cross-call context when using previous_response_id self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {} self.context_cache: Dict[str, List[str]] = {} self.agent = parent._agent async def create( self, *, model: str, input: ResponseInputParam, tools: Optional[List[Dict[str, Any]]] = None, instructions: Optional[str] = None, previous_response_id: Optional[str] = None, max_retries: int = 5, **_: Any, ) -> Any: for attempt in range(max_retries): # Prepend cached blocks from previous_response_id to input full_input = input if previous_response_id is not None: prev_block_ids = self.context_cache[previous_response_id] prev_blocks = [self.blocks_cache[b_id] for b_id in prev_block_ids] full_input = _to_plain_dict_list(prev_blocks + input) # Pre-pend instructions message effective_input = full_input if instructions: effective_input = [{ "role": "user", "content": instructions, }] + full_input # Run a single iteration of the ComputerAgent agent_result: Optional[Dict[str, Any]] = None async for result in self.agent.run(effective_input): # type: ignore[arg-type] agent_result = result break assert agent_result is not None, "Agent failed to produce result" output = _map_agent_output_to_openai_blocks(agent_result["output"]) usage = agent_result["usage"] # Cache conversation context using the last response id block_ids: List[str] = [] blocks_to_cache = full_input + output for b in blocks_to_cache: bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}" self.blocks_cache[bid] = b # type: ignore[assignment] block_ids.append(bid) response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}" self.context_cache[response_id] = block_ids try: return Response.model_validate({ "id": response_id, "created_at": time.time(), "object": "response", 
"model": model, "output": output, "parallel_tool_calls": False, "tool_choice": "auto", "tools": [], "previous_response_id": previous_response_id, "usage": ResponseUsage.model_validate({ "input_tokens": usage.get("input_tokens", 0), "output_tokens": usage.get("output_tokens", 0), "total_tokens": usage.get("total_tokens", 0), "input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }), "output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }), }), }) except Exception as e: print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e) if attempt == max_retries - 1: print(traceback.format_exc()) raise e # --------------------------------------------------------------------------- # Proxy OperatorAgent (moved from __init__.py) # --------------------------------------------------------------------------- class ProxyOperatorAgent(OperatorAgent): """OperatorAgent that proxies model calls through our ComputerAgent. Accepts the same config keys we pass via hud.run_dataset `agent_config`: - model: str | None - allowed_tools: list[str] | None Additional kwargs are forwarded to OperatorAgent (if any are supported). 
""" def __init__( self, *, model: str | None = None, allowed_tools: list[str] | None = None, trajectory_dir: str | dict | None = None, # === ComputerAgent kwargs === tools: list[Any] | None = None, custom_loop: Any | None = None, only_n_most_recent_images: int | None = None, callbacks: list[Any] | None = None, instructions: str | None = None, verbosity: int | None = None, max_retries: int | None = 3, screenshot_delay: float | int = 0.5, use_prompt_caching: bool | None = False, max_trajectory_budget: float | dict | None = None, telemetry_enabled: bool | None = True, **kwargs: Any, ) -> None: model = model or "computer-use-preview" allowed_tools = allowed_tools or ["openai_computer"] computer_shim = { 'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)), 'environment': 'linux', 'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT) } # Build tools ensuring the computer_shim is included agent_tools: list[Any] = [computer_shim] if tools: agent_tools.extend(tools) # Build callbacks, injecting prompt instructions if provided agent_callbacks = list(callbacks or []) if instructions: agent_callbacks.append(PromptInstructionsCallback(instructions)) computer_agent = BaseComputerAgent( model=model, tools=agent_tools, custom_loop=custom_loop, only_n_most_recent_images=only_n_most_recent_images, callbacks=agent_callbacks, verbosity=verbosity, trajectory_dir=trajectory_dir, max_retries=max_retries, screenshot_delay=screenshot_delay, use_prompt_caching=use_prompt_caching, max_trajectory_budget=max_trajectory_budget, telemetry_enabled=telemetry_enabled, ) model_client = FakeAsyncOpenAI(computer_agent) super().__init__( model_client=model_client, # type: ignore[arg-type] model=model, allowed_tools=allowed_tools, **kwargs, ) __all__ = [ "FakeAsyncOpenAI", "ProxyOperatorAgent", ] ``` -------------------------------------------------------------------------------- 
/libs/python/agent/agent/callbacks/logging.py: -------------------------------------------------------------------------------- ```python """ Logging callback for ComputerAgent that provides configurable logging of agent lifecycle events. """ import json import logging from typing import Dict, List, Any, Optional, Union from .base import AsyncCallbackHandler def sanitize_image_urls(data: Any) -> Any: """ Recursively search for 'image_url' keys and set their values to '[omitted]'. Args: data: Any data structure (dict, list, or primitive type) Returns: A deep copy of the data with all 'image_url' values replaced with '[omitted]' """ if isinstance(data, dict): # Create a copy of the dictionary sanitized = {} for key, value in data.items(): if key == "image_url": sanitized[key] = "[omitted]" else: # Recursively sanitize the value sanitized[key] = sanitize_image_urls(value) return sanitized elif isinstance(data, list): # Recursively sanitize each item in the list return [sanitize_image_urls(item) for item in data] else: # For primitive types (str, int, bool, None, etc.), return as-is return data class LoggingCallback(AsyncCallbackHandler): """ Callback handler that logs agent lifecycle events with configurable verbosity. Logging levels: - DEBUG: All events including API calls, message preprocessing, and detailed outputs - INFO: Major lifecycle events (start/end, messages, outputs) - WARNING: Only warnings and errors - ERROR: Only errors """ def __init__(self, logger: Optional[logging.Logger] = None, level: int = logging.INFO): """ Initialize the logging callback. Args: logger: Logger instance to use. If None, creates a logger named 'agent.ComputerAgent' level: Logging level (logging.DEBUG, logging.INFO, etc.) 
""" self.logger = logger or logging.getLogger('agent.ComputerAgent') self.level = level # Set up logger if it doesn't have handlers if not self.logger.handlers: handler = logging.StreamHandler() formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) handler.setFormatter(formatter) self.logger.addHandler(handler) self.logger.setLevel(level) def _update_usage(self, usage: Dict[str, Any]) -> None: """Update total usage statistics.""" def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None: for key, value in source.items(): if isinstance(value, dict): if key not in target: target[key] = {} add_dicts(target[key], value) else: if key not in target: target[key] = 0 target[key] += value add_dicts(self.total_usage, usage) async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None: """Called before the run starts.""" self.total_usage = {} async def on_usage(self, usage: Dict[str, Any]) -> None: """Called when usage information is received.""" self._update_usage(usage) async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None: """Called after the run ends.""" def format_dict(d, indent=0): lines = [] prefix = f" - {' ' * indent}" for key, value in d.items(): if isinstance(value, dict): lines.append(f"{prefix}{key}:") lines.extend(format_dict(value, indent + 1)) elif isinstance(value, float): lines.append(f"{prefix}{key}: ${value:.4f}") else: lines.append(f"{prefix}{key}: {value}") return lines formatted_output = "\n".join(format_dict(self.total_usage)) self.logger.info(f"Total usage:\n{formatted_output}") async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Called before LLM processing starts.""" if self.logger.isEnabledFor(logging.INFO): self.logger.info(f"LLM processing started with {len(messages)} messages") if self.logger.isEnabledFor(logging.DEBUG): sanitized_messages = 
[sanitize_image_urls(msg) for msg in messages] self.logger.debug(f"LLM input messages: {json.dumps(sanitized_messages, indent=2)}") return messages async def on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Called after LLM processing ends.""" if self.logger.isEnabledFor(logging.DEBUG): sanitized_messages = [sanitize_image_urls(msg) for msg in messages] self.logger.debug(f"LLM output: {json.dumps(sanitized_messages, indent=2)}") return messages async def on_computer_call_start(self, item: Dict[str, Any]) -> None: """Called when a computer call starts.""" action = item.get("action", {}) action_type = action.get("type", "unknown") action_args = {k: v for k, v in action.items() if k != "type"} # INFO level logging for the action self.logger.info(f"Computer: {action_type}({action_args})") # DEBUG level logging for full details if self.logger.isEnabledFor(logging.DEBUG): self.logger.debug(f"Computer call started: {json.dumps(action, indent=2)}") async def on_computer_call_end(self, item: Dict[str, Any], result: Any) -> None: """Called when a computer call ends.""" if self.logger.isEnabledFor(logging.DEBUG): action = item.get("action", "unknown") self.logger.debug(f"Computer call completed: {json.dumps(action, indent=2)}") if result: sanitized_result = sanitize_image_urls(result) self.logger.debug(f"Computer call result: {json.dumps(sanitized_result, indent=2)}") async def on_function_call_start(self, item: Dict[str, Any]) -> None: """Called when a function call starts.""" name = item.get("name", "unknown") arguments = item.get("arguments", "{}") # INFO level logging for the function call self.logger.info(f"Function: {name}({arguments})") # DEBUG level logging for full details if self.logger.isEnabledFor(logging.DEBUG): self.logger.debug(f"Function call started: {name}") async def on_function_call_end(self, item: Dict[str, Any], result: Any) -> None: """Called when a function call ends.""" # INFO level logging for function output (similar to 
function_call_output) if result: # Handle both list and direct result formats if isinstance(result, list) and len(result) > 0: output = result[0].get("output", str(result)) if isinstance(result[0], dict) else str(result[0]) else: output = str(result) # Truncate long outputs if len(output) > 100: output = output[:100] + "..." self.logger.info(f"Output: {output}") # DEBUG level logging for full details if self.logger.isEnabledFor(logging.DEBUG): name = item.get("name", "unknown") self.logger.debug(f"Function call completed: {name}") if result: self.logger.debug(f"Function call result: {json.dumps(result, indent=2)}") async def on_text(self, item: Dict[str, Any]) -> None: """Called when a text message is encountered.""" # Get the role to determine if it's Agent or User role = item.get("role", "unknown") content_items = item.get("content", []) # Process content items to build display text text_parts = [] for content_item in content_items: content_type = content_item.get("type", "output_text") if content_type == "output_text": text_content = content_item.get("text", "") if not text_content.strip(): text_parts.append("[empty]") else: # Truncate long text and add ellipsis if len(text_content) > 2048: text_parts.append(text_content[:2048] + "...") else: text_parts.append(text_content) else: # Non-text content, show as [type] text_parts.append(f"[{content_type}]") # Join all text parts display_text = ''.join(text_parts) if text_parts else "[empty]" # Log with appropriate level and format if role == "assistant": self.logger.info(f"Agent: {display_text}") elif role == "user": self.logger.info(f"User: {display_text}") else: # Fallback for unknown roles, use debug level if self.logger.isEnabledFor(logging.DEBUG): self.logger.debug(f"Text message ({role}): {display_text}") async def on_api_start(self, kwargs: Dict[str, Any]) -> None: """Called when an API call is about to start.""" if self.logger.isEnabledFor(logging.DEBUG): model = kwargs.get("model", "unknown") 
self.logger.debug(f"API call starting for model: {model}") # Log sanitized messages if present if "messages" in kwargs: sanitized_messages = sanitize_image_urls(kwargs["messages"]) self.logger.debug(f"API call messages: {json.dumps(sanitized_messages, indent=2)}") elif "input" in kwargs: sanitized_input = sanitize_image_urls(kwargs["input"]) self.logger.debug(f"API call input: {json.dumps(sanitized_input, indent=2)}") async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None: """Called when an API call has completed.""" if self.logger.isEnabledFor(logging.DEBUG): model = kwargs.get("model", "unknown") self.logger.debug(f"API call completed for model: {model}") self.logger.debug(f"API call result: {json.dumps(sanitize_image_urls(result), indent=2)}") async def on_screenshot(self, item: Union[str, bytes], name: str = "screenshot") -> None: """Called when a screenshot is taken.""" if self.logger.isEnabledFor(logging.DEBUG): image_size = len(item) / 1024 self.logger.debug(f"Screenshot captured: {name} {image_size:.2f} KB") ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/models/internvl.py: -------------------------------------------------------------------------------- ```python from __future__ import annotations from typing import List, Dict, Any, Optional # Hugging Face imports are local to avoid hard dependency at module import try: import torch # type: ignore from transformers import AutoModel, AutoTokenizer # type: ignore # Attempt to import InternVL's model dependencies import einops as _ # type: ignore import timm as _ # type: ignore from PIL import Image # type: ignore import torchvision.transforms as T # type: ignore from torchvision.transforms.functional import InterpolationMode # type: ignore import base64 # type: ignore from io import BytesIO # type: ignore import requests # type: ignore HF_AVAILABLE = True except Exception: HF_AVAILABLE = False class InternVLModel: 
    """Generic Hugging Face vision-language model handler.

    Uses InternVL's native `model.chat()` interface with `AutoTokenizer`.
    Provides preprocessing to support multi-turn conversations with multiple images.
    """

    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
        # Fail fast with an actionable install hint when the optional extra is missing.
        if not HF_AVAILABLE:
            raise ImportError(
                "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\""
            )
        self.model_name = model_name
        self.device = device
        self.model = None  # populated by _load()
        self.tokenizer = None  # populated by _load()
        self.trust_remote_code = trust_remote_code
        self._load()

    def _load(self) -> None:
        """Load the model and tokenizer eagerly; called once from __init__."""
        # Load model
        self.model = AutoModel.from_pretrained(
            self.model_name,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            use_flash_attn=True,
            device_map=self.device,
            trust_remote_code=self.trust_remote_code,
        ).eval()
        # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False)
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=self.trust_remote_code,
            use_fast=False,
        )

    # ---- Image preprocessing utilities adapted from InternVL docs ----
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)

    def _build_transform(self, input_size: int) -> T.Compose:
        """Return the per-tile transform: RGB convert, bicubic resize to a
        square of `input_size`, tensor conversion, ImageNet normalization."""
        MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD)
        ])
        return transform

    def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int):
        """Pick the (cols, rows) tiling ratio whose aspect ratio is closest to
        the image's; ties are broken toward more tiles when the image area is
        large enough (> half the tiled area) to justify them."""
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]:
        """Split `image` into between `min_num` and `max_num` square tiles of
        side `image_size`, choosing the tiling grid closest to the original
        aspect ratio; optionally append a whole-image thumbnail tile."""
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # All (cols, rows) grids whose tile count falls in [min_num, max_num].
        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
            i * j <= max_num and i * j >= min_num)
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        target_aspect_ratio = self._find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size)

        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

        resized_img = image.resize((target_width, target_height))
        processed_images: List[Image.Image] = []
        for i in range(blocks):
            # Crop tile i in row-major order from the resized image.
            box = (
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size
            )
            split_img = resized_img.crop(box)
            processed_images.append(split_img)
        assert len(processed_images) == blocks
        if use_thumbnail and len(processed_images) != 1:
            thumbnail_img = image.resize((image_size, image_size))
            processed_images.append(thumbnail_img)
        return processed_images

    def _load_image_from_source(self, src: str) -> Image.Image:
        """Load PIL image from various sources: data URL, http(s), or local path."""
        if src.startswith("data:image/"):
            # data URL base64
            header, b64data = src.split(",", 1)
            img_bytes = base64.b64decode(b64data)
            return Image.open(BytesIO(img_bytes)).convert('RGB')
        if src.startswith("http://") or src.startswith("https://"):
            resp = requests.get(src, timeout=10)
            resp.raise_for_status()
            return Image.open(BytesIO(resp.content)).convert('RGB')
        # Assume local file path
        return Image.open(src).convert('RGB')

    def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12):
        """Tile and normalize each image, returning (pixel_values, num_patches_list).

        `pixel_values` is the concatenation of all tiles across all images
        (None when `images` is empty); `num_patches_list[i]` is the tile count
        contributed by image i, as expected by InternVL's multi-image chat.
        """
        transform = self._build_transform(input_size=input_size)
        pixel_values_list = []
        num_patches_list: List[int] = []
        for img in images:
            tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
            pv = [transform(tile) for tile in tiles]
            pv = torch.stack(pv)
            num_patches_list.append(pv.shape[0])
            pixel_values_list.append(pv)
        if not pixel_values_list:
            return None, []
        pixel_values = torch.cat(pixel_values_list)
        return pixel_values, num_patches_list

    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
        """Generate text for the given HF-format messages.

        messages: [{ role, content: [{type:'text'|'image', text|image}] }]

        This implementation constructs InternVL-compatible inputs and uses
        `model.chat(tokenizer, pixel_values, question, history=...)` to avoid
        relying on AutoProcessor (which fails for some tokenizers).
        """
        assert self.model is not None and self.tokenizer is not None

        # Build textual context and collect images and the final question
        context_lines: List[str] = []
        all_images: List[Image.Image] = []
        last_user_text_parts: List[str] = []

        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", [])
            # Normalize a plain-string content field into the item-list shape.
            if isinstance(content, str):
                content_items = [{"type": "text", "text": content}]
            else:
                content_items = content

            if role == "user":
                # Collect text and images
                parts_text: List[str] = []
                for item in content_items:
                    if item.get("type") == "text":
                        t = item.get("text", "")
                        if t:
                            parts_text.append(t)
                    elif item.get("type") == "image":
                        url = item.get("image", "")
                        if url:
                            try:
                                all_images.append(self._load_image_from_source(url))
                            except Exception:
                                # Ignore failed image loads but keep going
                                # NOTE(review): failures are fully silent; consider logging.
                                pass
                text = "\n".join(parts_text).strip()
                if text:
                    context_lines.append(f"User: {text}")
                # Track last user text separately for question
                last_user_text_parts = parts_text or last_user_text_parts
            elif role == "assistant":
                # Only keep text content for history
                parts_text = [item.get("text", "") for item in content_items if item.get("type") == "text"]
                text = "\n".join(parts_text).strip()
                if text:
                    context_lines.append(f"Assistant: {text}")

        # Prepare pixel values for all collected images (across turns)
        pixel_values = None
        num_patches_list: List[int] = []
        if all_images:
            pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12)
            if pixel_values is not None:
                # Convert dtype/device as in docs
                pixel_values = pixel_values.to(torch.bfloat16)
                # Chat API expects tensors on CUDA when model is on CUDA
                try:
                    pixel_values = pixel_values.to(self.model.device)
                except Exception:
                    pass

        # Build question with any prior context and numbered image placeholders
        if all_images:
            # Separate images layout: Image-1: <image> ... then question text
            prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))]
            prefix = "\n".join(prefix_lines) + "\n"
        else:
            prefix = ""

        last_user_text = "\n".join(last_user_text_parts).strip()
        # Combine prior text-only turns as context to emulate multi-turn
        context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else ""

        base_question = last_user_text if last_user_text else "Describe the image(s) in detail."
        if context_text:
            question = (context_text + "\n" + prefix + base_question).strip()
        else:
            question = (prefix + base_question).strip()

        # Generation config
        generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False)

        # Call InternVL chat
        try:
            if pixel_values is None:
                # Pure-text conversation (embed prior turns in question)
                response = self.model.chat(self.tokenizer, None, question, generation_config)
            else:
                # Multi-image: pass num_patches_list if >1 image
                if len(num_patches_list) > 1:
                    response = self.model.chat(
                        self.tokenizer,
                        pixel_values,
                        question,
                        generation_config,
                        num_patches_list=num_patches_list,
                    )
                else:
                    response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
        except Exception as e:
            # Fallback: return empty string to avoid crashing the adapter
            # NOTE(review): the caught exception `e` is dropped unlogged, so
            # generation failures are indistinguishable from empty output.
            return ""
        return response or ""
```

--------------------------------------------------------------------------------
/scripts/playground.sh:
--------------------------------------------------------------------------------

```bash
#!/bin/bash
set -e

echo "🚀 Launching Cua Computer-Use Agent UI..."

# Save the original working directory
ORIGINAL_DIR="$(pwd)"

# Directories used by the script
DEMO_DIR="$HOME/.cua-demo"
VENV_DIR="$DEMO_DIR/venv"

# Function to clean up on exit
cleanup() {
    cd ~
    rm -rf "$TMP_DIR" 2>/dev/null || true
}

# Create a temporary directory for our work
TMP_DIR=$(mktemp -d)
cd "$TMP_DIR"
# Remove the temp dir on any exit path (set -e may abort at any point).
trap cleanup EXIT

# Ask user to choose between local macOS VMs or Cua Cloud Sandbox
echo ""
echo "Choose your Cua setup:"
echo "1) ☁️ Cua Cloud Sandbox (works on any system)"
echo "2) 🖥️ Local macOS VMs (requires Apple Silicon Mac + macOS 15+)"
echo ""
read -p "Enter your choice (1 or 2): " CHOICE

if [[ "$CHOICE" == "1" ]]; then
    # Cua Cloud Sandbox setup
    echo ""
    echo "☁️ Setting up Cua Cloud Sandbox..."
    echo ""

    # Check if existing .env.local already has CUA_API_KEY (check current dir and demo dir)
    # Look for .env.local in the original working directory (before cd to temp dir)
    CURRENT_ENV_FILE="$ORIGINAL_DIR/.env.local"
    DEMO_ENV_FILE="$DEMO_DIR/.env.local"
    CUA_API_KEY=""

    # First check current directory
    if [[ -f "$CURRENT_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$CURRENT_ENV_FILE"; then
        # Strip quotes/whitespace around the value; reject the placeholder.
        EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$CURRENT_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs)
        if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then
            CUA_API_KEY="$EXISTING_CUA_KEY"
        fi
    fi

    # Then check demo directory if not found in current dir
    if [[ -z "$CUA_API_KEY" ]] && [[ -f "$DEMO_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$DEMO_ENV_FILE"; then
        EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$DEMO_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs)
        if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then
            CUA_API_KEY="$EXISTING_CUA_KEY"
        fi
    fi

    # If no valid API key found, prompt for one
    if [[ -z "$CUA_API_KEY" ]]; then
        echo "To use Cua Cloud Sandbox, you need to:"
        echo "1. Sign up at https://trycua.com"
        echo "2. Create a Cloud Sandbox"
        echo "3. Generate an Api Key"
        echo ""
        read -p "Enter your Cua Api Key: " CUA_API_KEY

        if [[ -z "$CUA_API_KEY" ]]; then
            echo "❌ Cua Api Key is required for Cloud Sandbox."
            exit 1
        fi
    fi

    USE_CLOUD=true

elif [[ "$CHOICE" == "2" ]]; then
    # Local macOS VM setup
    echo ""
    echo "🖥️ Setting up local macOS VMs..."

    # Check for Apple Silicon Mac
    if [[ $(uname -s) != "Darwin" || $(uname -m) != "arm64" ]]; then
        echo "❌ Local macOS VMs require an Apple Silicon Mac (M1/M2/M3/M4)."
        echo "💡 Consider using Cua Cloud Sandbox instead (option 1)."
        exit 1
    fi

    # Check for macOS 15 (Sequoia) or newer
    # (version-sort the two strings; if the smallest is not 15.0, OSVERSION < 15.0)
    OSVERSION=$(sw_vers -productVersion)
    if [[ $(echo "$OSVERSION 15.0" | tr " " "\n" | sort -V | head -n 1) != "15.0" ]]; then
        echo "❌ Local macOS VMs require macOS 15 (Sequoia) or newer. You have $OSVERSION."
        echo "💡 Consider using Cua Cloud Sandbox instead (option 1)."
        exit 1
    fi

    USE_CLOUD=false
else
    echo "❌ Invalid choice. Please run the script again and choose 1 or 2."
    exit 1
fi

# Install Lume if not already installed (only for local VMs)
if [[ "$USE_CLOUD" == "false" ]]; then
    if ! command -v lume &> /dev/null; then
        echo "📦 Installing Lume CLI..."
        curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh | bash

        # Add lume to PATH for this session if it's not already there
        if ! command -v lume &> /dev/null; then
            export PATH="$PATH:$HOME/.local/bin"
        fi
    fi

    # Pull the macOS CUA image if not already present
    if ! lume ls | grep -q "macos-sequoia-cua"; then
        # Check available disk space
        IMAGE_SIZE_GB=30
        AVAILABLE_SPACE_KB=$(df -k $HOME | tail -1 | awk '{print $4}')
        AVAILABLE_SPACE_GB=$(($AVAILABLE_SPACE_KB / 1024 / 1024))

        echo "📊 The macOS CUA image will use approximately ${IMAGE_SIZE_GB}GB of disk space."
        echo "   You currently have ${AVAILABLE_SPACE_GB}GB available on your system."

        # Prompt for confirmation
        read -p "   Continue? [y]/n: " CONTINUE
        CONTINUE=${CONTINUE:-y}

        if [[ $CONTINUE =~ ^[Yy]$ ]]; then
            echo "📥 Pulling macOS CUA image (this may take a while)..."
            lume pull macos-sequoia-cua:latest
        else
            echo "❌ Installation cancelled."
            exit 1
        fi
    fi
fi

# Create a Python virtual environment
echo "🐍 Setting up Python environment..."

# Try different Python commands in order of preference
PYTHON_CMD=""
for cmd in python3.11 python3 python; do
    if command -v $cmd &> /dev/null; then
        # Check this Python version
        PYTHON_VERSION=$($cmd --version 2>&1 | cut -d" " -f2)
        PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1)
        PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2)

        if [ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -eq 11 ]; then
            PYTHON_CMD=$cmd
            echo "✅ Found suitable Python: $cmd (version $PYTHON_VERSION)"
            break
        elif [ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -gt 11 ]; then
            PYTHON_CMD=$cmd
            PYTHON_TOO_NEW=true
            echo "⚠️ Found $cmd (version $PYTHON_VERSION) but only Python 3.11.x is supported."
            break
        else
            echo "⚠️ Found $cmd (version $PYTHON_VERSION) but it's too old, trying next..."
        fi
    fi
done

# If no suitable Python was found, or if Python is too new, offer to exit or continue
if [ -z "$PYTHON_CMD" ] || [ "$PYTHON_TOO_NEW" = true ]; then
    OS_TYPE=$(uname -s)
    if [ "$PYTHON_TOO_NEW" = true ]; then
        echo -e "\n❌ Python version $PYTHON_VERSION detected. Only Python 3.11.x is supported. Newer versions (e.g., 3.12+) are not yet supported."
    else
        if [[ "$OS_TYPE" == "Darwin" ]]; then
            # NOTE(review): "[email protected]" below looks like an email-obfuscation
            # artifact of the dump tool; the original likely reads "python@3.11".
            # Confirm against the source repository before relying on this text.
            echo -e "\n❌ python3.11 not found. To continue, we recommend running this:\n\n    $ brew install [email protected]\n"
        elif [[ "$OS_TYPE" == "MINGW"* || "$OS_TYPE" == "CYGWIN"* || "$OS_TYPE" == "MSYS"* ]]; then
            echo -e "\n❌ python3.11 not found. Please install Python 3.11 from https://www.python.org/downloads/\n"
        else
            echo -e "\n❌ python3.11 not found. Please install Python 3.11 from your package manager or https://www.python.org/downloads/\n"
        fi
    fi
    while true; do
        echo "Would you like to exit so you can install Python 3.11, or continue anyway? (e = exit, c = continue): "
        read -n 1 -r PYTHON_CONT_CHOICE
        echo
        if [[ "$PYTHON_CONT_CHOICE" =~ ^[Ee]$ ]]; then
            echo "Exiting so you can install Python 3.11."
            exit 1
        elif [[ "$PYTHON_CONT_CHOICE" =~ ^[Cc]$ ]]; then
            echo "⚠️ Continuing without Python 3.11. Some features may not work as expected."
            break
        else
            echo "Please enter 'e' to exit or 'c' to continue."
        fi
    done
fi

# Create a virtual environment
if [ ! -d "$VENV_DIR" ]; then
    $PYTHON_CMD -m venv "$VENV_DIR"
fi

# Activate the virtual environment
source "$VENV_DIR/bin/activate"

# Install required packages
echo "📦 Updating Cua packages..."
pip install -U pip setuptools wheel Cmake
pip install -U cua-computer "cua-agent[all]"

# Create a simple demo script
mkdir -p "$DEMO_DIR"

# Create .env.local file with API keys (only if it doesn't exist)
if [[ ! -f "$DEMO_DIR/.env.local" ]]; then
    cat > "$DEMO_DIR/.env.local" << EOF
# Uncomment and add your API keys here
# OPENAI_API_KEY=your_openai_api_key_here
# ANTHROPIC_API_KEY=your_anthropic_api_key_here
CUA_API_KEY=your_cua_api_key_here
EOF
    echo "📝 Created .env.local file with API key placeholders"
else
    echo "📝 Found existing .env.local file - keeping your current settings"
fi

if [[ "$USE_CLOUD" == "true" ]]; then
    # Add CUA API key to .env.local if not already present
    if ! grep -q "CUA_API_KEY" "$DEMO_DIR/.env.local"; then
        echo "CUA_API_KEY=$CUA_API_KEY" >> "$DEMO_DIR/.env.local"
        echo "🔑 Added CUA_API_KEY to .env.local"
    elif grep -q "CUA_API_KEY=your_cua_api_key_here" "$DEMO_DIR/.env.local"; then
        # Update placeholder with actual key
        # NOTE(review): sed pattern would break if the key contains "/" — confirm keys never do.
        sed -i.bak "s/CUA_API_KEY=your_cua_api_key_here/CUA_API_KEY=$CUA_API_KEY/" "$DEMO_DIR/.env.local"
        echo "🔑 Updated CUA_API_KEY in .env.local"
    fi
fi

# Create a convenience script to run the demo
# (unquoted EOF: $VENV_DIR/$DEMO_DIR are expanded now, baked into the script)
cat > "$DEMO_DIR/start_ui.sh" << EOF
#!/bin/bash
source "$VENV_DIR/bin/activate"
cd "$DEMO_DIR"
python run_demo.py
EOF

chmod +x "$DEMO_DIR/start_ui.sh"

echo "✅ Setup complete!"
if [[ "$USE_CLOUD" == "true" ]]; then
    # Create run_demo.py for cloud sandbox
    # (quoted 'EOF': heredoc body is written literally, no shell expansion)
    cat > "$DEMO_DIR/run_demo.py" << 'EOF'
import asyncio
import os
from pathlib import Path
from dotenv import load_dotenv

from computer import Computer
from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
from agent.ui.gradio.ui_components import create_gradio_ui

# Load environment variables from .env.local
load_dotenv(Path(__file__).parent / ".env.local")

# Check for required API keys
cua_api_key = os.environ.get("CUA_API_KEY", "")
if not cua_api_key:
    print("\n❌ CUA_API_KEY not found in .env.local file.")
    print("Please add your CUA API key to the .env.local file.")
    exit(1)

openai_key = os.environ.get("OPENAI_API_KEY", "")
anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "")

if not openai_key and not anthropic_key:
    print("\n⚠️ No OpenAI or Anthropic API keys found in .env.local.")
    print("Please add at least one API key to use AI agents.")

print("🚀 Starting CUA playground with Cloud Sandbox...")
print("📝 Edit .env.local to update your API keys")

# Launch the Gradio UI and open it in the browser
app = create_gradio_ui()
app.launch(share=False, inbrowser=True)
EOF
else
    # Create run_demo.py for local macOS VMs
    cat > "$DEMO_DIR/run_demo.py" << 'EOF'
import asyncio
import os
from pathlib import Path
from dotenv import load_dotenv

from computer import Computer
from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
from agent.ui.gradio.ui_components import create_gradio_ui

# Load environment variables from .env.local
load_dotenv(Path(__file__).parent / ".env.local")

# Try to load API keys from environment
openai_key = os.environ.get("OPENAI_API_KEY", "")
anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "")

if not openai_key and not anthropic_key:
    print("\n⚠️ No OpenAI or Anthropic API keys found in .env.local.")
    print("Please add at least one API key to use AI agents.")

print("🚀 Starting CUA playground with local macOS VMs...")
print("📝 Edit .env.local to update your API keys")

# Launch the Gradio UI and open it in the browser
app = create_gradio_ui()
app.launch(share=False, inbrowser=True)
EOF
fi

# NOTE(review): this "Cloud Sandbox" completion message is printed for BOTH
# setup paths (cloud and local) — possibly misleading for the local path.
echo "☁️ CUA Cloud Sandbox setup complete!"
echo "📝 Edit $DEMO_DIR/.env.local to update your API keys"
echo "🖥️ Start the playground by running: $DEMO_DIR/start_ui.sh"

# Check if the VM is running (only for local setup)
if [[ "$USE_CLOUD" == "false" ]]; then
    echo "🔍 Checking if the macOS CUA VM is running..."
    VM_RUNNING=$(lume ls | grep "macos-sequoia-cua" | grep "running" || echo "")

    if [ -z "$VM_RUNNING" ]; then
        echo "🚀 Starting the macOS CUA VM in the background..."
        lume run macos-sequoia-cua:latest &

        # Wait a moment for the VM to initialize
        sleep 5
        echo "✅ VM started successfully."
    else
        echo "✅ macOS CUA VM is already running."
    fi
fi

# Ask if the user wants to start the demo now
echo
read -p "Would you like to start the Cua Computer-Use Agent UI now? (y/n) " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
    echo "🚀 Starting the Cua Computer-Use Agent UI..."
    echo ""
    "$DEMO_DIR/start_ui.sh"
fi
```

--------------------------------------------------------------------------------
/libs/python/som/som/visualization.py:
--------------------------------------------------------------------------------

```python
from typing import List, Dict, Any, Tuple
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import supervision as sv
import platform
import os
import logging

logger = logging.getLogger(__name__)


class BoxAnnotator:
    """Class for drawing bounding boxes and labels on images."""

    def __init__(self):
        """Initialize the box annotator with a color palette."""
        # WCAG 2.1 compliant color palette optimized for accessibility
        self.colors = [
            "#2E7D32", # Green
            "#C62828", # Red
            "#1565C0", # Blue
            "#6A1B9A", # Purple
            "#EF6C00", # Orange
            "#283593", # Indigo
            "#4527A0", # Deep Purple
            "#00695C", # Teal
            "#D84315", # Deep Orange
            "#1B5E20", # Dark Green
            "#B71C1C", # Dark Red
            "#0D47A1", # Dark Blue
            "#4A148C", # Dark Purple
            "#E65100", # Dark Orange
            "#1A237E", # Dark Indigo
            "#311B92", # Darker Purple
            "#004D40", # Dark Teal
            "#BF360C", # Darker Orange
            "#33691E", # Darker Green
            "#880E4F", # Pink
        ]
        # Rotating index into self.colors, advanced by _get_next_color().
        self.color_index = 0
        # Path to a usable TrueType font, or None if only the PIL default is available.
        self.default_font = None
        self._initialize_font()

    def _initialize_font(self) -> None:
        """Initialize the default font."""
        # Try to load a system font first
        system = platform.system()
        font_paths = []

        if system == "Darwin":  # macOS
            font_paths = [
                "/System/Library/Fonts/Helvetica.ttc",
                "/System/Library/Fonts/Arial.ttf",
                "/Library/Fonts/Arial.ttf",
            ]
        elif system == "Linux":
            font_paths = [
                "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
                "/usr/share/fonts/TTF/DejaVuSans.ttf",
                "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
            ]
        else:  # Windows
            font_paths = ["C:\\Windows\\Fonts\\arial.ttf"]

        # Try each font path; keep the first one that both loads and renders.
        for font_path in font_paths:
            if os.path.exists(font_path):
                try:
                    # Test the font with a small size
                    test_font = ImageFont.truetype(font_path, 12)
                    # Test if the font can render text
                    test_font.getbbox("1")
                    self.default_font = font_path
                    return
                except Exception:
                    continue

    def _get_next_color(self) -> str:
        """Get the next color from the palette."""
        color = self.colors[self.color_index]
        self.color_index = (self.color_index + 1) % len(self.colors)
        return color

    def _hex_to_rgb(self, hex_color: str) -> Tuple[int, int, int]:
        """Convert hex color to RGB tuple."""
        hex_color = hex_color.lstrip("#")
        # Create explicit tuple of 3 integers to match the return type
        r = int(hex_color[0:2], 16)
        g = int(hex_color[2:4], 16)
        b = int(hex_color[4:6], 16)
        return (r, g, b)

    def draw_boxes(
        self, image: Image.Image, detections: List[Dict[str, Any]], draw_config: Dict[str, Any]
    ) -> Image.Image:
        """Draw bounding boxes and labels on the image.

        Each detection's "bbox" is normalized [x1, y1, x2, y2] in 0..1,
        scaled here to pixel coordinates. Labels are sequential 1-based
        indices placed to avoid overlapping boxes and other labels.

        NOTE(review): `draw_config` is never read in this implementation.
        Mutates and returns the same `image` object.
        """
        draw = ImageDraw.Draw(image)

        # Create smaller font while keeping contrast
        try:
            if self.default_font:
                font = ImageFont.truetype(self.default_font, size=12)  # Reduced from 16 to 12
            else:
                # If no TrueType font available, use default
                font = ImageFont.load_default()
        except Exception:
            font = ImageFont.load_default()

        padding = 2  # Reduced padding for smaller overall box
        spacing = 1  # Reduced spacing between elements

        # Keep track of used label areas to check for collisions
        used_areas = []

        # Store label information for third pass
        labels_to_draw = []

        # First pass: Initialize used_areas with all bounding boxes
        for detection in detections:
            box = detection["bbox"]
            x1, y1, x2, y2 = [
                int(coord * dim) for coord, dim in zip(box, [image.width, image.height] * 2)
            ]
            used_areas.append((x1, y1, x2, y2))

        # Second pass: Draw all bounding boxes
        for idx, detection in enumerate(detections, 1):
            # Get box coordinates
            box = detection["bbox"]
            x1, y1, x2, y2 = [
                int(coord * dim) for coord, dim in zip(box, [image.width, image.height] * 2)
            ]

            # Get color for this detection
            color = self._get_next_color()
            rgb_color = self._hex_to_rgb(color)

            # Draw bounding box with original width
            draw.rectangle(((x1, y1), (x2, y2)), outline=rgb_color, width=2)

            # Use detection number as label
            label = str(idx)

            # Get text dimensions using getbbox
            bbox = font.getbbox(label)
            text_width = bbox[2] - bbox[0]
            text_height = bbox[3] - bbox[1]

            # Create box dimensions with padding
            box_width = text_width + (padding * 2)  # Removed multiplier for tighter box
            box_height = text_height + (padding * 2)  # Removed multiplier for tighter box

            def is_inside_bbox(x, y):
                """Check if a label box would be inside the bounding box."""
                return x >= x1 and x + box_width <= x2 and y >= y1 and y + box_height <= y2

            # Try different positions until we find one without collision
            positions = [
                # Top center (above bbox)
                lambda: (x1 + ((x2 - x1) - box_width) // 2, y1 - box_height - spacing),
                # Bottom center (below bbox)
                lambda: (x1 + ((x2 - x1) - box_width) // 2, y2 + spacing),
                # Right center (right of bbox)
                lambda: (x2 + spacing, y1 + ((y2 - y1) - box_height) // 2),
                # Left center (left of bbox)
                lambda: (x1 - box_width - spacing, y1 + ((y2 - y1) - box_height) // 2),
                # Top right (outside corner)
                lambda: (x2 + spacing, y1 - box_height - spacing),
                # Top left (outside corner)
                lambda: (x1 - box_width - spacing, y1 - box_height - spacing),
                # Bottom right (outside corner)
                lambda: (x2 + spacing, y2 + spacing),
                # Bottom left (outside corner)
                lambda: (x1 - box_width - spacing, y2 + spacing),
            ]

            def check_occlusion(x, y):
                """Check if a label box occludes any existing ones or is inside bbox."""
                # First check if it's inside the bounding box
                if is_inside_bbox(x, y):
                    return True

                # Then check collision with other labels
                new_box = (x, y, x + box_width, y + box_height)
                label_width = new_box[2] - new_box[0]
                label_height = new_box[3] - new_box[1]

                for used_box in used_areas:
                    if not (
                        new_box[2] < used_box[0]  # new box is left of used box
                        or new_box[0] > used_box[2]  # new box is right of used box
                        or new_box[3] < used_box[1]  # new box is above used box
                        or new_box[1] > used_box[3]  # new box is below used box
                    ):
                        # Calculate dimensions of the used box
                        used_box_width = used_box[2] - used_box[0]
                        used_box_height = used_box[3] - used_box[1]

                        # Only consider as collision if used box is NOT more than 5x bigger in both dimensions
                        if not (used_box_width > 5 * label_width and used_box_height > 5 * label_height):
                            return True
                return False

            # Try each position until we find one without collision
            label_x = None
            label_y = None

            for get_pos in positions:
                x, y = get_pos()
                # Ensure position is within image bounds
                if x < 0 or y < 0 or x + box_width > image.width or y + box_height > image.height:
                    continue
                if not check_occlusion(x, y):
                    label_x = x
                    label_y = y
                    break

            # If all positions collide or are out of bounds, find the best possible position
            if label_x is None:
                # Try to place it in the nearest valid position outside the bbox
                best_pos = positions[0]()  # Default to top center
                label_x = max(0, min(image.width - box_width, best_pos[0]))
                label_y = max(0, min(image.height - box_height, best_pos[1]))

                # Ensure it's not inside the bounding box
                if is_inside_bbox(label_x, label_y):
                    # Force it above the bounding box
                    label_y = max(0, y1 - box_height - spacing)

            # Add this label area to used areas
            if (
                label_x is not None
                and label_y is not None
                and box_width is not None
                and box_height is not None
            ):
                used_areas.append((label_x, label_y, label_x + box_width, label_y + box_height))

            # Store label information for second pass
            labels_to_draw.append(
                {
                    "label": label,
                    "x": label_x,
                    "y": label_y,
                    "width": box_width,
                    "height": box_height,
                    "text_width": text_width,
                    "text_height": text_height,
                    "color": rgb_color,
                }
            )

        # Third pass: Draw all labels on top
        for label_info in labels_to_draw:
            # Draw background box with white outline
            draw.rectangle(
                (
                    (label_info["x"] - 1, label_info["y"] - 1),
                    (
                        label_info["x"] + label_info["width"] + 1,
                        label_info["y"] + label_info["height"] + 1,
                    ),
                ),
                outline="white",
                width=2,
            )
            draw.rectangle(
                (
                    (label_info["x"], label_info["y"]),
                    (label_info["x"] + label_info["width"], label_info["y"] + label_info["height"]),
                ),
                fill=label_info["color"],
            )

            # Center text in box
            text_x = label_info["x"] + (label_info["width"] - label_info["text_width"]) // 2
            text_y = label_info["y"] + (label_info["height"] - label_info["text_height"]) // 2

            # Draw text with black outline for better visibility
            # NOTE(review): only the 4 diagonal offsets are drawn (dx, dy both
            # nonzero), so the outline has gaps on the axes — confirm intent.
            outline_width = 1
            for dx in [-outline_width, outline_width]:
                for dy in [-outline_width, outline_width]:
                    draw.text(
                        (text_x + dx, text_y + dy), label_info["label"], fill="black", font=font
                    )

            # Draw the main white text
            draw.text((text_x, text_y), label_info["label"], fill=(255, 255, 255), font=font)

        logger.info("Finished drawing all boxes")
        return image
```