This is page 9 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── 
windows-sandbox.md ├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx 
│ │ │ │ ├── installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── 
som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ │ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift 
│ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py 
│ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ ├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ 
│ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ 
├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py │ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── 
pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /libs/kasm/src/ubuntu/install/firefox/install_firefox.sh: -------------------------------------------------------------------------------- ```bash 1 | #!/usr/bin/env bash 2 | set -xe 3 | 4 | # Add icon 5 | if [ -f /dockerstartup/install/ubuntu/install/firefox/firefox.desktop ]; then 6 | mv /dockerstartup/install/ubuntu/install/firefox/firefox.desktop $HOME/Desktop/ 7 | fi 8 | 9 | ARCH=$(arch | sed 's/aarch64/arm64/g' | sed 's/x86_64/amd64/g') 10 | 11 | set_desktop_icon() { 12 | sed -i -e 
's!Icon=.\+!Icon=/usr/share/icons/hicolor/48x48/apps/firefox.png!' "$HOME/Desktop/firefox.desktop" 13 | } 14 | 15 | echo "Install Firefox" 16 | if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|fedora39|fedora40) ]]; then 17 | dnf install -y firefox p11-kit 18 | elif [ "${DISTRO}" == "opensuse" ]; then 19 | zypper install -yn p11-kit-tools MozillaFirefox 20 | elif grep -q Jammy /etc/os-release || grep -q Noble /etc/os-release; then 21 | if [ ! -f '/etc/apt/preferences.d/mozilla-firefox' ]; then 22 | add-apt-repository -y ppa:mozillateam/ppa 23 | echo ' 24 | Package: * 25 | Pin: release o=LP-PPA-mozillateam 26 | Pin-Priority: 1001 27 | ' > /etc/apt/preferences.d/mozilla-firefox 28 | fi 29 | apt-get install -y firefox p11-kit-modules 30 | elif grep -q "ID=kali" /etc/os-release; then 31 | apt-get update 32 | apt-get install -y firefox-esr p11-kit-modules 33 | rm -f $HOME/Desktop/firefox.desktop 34 | cp \ 35 | /usr/share/applications/firefox-esr.desktop \ 36 | $HOME/Desktop/ 37 | chmod +x $HOME/Desktop/firefox-esr.desktop 38 | elif grep -q "ID=debian" /etc/os-release || grep -q "ID=parrot" /etc/os-release; then 39 | if [ "${ARCH}" == "amd64" ]; then 40 | install -d -m 0755 /etc/apt/keyrings 41 | wget -q https://packages.mozilla.org/apt/repo-signing-key.gpg -O- > /etc/apt/keyrings/packages.mozilla.org.asc 42 | echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" > /etc/apt/sources.list.d/mozilla.list 43 | echo ' 44 | Package: * 45 | Pin: origin packages.mozilla.org 46 | Pin-Priority: 1000 47 | ' > /etc/apt/preferences.d/mozilla 48 | apt-get update 49 | apt-get install -y firefox p11-kit-modules 50 | else 51 | apt-get update 52 | apt-get install -y firefox-esr p11-kit-modules 53 | rm -f $HOME/Desktop/firefox.desktop 54 | cp \ 55 | /usr/share/applications/firefox-esr.desktop \ 56 | $HOME/Desktop/ 57 | chmod +x $HOME/Desktop/firefox-esr.desktop 58 | fi 59 | else 60 | 
apt-mark unhold firefox || : 61 | apt-get remove firefox 62 | apt-get update 63 | apt-get install -y firefox p11-kit-modules 64 | fi 65 | 66 | # Add Langpacks 67 | FIREFOX_VERSION=$(curl -sI https://download.mozilla.org/?product=firefox-latest | awk -F '(releases/|/win32)' '/Location/ {print $2}') 68 | RELEASE_URL="https://releases.mozilla.org/pub/firefox/releases/${FIREFOX_VERSION}/win64/xpi/" 69 | LANGS=$(curl -Ls ${RELEASE_URL} | awk -F '(xpi">|</a>)' '/href.*xpi/ {print $2}' | tr '\n' ' ') 70 | EXTENSION_DIR=/usr/lib/firefox-addons/distribution/extensions/ 71 | mkdir -p ${EXTENSION_DIR} 72 | for LANG in ${LANGS}; do 73 | LANGCODE=$(echo ${LANG} | sed 's/\.xpi//g') 74 | echo "Downloading ${LANG} Language pack" 75 | curl -o \ 76 | ${EXTENSION_DIR}langpack-${LANGCODE}@firefox.mozilla.org.xpi -Ls \ 77 | ${RELEASE_URL}${LANG} 78 | done 79 | 80 | # Cleanup and install flash if supported 81 | if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|fedora39|fedora40) ]]; then 82 | if [ -z ${SKIP_CLEAN+x} ]; then 83 | dnf clean all 84 | fi 85 | elif [ "${DISTRO}" == "opensuse" ]; then 86 | if [ -z ${SKIP_CLEAN+x} ]; then 87 | zypper clean --all 88 | fi 89 | else 90 | if [ "$ARCH" == "arm64" ] && [ "$(lsb_release -cs)" == "focal" ] ; then 91 | echo "Firefox flash player not supported on arm64 Ubuntu Focal Skipping" 92 | elif grep -q "ID=debian" /etc/os-release || grep -q "ID=kali" /etc/os-release || grep -q "ID=parrot" /etc/os-release; then 93 | echo "Firefox flash player not supported on Debian" 94 | elif grep -q Focal /etc/os-release; then 95 | # Plugin to support running flash videos for sites like vimeo 96 | apt-get update 97 | apt-get install -y browser-plugin-freshplayer-pepperflash 98 | apt-mark hold firefox 99 | if [ -z ${SKIP_CLEAN+x} ]; then 100 | apt-get autoclean 101 | rm -rf \ 102 | /var/lib/apt/lists/* \ 103 | /var/tmp/* 104 | fi 105 | fi 106 | fi 107 | 108 | if [[ "${DISTRO}" != 
@(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then 109 | # Update firefox to utilize the system certificate store instead of the one that ships with firefox 110 | if grep -q "ID=debian" /etc/os-release || grep -q "ID=kali" /etc/os-release || grep -q "ID=parrot" /etc/os-release && [ "${ARCH}" == "arm64" ]; then 111 | rm -f /usr/lib/firefox-esr/libnssckbi.so 112 | ln /usr/lib/$(arch)-linux-gnu/pkcs11/p11-kit-trust.so /usr/lib/firefox-esr/libnssckbi.so 113 | elif grep -q "ID=kali" /etc/os-release && [ "${ARCH}" == "amd64" ]; then 114 | rm -f /usr/lib/firefox-esr/libnssckbi.so 115 | ln /usr/lib/$(arch)-linux-gnu/pkcs11/p11-kit-trust.so /usr/lib/firefox-esr/libnssckbi.so 116 | else 117 | rm -f /usr/lib/firefox/libnssckbi.so 118 | ln /usr/lib/$(arch)-linux-gnu/pkcs11/p11-kit-trust.so /usr/lib/firefox/libnssckbi.so 119 | fi 120 | fi 121 | 122 | if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|fedora39|fedora40) ]]; then 123 | if [[ "${DISTRO}" == @(fedora39|fedora40) ]]; then 124 | preferences_file=/usr/lib64/firefox/browser/defaults/preferences/firefox-redhat-default-prefs.js 125 | else 126 | preferences_file=/usr/lib64/firefox/browser/defaults/preferences/all-redhat.js 127 | fi 128 | sed -i -e '/homepage/d' "$preferences_file" 129 | elif [ "${DISTRO}" == "opensuse" ]; then 130 | preferences_file=/usr/lib64/firefox/browser/defaults/preferences/firefox.js 131 | elif grep -q "ID=kali" /etc/os-release; then 132 | preferences_file=/usr/lib/firefox-esr/defaults/pref/firefox.js 133 | elif grep -q "ID=debian" /etc/os-release || grep -q "ID=parrot" /etc/os-release; then 134 | if [ "${ARCH}" == "amd64" ]; then 135 | preferences_file=/usr/lib/firefox/defaults/pref/firefox.js 136 | else 137 | preferences_file=/usr/lib/firefox-esr/defaults/pref/firefox.js 138 | fi 139 | else 140 | preferences_file=/usr/lib/firefox/browser/defaults/preferences/firefox.js 141 | fi 142 | 143 | # Disabling 
default first run URL for Debian based images 144 | if [[ "${DISTRO}" != @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then 145 | cat >"$preferences_file" <<EOF 146 | pref("datareporting.policy.firstRunURL", ""); 147 | pref("datareporting.policy.dataSubmissionEnabled", false); 148 | pref("datareporting.healthreport.service.enabled", false); 149 | pref("datareporting.healthreport.uploadEnabled", false); 150 | pref("trailhead.firstrun.branches", "nofirstrun-empty"); 151 | pref("browser.aboutwelcome.enabled", false); 152 | EOF 153 | fi 154 | 155 | if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then 156 | # Creating a default profile 157 | chown -R root:root $HOME 158 | firefox -headless -CreateProfile "kasm $HOME/.mozilla/firefox/kasm" 159 | # Generate a certdb to be detected on squid start 160 | HOME=/root firefox --headless & 161 | mkdir -p /root/.mozilla 162 | CERTDB=$(find /root/.mozilla* -name "cert9.db") 163 | while [ -z "${CERTDB}" ] ; do 164 | sleep 1 165 | echo "waiting for certdb" 166 | CERTDB=$(find /root/.mozilla* -name "cert9.db") 167 | done 168 | sleep 2 169 | kill $(pgrep firefox) 170 | CERTDIR=$(dirname ${CERTDB}) 171 | mv ${CERTDB} $HOME/.mozilla/firefox/kasm/ 172 | rm -Rf /root/.mozilla 173 | else 174 | # Creating Default Profile 175 | chown -R 0:0 $HOME 176 | firefox -headless -CreateProfile "kasm $HOME/.mozilla/firefox/kasm" 177 | fi 178 | 179 | # Silence Firefox security nag "Some of Firefox's features may offer less protection on your current operating system". 
180 | echo 'user_pref("security.sandbox.warn_unprivileged_namespaces", false);' > $HOME/.mozilla/firefox/kasm/user.js 181 | chown 1000:1000 $HOME/.mozilla/firefox/kasm/user.js 182 | 183 | if [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then 184 | set_desktop_icon 185 | fi 186 | 187 | # Starting with version 67, Firefox creates a unique profile mapping per installation which is hash generated 188 | # based off the installation path. Because that path will be static for our deployments we can assume the hash 189 | # and thus assign our profile to the default for the installation 190 | if grep -q "ID=kali" /etc/os-release; then 191 | cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL 192 | [Install3B6073811A6ABF12] 193 | Default=kasm 194 | Locked=1 195 | EOL 196 | elif grep -q "ID=debian" /etc/os-release || grep -q "ID=parrot" /etc/os-release; then 197 | if [ "${ARCH}" != "amd64" ]; then 198 | cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL 199 | [Install3B6073811A6ABF12] 200 | Default=kasm 201 | Locked=1 202 | EOL 203 | else 204 | cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL 205 | [Install4F96D1932A9F858E] 206 | Default=kasm 207 | Locked=1 208 | EOL 209 | fi 210 | elif [[ "${DISTRO}" != @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then 211 | cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL 212 | [Install4F96D1932A9F858E] 213 | Default=kasm 214 | Locked=1 215 | EOL 216 | elif [[ "${DISTRO}" == @(oracle8|rockylinux9|rockylinux8|oracle9|rhel9|almalinux9|almalinux8|opensuse|fedora39|fedora40) ]]; then 217 | cat >>$HOME/.mozilla/firefox/profiles.ini <<EOL 218 | [Install11457493C5A56847] 219 | Default=kasm 220 | Locked=1 221 | EOL 222 | fi 223 | 224 | # Desktop Icon FIxes 225 | if [[ "${DISTRO}" == @(rockylinux9|oracle9|rhel9|almalinux9|fedora39|fedora40) ]]; then 226 | sed -i 's#Icon=/usr/lib/firefox#Icon=/usr/lib64/firefox#g' 
$HOME/Desktop/firefox.desktop 227 | fi 228 | 229 | # Cleanup for app layer 230 | chown -R 1000:0 $HOME 231 | find /usr/share/ -name "icon-theme.cache" -exec rm -f {} \; 232 | if [ -f $HOME/Desktop/firefox.desktop ]; then 233 | chmod +x $HOME/Desktop/firefox.desktop 234 | fi 235 | chown -R 1000:1000 $HOME/.mozilla 236 | 237 | ``` -------------------------------------------------------------------------------- /docs/content/docs/quickstart-devs.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Quickstart 3 | description: Get started with Cua in three steps 4 | icon: Rocket 5 | --- 6 | 7 | import { Step, Steps } from 'fumadocs-ui/components/steps'; 8 | import { Tab, Tabs } from 'fumadocs-ui/components/tabs'; 9 | 10 | This quickstart guides you through setting up your [computer environment](#set-up-your-computer-environment), programmatic control with a [Cua computer](#using-computer), and task automation with a [Cua agent](#using-agent): 11 | 12 | <Steps> 13 | 14 | <Step> 15 | 16 | ## Set Up Your Computer Environment 17 | 18 | Choose how you want to run your Cua computer. This will be the environment where your automated tasks will execute. 19 | 20 | You can run your Cua computer in the cloud (recommended for easiest setup), locally on macOS with Lume, locally on Windows with a Windows Sandbox, or in a Docker container on any platform. Choose the option that matches your system and needs. 21 | 22 | <Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox']}> 23 | <Tab value="☁️ Cloud"> 24 | 25 | Cua Cloud Sandbox provides virtual machines that run Ubuntu. 26 | 27 | 1. Go to [trycua.com/signin](https://www.trycua.com/signin) 28 | 2. Navigate to **Dashboard > Containers > Create Instance** 29 | 3. Create a **Medium, Ubuntu 22** sandbox 30 | 4. Note your sandbox name and API key 31 | 32 | Your Cloud Sandbox will be automatically configured and ready to use. 
33 | 34 | </Tab> 35 | <Tab value="🍎 Lume"> 36 | 37 | Lume containers are macOS virtual machines that run on a macOS host machine. 38 | 39 | 1. Install the Lume CLI: 40 | 41 | ```bash 42 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" 43 | ``` 44 | 45 | 2. Start a local Cua sandbox: 46 | 47 | ```bash 48 | lume run macos-sequoia-cua:latest 49 | ``` 50 | 51 | </Tab> 52 | <Tab value="🪟 Windows Sandbox"> 53 | 54 | Windows Sandbox provides Windows virtual environments that run on a Windows host machine. 55 | 56 | 1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install) (requires Windows 10 Pro/Enterprise or Windows 11) 57 | 2. Install the `pywinsandbox` dependency: 58 | 59 | ```bash 60 | pip install -U git+https://github.com/karkason/pywinsandbox.git 61 | ``` 62 | 63 | 3. Windows Sandbox will be automatically configured when you run the CLI 64 | 65 | </Tab> 66 | <Tab value="🐳 Docker"> 67 | 68 | Docker provides a way to run Ubuntu containers on any host machine. 69 | 70 | 1. Install Docker Desktop or Docker Engine: 71 | 72 | 2. Pull the CUA Ubuntu sandbox: 73 | 74 | ```bash 75 | docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest 76 | ``` 77 | 78 | </Tab> 79 | </Tabs> 80 | 81 | </Step> 82 | 83 | <Step> 84 | 85 | ## Using Computer 86 | 87 | Connect to your Cua computer and perform basic interactions, such as taking screenshots or simulating user input. 
88 | 89 | <Tabs items={['Python', 'TypeScript']}> 90 | <Tab value="Python"> 91 | Install the Cua computer Python SDK: 92 | ```bash 93 | pip install cua-computer 94 | ``` 95 | 96 | Then, connect to your desired computer environment: 97 | 98 | <Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox', '🖥️ Host Desktop']}> 99 | <Tab value="☁️ Cloud"> 100 | ```python 101 | from computer import Computer 102 | 103 | computer = Computer( 104 | os_type="linux", 105 | provider_type="cloud", 106 | name="your-sandbox-name", 107 | api_key="your-api-key" 108 | ) 109 | await computer.run() # Connect to the sandbox 110 | ``` 111 | </Tab> 112 | <Tab value="🍎 Lume"> 113 | ```python 114 | from computer import Computer 115 | 116 | computer = Computer( 117 | os_type="macos", 118 | provider_type="lume", 119 | name="macos-sequoia-cua:latest" 120 | ) 121 | await computer.run() # Launch & connect to the container 122 | ``` 123 | </Tab> 124 | <Tab value="🪟 Windows Sandbox"> 125 | ```python 126 | from computer import Computer 127 | 128 | computer = Computer( 129 | os_type="windows", 130 | provider_type="windows_sandbox" 131 | ) 132 | await computer.run() # Launch & connect to the container 133 | ``` 134 | </Tab> 135 | <Tab value="🐳 Docker"> 136 | ```python 137 | from computer import Computer 138 | 139 | computer = Computer( 140 | os_type="linux", 141 | provider_type="docker", 142 | name="trycua/cua-ubuntu:latest" 143 | ) 144 | await computer.run() # Launch & connect to the container 145 | ``` 146 | </Tab> 147 | <Tab value="🖥️ Host Desktop"> 148 | Install and run `cua-computer-server`: 149 | ```bash 150 | pip install cua-computer-server 151 | python -m computer_server 152 | ``` 153 | 154 | Then, use the `Computer` object to connect: 155 | ```python 156 | from computer import Computer 157 | 158 | computer = Computer(use_host_computer_server=True) 159 | await computer.run() # Connect to the host desktop 160 | ``` 161 | </Tab> 162 | </Tabs> 163 | 164 | Once connected, you can perform 
interactions: 165 | ```python 166 | try: 167 | # Take a screenshot of the computer's current display 168 | screenshot = await computer.interface.screenshot() 169 | # Simulate a left-click at coordinates (100, 100) 170 | await computer.interface.left_click(100, 100) 171 | # Type "Hello!" into the active application 172 | await computer.interface.type("Hello!") 173 | finally: 174 | await computer.close() 175 | ``` 176 | </Tab> 177 | <Tab value="TypeScript"> 178 | Install the Cua computer TypeScript SDK: 179 | ```bash 180 | npm install @trycua/computer 181 | ``` 182 | 183 | Then, connect to your desired computer environment: 184 | 185 | <Tabs items={['☁️ Cloud','🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox', '🖥️ Host Desktop']}> 186 | <Tab value="☁️ Cloud"> 187 | ```typescript 188 | import { Computer, OSType } from '@trycua/computer'; 189 | 190 | const computer = new Computer({ 191 | osType: OSType.LINUX, 192 | name: "your-sandbox-name", 193 | apiKey: "your-api-key" 194 | }); 195 | await computer.run(); // Connect to the sandbox 196 | ``` 197 | </Tab> 198 | <Tab value="🍎 Lume"> 199 | ```typescript 200 | import { Computer, OSType, ProviderType } from '@trycua/computer'; 201 | 202 | const computer = new Computer({ 203 | osType: OSType.MACOS, 204 | providerType: ProviderType.LUME, 205 | name: "macos-sequoia-cua:latest" 206 | }); 207 | await computer.run(); // Launch & connect to the container 208 | ``` 209 | </Tab> 210 | <Tab value="🪟 Windows Sandbox"> 211 | ```typescript 212 | import { Computer, OSType, ProviderType } from '@trycua/computer'; 213 | 214 | const computer = new Computer({ 215 | osType: OSType.WINDOWS, 216 | providerType: ProviderType.WINDOWS_SANDBOX 217 | }); 218 | await computer.run(); // Launch & connect to the container 219 | ``` 220 | </Tab> 221 | <Tab value="🐳 Docker"> 222 | ```typescript 223 | import { Computer, OSType, ProviderType } from '@trycua/computer'; 224 | 225 | const computer = new Computer({ 226 | osType: OSType.LINUX, 227 | providerType: 
ProviderType.DOCKER, 228 | name: "trycua/cua-ubuntu:latest" 229 | }); 230 | await computer.run(); // Launch & connect to the container 231 | ``` 232 | </Tab> 233 | <Tab value="🖥️ Host Desktop"> 234 | First, install and run `cua-computer-server`: 235 | ```bash 236 | pip install cua-computer-server 237 | python -m computer_server 238 | ``` 239 | 240 | Then, use the `Computer` object to connect: 241 | ```typescript 242 | import { Computer } from '@trycua/computer'; 243 | 244 | const computer = new Computer({ useHostComputerServer: true }); 245 | await computer.run(); // Connect to the host desktop 246 | ``` 247 | </Tab> 248 | </Tabs> 249 | 250 | Once connected, you can perform interactions: 251 | ```typescript 252 | try { 253 | // Take a screenshot of the computer's current display 254 | const screenshot = await computer.interface.screenshot(); 255 | // Simulate a left-click at coordinates (100, 100) 256 | await computer.interface.leftClick(100, 100); 257 | // Type "Hello!" into the active application 258 | await computer.interface.typeText("Hello!"); 259 | } finally { 260 | await computer.close(); 261 | } 262 | ``` 263 | </Tab> 264 | </Tabs> 265 | 266 | Learn more about computers in the [Cua computers documentation](/computer-sdk/computers). You will see how to automate computers with agents in the next step. 267 | 268 | </Step> 269 | 270 | <Step> 271 | 272 | ## Using Agent 273 | 274 | Utilize an Agent to automate complex tasks by providing it with a goal and allowing it to interact with the computer environment. 
275 | 276 | Install the Cua agent Python SDK: 277 | ```bash 278 | pip install "cua-agent[all]" 279 | ``` 280 | 281 | Then, use the `ComputerAgent` object: 282 | ```python 283 | from agent import ComputerAgent 284 | 285 | agent = ComputerAgent( 286 | model="anthropic/claude-3-5-sonnet-20241022", 287 | tools=[computer], 288 | max_trajectory_budget=5.0 289 | ) 290 | 291 | messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}] 292 | 293 | async for result in agent.run(messages): 294 | for item in result["output"]: 295 | if item["type"] == "message": 296 | print(item["content"][0]["text"]) 297 | ``` 298 | 299 | Learn more about agents in [Agent Loops](/agent-sdk/agent-loops) and available models in [Supported Models](/agent-sdk/supported-model-providers/). 300 | 301 | </Step> 302 | </Steps> 303 | 304 | ## Next Steps 305 | 306 | - Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands) 307 | - Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/) 308 | - Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help 309 | ``` -------------------------------------------------------------------------------- /libs/python/computer-server/test_connection.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python 2 | """ 3 | Connection test script for Computer Server. 4 | 5 | This script tests both WebSocket (/ws) and REST (/cmd) connections to the Computer Server 6 | and keeps it alive, allowing you to verify the server is running correctly. 
7 | """ 8 | 9 | import asyncio 10 | import json 11 | import websockets 12 | import argparse 13 | import sys 14 | import aiohttp 15 | import os 16 | 17 | import dotenv 18 | dotenv.load_dotenv() 19 | 20 | async def test_websocket_connection(host="localhost", port=8000, keep_alive=False, container_name=None, api_key=None): 21 | """Test WebSocket connection to the Computer Server.""" 22 | if container_name: 23 | # Container mode: use WSS with container domain and port 8443 24 | uri = f"wss://{container_name}.containers.cloud.trycua.com:8443/ws" 25 | print(f"Connecting to container {container_name} at {uri}...") 26 | else: 27 | # Local mode: use WS with specified host and port 28 | uri = f"ws://{host}:{port}/ws" 29 | print(f"Connecting to local server at {uri}...") 30 | 31 | try: 32 | async with websockets.connect(uri) as websocket: 33 | print("WebSocket connection established!") 34 | 35 | # If container connection, send authentication first 36 | if container_name: 37 | if not api_key: 38 | print("Error: API key required for container connections") 39 | return False 40 | 41 | print("Sending authentication...") 42 | auth_message = { 43 | "command": "authenticate", 44 | "params": { 45 | "api_key": api_key, 46 | "container_name": container_name 47 | } 48 | } 49 | await websocket.send(json.dumps(auth_message)) 50 | auth_response = await websocket.recv() 51 | print(f"Authentication response: {auth_response}") 52 | 53 | # Check if authentication was successful 54 | auth_data = json.loads(auth_response) 55 | if not auth_data.get("success", False): 56 | print("Authentication failed!") 57 | return False 58 | print("Authentication successful!") 59 | 60 | # Send a test command to get version 61 | await websocket.send(json.dumps({"command": "version", "params": {}})) 62 | response = await websocket.recv() 63 | print(f"Version response: {response}") 64 | 65 | # Send a test command to get screen size 66 | await websocket.send(json.dumps({"command": "get_screen_size", "params": {}})) 
67 | response = await websocket.recv() 68 | print(f"Screen size response: {response}") 69 | 70 | if keep_alive: 71 | print("\nKeeping WebSocket connection alive. Press Ctrl+C to exit...") 72 | while True: 73 | # Send a command every 5 seconds to keep the connection alive 74 | await asyncio.sleep(5) 75 | await websocket.send( 76 | json.dumps({"command": "get_cursor_position", "params": {}}) 77 | ) 78 | response = await websocket.recv() 79 | print(f"Cursor position: {response}") 80 | except websockets.exceptions.ConnectionClosed as e: 81 | print(f"WebSocket connection closed: {e}") 82 | return False 83 | except ConnectionRefusedError: 84 | print(f"Connection refused. Is the server running at {host}:{port}?") 85 | return False 86 | except Exception as e: 87 | print(f"WebSocket error: {e}") 88 | return False 89 | 90 | return True 91 | 92 | 93 | async def test_rest_connection(host="localhost", port=8000, keep_alive=False, container_name=None, api_key=None): 94 | """Test REST connection to the Computer Server.""" 95 | if container_name: 96 | # Container mode: use HTTPS with container domain and port 8443 97 | base_url = f"https://{container_name}.containers.cloud.trycua.com:8443" 98 | print(f"Connecting to container {container_name} at {base_url}...") 99 | else: 100 | # Local mode: use HTTP with specified host and port 101 | base_url = f"http://{host}:{port}" 102 | print(f"Connecting to local server at {base_url}...") 103 | 104 | try: 105 | async with aiohttp.ClientSession() as session: 106 | print("REST connection established!") 107 | 108 | # Prepare headers for container authentication 109 | headers = {} 110 | if container_name: 111 | if not api_key: 112 | print("Error: API key required for container connections") 113 | return False 114 | headers["X-Container-Name"] = container_name 115 | headers["X-API-Key"] = api_key 116 | print(f"Using container authentication headers") 117 | 118 | # Test screenshot endpoint 119 | async with session.post( 120 | f"{base_url}/cmd", 
121 | json={"command": "screenshot", "params": {}}, 122 | headers=headers 123 | ) as response: 124 | if response.status == 200: 125 | text = await response.text() 126 | print(f"Screenshot response: {text}") 127 | else: 128 | print(f"Screenshot request failed with status: {response.status}") 129 | print(await response.text()) 130 | return False 131 | 132 | # Test screen size endpoint 133 | async with session.post( 134 | f"{base_url}/cmd", 135 | json={"command": "get_screen_size", "params": {}}, 136 | headers=headers 137 | ) as response: 138 | if response.status == 200: 139 | text = await response.text() 140 | print(f"Screen size response: {text}") 141 | else: 142 | print(f"Screen size request failed with status: {response.status}") 143 | print(await response.text()) 144 | return False 145 | 146 | if keep_alive: 147 | print("\nKeeping REST connection alive. Press Ctrl+C to exit...") 148 | while True: 149 | # Send a command every 5 seconds to keep testing 150 | await asyncio.sleep(5) 151 | async with session.post( 152 | f"{base_url}/cmd", 153 | json={"command": "get_cursor_position", "params": {}}, 154 | headers=headers 155 | ) as response: 156 | if response.status == 200: 157 | text = await response.text() 158 | print(f"Cursor position: {text}") 159 | else: 160 | print(f"Cursor position request failed with status: {response.status}") 161 | print(await response.text()) 162 | return False 163 | 164 | except aiohttp.ClientError as e: 165 | print(f"REST connection error: {e}") 166 | return False 167 | except Exception as e: 168 | print(f"REST error: {e}") 169 | return False 170 | 171 | return True 172 | 173 | 174 | async def test_connection(host="localhost", port=8000, keep_alive=False, container_name=None, use_rest=False, api_key=None): 175 | """Test connection to the Computer Server using WebSocket or REST.""" 176 | if use_rest: 177 | return await test_rest_connection(host, port, keep_alive, container_name, api_key) 178 | else: 179 | return await 
test_websocket_connection(host, port, keep_alive, container_name, api_key) 180 | 181 | 182 | def parse_args(): 183 | parser = argparse.ArgumentParser(description="Test connection to Computer Server") 184 | parser.add_argument("--host", default="localhost", help="Host address (default: localhost)") 185 | parser.add_argument("-p", "--port", type=int, default=8000, help="Port number (default: 8000)") 186 | parser.add_argument("-c", "--container-name", help="Container name for cloud connection (uses WSS/HTTPS and port 8443)") 187 | parser.add_argument("--api-key", help="API key for container authentication (can also use CUA_API_KEY env var)") 188 | parser.add_argument("--keep-alive", action="store_true", help="Keep connection alive") 189 | parser.add_argument("--rest", action="store_true", help="Use REST endpoint (/cmd) instead of WebSocket (/ws)") 190 | return parser.parse_args() 191 | 192 | 193 | async def main(): 194 | args = parse_args() 195 | 196 | # Convert hyphenated argument to underscore for function parameter 197 | container_name = getattr(args, 'container_name', None) 198 | 199 | # Get API key from argument or environment variable 200 | api_key = getattr(args, 'api_key', None) or os.environ.get('CUA_API_KEY') 201 | 202 | # Check if container name is provided but API key is missing 203 | if container_name and not api_key: 204 | print("Warning: Container name provided but no API key found.") 205 | print("Please provide --api-key argument or set CUA_API_KEY environment variable.") 206 | return 1 207 | 208 | print(f"Testing {'REST' if args.rest else 'WebSocket'} connection...") 209 | if container_name: 210 | print(f"Container: {container_name}") 211 | print(f"API Key: {'***' + api_key[-4:] if api_key and len(api_key) > 4 else 'Not provided'}") 212 | 213 | success = await test_connection( 214 | host=args.host, 215 | port=args.port, 216 | keep_alive=args.keep_alive, 217 | container_name=container_name, 218 | use_rest=args.rest, 219 | api_key=api_key 220 | ) 221 | 
return 0 if success else 1 222 | 223 | 224 | if __name__ == "__main__": 225 | try: 226 | sys.exit(asyncio.run(main())) 227 | except KeyboardInterrupt: 228 | print("\nExiting...") 229 | sys.exit(0) 230 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/proxy/handlers.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Request handlers for the proxy endpoints. 3 | """ 4 | 5 | import asyncio 6 | import json 7 | import logging 8 | import os 9 | from contextlib import contextmanager 10 | from typing import Dict, Any, List, Union, Optional 11 | 12 | from ..agent import ComputerAgent 13 | from computer import Computer 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class ResponsesHandler: 19 | """Handler for /responses endpoint that processes agent requests.""" 20 | 21 | def __init__(self): 22 | self.computer = None 23 | self.agent = None 24 | # Simple in-memory caches 25 | self._computer_cache: Dict[str, Any] = {} 26 | self._agent_cache: Dict[str, Any] = {} 27 | 28 | async def setup_computer_agent( 29 | self, 30 | model: str, 31 | agent_kwargs: Optional[Dict[str, Any]] = None, 32 | computer_kwargs: Optional[Dict[str, Any]] = None, 33 | ): 34 | """Set up (and cache) computer and agent instances. 
35 | 36 | Caching keys: 37 | - Computer cache key: computer_kwargs 38 | - Agent cache key: {"model": model, **agent_kwargs} 39 | """ 40 | agent_kwargs = agent_kwargs or {} 41 | computer_kwargs = computer_kwargs or {} 42 | 43 | def _stable_key(obj: Dict[str, Any]) -> str: 44 | try: 45 | return json.dumps(obj, sort_keys=True, separators=(",", ":")) 46 | except Exception: 47 | # Fallback: stringify non-serializable values 48 | safe_obj = {} 49 | for k, v in obj.items(): 50 | try: 51 | json.dumps(v) 52 | safe_obj[k] = v 53 | except Exception: 54 | safe_obj[k] = str(v) 55 | return json.dumps(safe_obj, sort_keys=True, separators=(",", ":")) 56 | 57 | # Determine if custom tools are supplied; if so, skip computer setup entirely 58 | has_custom_tools = bool(agent_kwargs.get("tools")) 59 | 60 | computer = None 61 | if not has_custom_tools: 62 | # ---------- Computer setup (with cache) ---------- 63 | comp_key = _stable_key(computer_kwargs) 64 | 65 | computer = self._computer_cache.get(comp_key) 66 | if computer is None: 67 | # Default computer configuration 68 | default_c_config = { 69 | "os_type": "linux", 70 | "provider_type": "cloud", 71 | "name": os.getenv("CUA_CONTAINER_NAME"), 72 | "api_key": os.getenv("CUA_API_KEY"), 73 | } 74 | default_c_config.update(computer_kwargs) 75 | computer = Computer(**default_c_config) 76 | await computer.__aenter__() 77 | self._computer_cache[comp_key] = computer 78 | logger.info(f"Computer created and cached with key={comp_key} config={default_c_config}") 79 | else: 80 | logger.info(f"Reusing cached computer for key={comp_key}") 81 | 82 | # Bind current computer reference (None if custom tools supplied) 83 | self.computer = computer 84 | 85 | # ---------- Agent setup (with cache) ---------- 86 | # Build agent cache key from {model} + agent_kwargs (excluding tools unless explicitly passed) 87 | agent_kwargs_for_key = dict(agent_kwargs) 88 | agent_key_payload = {"model": model, **agent_kwargs_for_key} 89 | agent_key = 
_stable_key(agent_key_payload) 90 | 91 | agent = self._agent_cache.get(agent_key) 92 | if agent is None: 93 | # Default agent configuration 94 | default_a_config: Dict[str, Any] = {"model": model} 95 | if not has_custom_tools: 96 | default_a_config["tools"] = [computer] 97 | # Apply user overrides, but keep tools unless user explicitly sets 98 | if agent_kwargs: 99 | if not has_custom_tools: 100 | agent_kwargs.setdefault("tools", [computer]) 101 | default_a_config.update(agent_kwargs) 102 | # JSON-derived kwargs may have loose types; ignore static arg typing here 103 | agent = ComputerAgent(**default_a_config) # type: ignore[arg-type] 104 | self._agent_cache[agent_key] = agent 105 | logger.info(f"Agent created and cached with key={agent_key} model={model}") 106 | else: 107 | # Ensure cached agent uses the current computer tool (in case object differs) 108 | # Only update if tools not explicitly provided in agent_kwargs 109 | if not has_custom_tools: 110 | try: 111 | agent.tools = [computer] 112 | except Exception: 113 | pass 114 | logger.info(f"Reusing cached agent for key={agent_key}") 115 | 116 | # Bind current agent reference 117 | self.agent = agent 118 | 119 | async def process_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]: 120 | """ 121 | Process a /responses request and return the result. 
122 | 123 | Args: 124 | request_data: Dictionary containing model, input, and optional kwargs 125 | 126 | Returns: 127 | Dictionary with the agent's response 128 | """ 129 | try: 130 | # Extract request parameters 131 | model = request_data.get("model") 132 | input_data = request_data.get("input") 133 | agent_kwargs = request_data.get("agent_kwargs", {}) 134 | computer_kwargs = request_data.get("computer_kwargs", {}) 135 | env_overrides = request_data.get("env", {}) or {} 136 | 137 | if not model: 138 | raise ValueError("Model is required") 139 | if not input_data: 140 | raise ValueError("Input is required") 141 | 142 | # Apply env overrides for the duration of this request 143 | with self._env_overrides(env_overrides): 144 | # Set up (and possibly reuse) computer and agent via caches 145 | await self.setup_computer_agent(model, agent_kwargs, computer_kwargs) 146 | 147 | # Defensive: ensure agent is initialized for type checkers 148 | agent = self.agent 149 | if agent is None: 150 | raise RuntimeError("Agent failed to initialize") 151 | 152 | # Convert input to messages format 153 | messages = self._convert_input_to_messages(input_data) 154 | 155 | # Run agent and get first result 156 | async for result in agent.run(messages): 157 | # Return the first result and break 158 | return { 159 | "success": True, 160 | "result": result, 161 | "model": model 162 | } 163 | 164 | # If no results were yielded 165 | return { 166 | "success": False, 167 | "error": "No results from agent", 168 | "model": model 169 | } 170 | 171 | except Exception as e: 172 | logger.error(f"Error processing request: {e}") 173 | return { 174 | "success": False, 175 | "error": str(e), 176 | "model": request_data.get("model", "unknown") 177 | } 178 | 179 | def _convert_input_to_messages(self, input_data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: 180 | """Convert input data to messages format.""" 181 | if isinstance(input_data, str): 182 | # Simple string input 183 | return [{"role": 
"user", "content": input_data}] 184 | elif isinstance(input_data, list): 185 | # Already in messages format 186 | messages = [] 187 | for msg in input_data: 188 | # Convert content array format if needed 189 | if isinstance(msg.get("content"), list): 190 | content_parts = [] 191 | for part in msg["content"]: 192 | if part.get("type") == "input_text": 193 | content_parts.append({"type": "text", "text": part["text"]}) 194 | elif part.get("type") == "input_image": 195 | content_parts.append({ 196 | "type": "image_url", 197 | "image_url": {"url": part["image_url"]} 198 | }) 199 | else: 200 | content_parts.append(part) 201 | messages.append({ 202 | "role": msg["role"], 203 | "content": content_parts 204 | }) 205 | else: 206 | messages.append(msg) 207 | return messages 208 | else: 209 | raise ValueError("Input must be string or list of messages") 210 | 211 | async def cleanup(self): 212 | """Clean up resources.""" 213 | if self.computer: 214 | try: 215 | await self.computer.__aexit__(None, None, None) 216 | except Exception as e: 217 | logger.error(f"Error cleaning up computer: {e}") 218 | finally: 219 | self.computer = None 220 | self.agent = None 221 | 222 | @staticmethod 223 | @contextmanager 224 | def _env_overrides(env: Dict[str, str]): 225 | """Temporarily apply environment variable overrides for the current process. 226 | Restores previous values after the context exits. 227 | 228 | Args: 229 | env: Mapping of env var names to override for this request. 
230 | """ 231 | if not env: 232 | # No-op context 233 | yield 234 | return 235 | 236 | original: Dict[str, Optional[str]] = {} 237 | try: 238 | for k, v in env.items(): 239 | original[k] = os.environ.get(k) 240 | os.environ[k] = str(v) 241 | yield 242 | finally: 243 | for k, old in original.items(): 244 | if old is None: 245 | # Was not set before 246 | os.environ.pop(k, None) 247 | else: 248 | os.environ[k] = old 249 | ``` -------------------------------------------------------------------------------- /.github/workflows/publish-lume.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: Publish Notarized Lume 2 | 3 | on: 4 | push: 5 | tags: 6 | - "lume-v*" 7 | workflow_dispatch: 8 | inputs: 9 | version: 10 | description: "Version to notarize (without v prefix)" 11 | required: true 12 | default: "0.1.0" 13 | workflow_call: 14 | inputs: 15 | version: 16 | description: "Version to notarize" 17 | required: true 18 | type: string 19 | secrets: 20 | APPLICATION_CERT_BASE64: 21 | required: true 22 | INSTALLER_CERT_BASE64: 23 | required: true 24 | CERT_PASSWORD: 25 | required: true 26 | APPLE_ID: 27 | required: true 28 | TEAM_ID: 29 | required: true 30 | APP_SPECIFIC_PASSWORD: 31 | required: true 32 | DEVELOPER_NAME: 33 | required: true 34 | 35 | permissions: 36 | contents: write 37 | 38 | env: 39 | APPLICATION_CERT_BASE64: ${{ secrets.APPLICATION_CERT_BASE64 }} 40 | INSTALLER_CERT_BASE64: ${{ secrets.INSTALLER_CERT_BASE64 }} 41 | CERT_PASSWORD: ${{ secrets.CERT_PASSWORD }} 42 | APPLE_ID: ${{ secrets.APPLE_ID }} 43 | TEAM_ID: ${{ secrets.TEAM_ID }} 44 | APP_SPECIFIC_PASSWORD: ${{ secrets.APP_SPECIFIC_PASSWORD }} 45 | DEVELOPER_NAME: ${{ secrets.DEVELOPER_NAME }} 46 | 47 | jobs: 48 | notarize: 49 | runs-on: macos-15 50 | outputs: 51 | sha256_checksums: ${{ steps.generate_checksums.outputs.checksums }} 52 | version: ${{ steps.set_version.outputs.version }} 53 | steps: 54 | - uses: actions/checkout@v4 55 | 56 | - name: Select 
Xcode 16 57 | run: | 58 | sudo xcode-select -s /Applications/Xcode_16.app 59 | xcodebuild -version 60 | 61 | - name: Install dependencies 62 | run: | 63 | brew install cpio 64 | 65 | - name: Create .release directory 66 | run: mkdir -p .release 67 | 68 | - name: Set version 69 | id: set_version 70 | run: | 71 | # Determine version from tag or input 72 | if [[ "$GITHUB_REF" == refs/tags/lume-v* ]]; then 73 | VERSION="${GITHUB_REF#refs/tags/lume-v}" 74 | echo "Using version from tag: $VERSION" 75 | elif [[ -n "${{ inputs.version }}" ]]; then 76 | VERSION="${{ inputs.version }}" 77 | echo "Using version from input: $VERSION" 78 | elif [[ -n "${{ inputs.version }}" ]]; then 79 | VERSION="${{ inputs.version }}" 80 | echo "Using version from workflow_call input: $VERSION" 81 | else 82 | echo "Error: No version found in tag or input" 83 | exit 1 84 | fi 85 | 86 | # Update version in Main.swift 87 | echo "Updating version in Main.swift to $VERSION" 88 | sed -i '' "s/static let current: String = \".*\"/static let current: String = \"$VERSION\"/" libs/lume/src/Main.swift 89 | 90 | # Set output for later steps 91 | echo "version=$VERSION" >> $GITHUB_OUTPUT 92 | 93 | - name: Import Certificates 94 | env: 95 | APPLICATION_CERT_BASE64: ${{ secrets.APPLICATION_CERT_BASE64 }} 96 | INSTALLER_CERT_BASE64: ${{ secrets.INSTALLER_CERT_BASE64 }} 97 | CERT_PASSWORD: ${{ secrets.CERT_PASSWORD }} 98 | KEYCHAIN_PASSWORD: "temp_password" 99 | run: | 100 | # Create a temporary keychain 101 | security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain 102 | security default-keychain -s build.keychain 103 | security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain 104 | security set-keychain-settings -t 3600 -l build.keychain 105 | 106 | # Import certificates 107 | echo $APPLICATION_CERT_BASE64 | base64 --decode > application.p12 108 | echo $INSTALLER_CERT_BASE64 | base64 --decode > installer.p12 109 | 110 | # Import certificates silently (minimize output) 111 | security import 
application.p12 -k build.keychain -P "$CERT_PASSWORD" -T /usr/bin/codesign -T /usr/bin/pkgbuild > /dev/null 2>&1 112 | security import installer.p12 -k build.keychain -P "$CERT_PASSWORD" -T /usr/bin/codesign -T /usr/bin/pkgbuild > /dev/null 2>&1 113 | 114 | # Allow codesign to access the certificates (minimal output) 115 | security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k "$KEYCHAIN_PASSWORD" build.keychain > /dev/null 2>&1 116 | 117 | # Verify certificates were imported 118 | echo "Verifying signing identities..." 119 | CERT_COUNT=$(security find-identity -v -p codesigning build.keychain | grep -c "Developer ID Application" || echo "0") 120 | INSTALLER_COUNT=$(security find-identity -v build.keychain | grep -c "Developer ID Installer" || echo "0") 121 | 122 | if [ "$CERT_COUNT" -eq 0 ]; then 123 | echo "Error: No Developer ID Application certificate found" 124 | security find-identity -v -p codesigning build.keychain 125 | exit 1 126 | fi 127 | 128 | if [ "$INSTALLER_COUNT" -eq 0 ]; then 129 | echo "Error: No Developer ID Installer certificate found" 130 | security find-identity -v build.keychain 131 | exit 1 132 | fi 133 | 134 | echo "Found $CERT_COUNT Developer ID Application certificate(s) and $INSTALLER_COUNT Developer ID Installer certificate(s)" 135 | echo "All required certificates verified successfully" 136 | 137 | # Clean up certificate files 138 | rm application.p12 installer.p12 139 | 140 | - name: Build and Notarize 141 | id: build_notarize 142 | env: 143 | APPLE_ID: ${{ secrets.APPLE_ID }} 144 | TEAM_ID: ${{ secrets.TEAM_ID }} 145 | APP_SPECIFIC_PASSWORD: ${{ secrets.APP_SPECIFIC_PASSWORD }} 146 | # These will now reference the imported certificates 147 | CERT_APPLICATION_NAME: "Developer ID Application: ${{ secrets.DEVELOPER_NAME }} (${{ secrets.TEAM_ID }})" 148 | CERT_INSTALLER_NAME: "Developer ID Installer: ${{ secrets.DEVELOPER_NAME }} (${{ secrets.TEAM_ID }})" 149 | VERSION: ${{ steps.set_version.outputs.version }} 150 | 
working-directory: ./libs/lume 151 | run: | 152 | # Minimal debug information 153 | echo "Starting build process..." 154 | echo "Swift version: $(swift --version | head -n 1)" 155 | echo "Building version: $VERSION" 156 | 157 | # Ensure .release directory exists 158 | mkdir -p .release 159 | chmod 755 .release 160 | 161 | # Build the project first (redirect verbose output) 162 | echo "Building project..." 163 | swift build --configuration release > build.log 2>&1 164 | echo "Build completed." 165 | 166 | # Run the notarization script with LOG_LEVEL env var 167 | chmod +x scripts/build/build-release-notarized.sh 168 | cd scripts/build 169 | LOG_LEVEL=minimal ./build-release-notarized.sh 170 | 171 | # Return to the lume directory 172 | cd ../.. 173 | 174 | # Debug: List what files were actually created 175 | echo "Files in .release directory:" 176 | find .release -type f -name "*.tar.gz" -o -name "*.pkg.tar.gz" 177 | 178 | # Get architecture for output filename 179 | ARCH=$(uname -m) 180 | OS_IDENTIFIER="darwin-${ARCH}" 181 | 182 | # Output paths for later use 183 | echo "tarball_path=.release/lume-${VERSION}-${OS_IDENTIFIER}.tar.gz" >> $GITHUB_OUTPUT 184 | echo "pkg_path=.release/lume-${VERSION}-${OS_IDENTIFIER}.pkg.tar.gz" >> $GITHUB_OUTPUT 185 | 186 | - name: Generate SHA256 Checksums 187 | id: generate_checksums 188 | working-directory: ./libs/lume/.release 189 | run: | 190 | # Use existing checksums file if it exists, otherwise generate one 191 | if [ -f "checksums.txt" ]; then 192 | echo "Using existing checksums file" 193 | cat checksums.txt 194 | else 195 | echo "## SHA256 Checksums" > checksums.txt 196 | echo '```' >> checksums.txt 197 | shasum -a 256 lume-*.tar.gz >> checksums.txt 198 | echo '```' >> checksums.txt 199 | fi 200 | 201 | checksums=$(cat checksums.txt) 202 | echo "checksums<<EOF" >> $GITHUB_OUTPUT 203 | echo "$checksums" >> $GITHUB_OUTPUT 204 | echo "EOF" >> $GITHUB_OUTPUT 205 | 206 | # Debug: Show all files in the release directory 207 | echo 
"All files in release directory:" 208 | ls -la 209 | 210 | - name: Create Standard Version Releases 211 | working-directory: ./libs/lume/.release 212 | run: | 213 | VERSION=${{ steps.set_version.outputs.version }} 214 | ARCH=$(uname -m) 215 | OS_IDENTIFIER="darwin-${ARCH}" 216 | 217 | # Create OS-tagged symlinks 218 | ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.tar.gz" "lume-darwin.tar.gz" 219 | ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.pkg.tar.gz" "lume-darwin.pkg.tar.gz" 220 | 221 | # Create simple symlinks 222 | ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.tar.gz" "lume.tar.gz" 223 | ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.pkg.tar.gz" "lume.pkg.tar.gz" 224 | 225 | # List all files (including symlinks) 226 | echo "Files with symlinks in release directory:" 227 | ls -la 228 | 229 | - name: Upload Notarized Package (Tarball) 230 | uses: actions/upload-artifact@v4 231 | with: 232 | name: lume-notarized-tarball 233 | path: ./libs/lume/${{ steps.build_notarize.outputs.tarball_path }} 234 | if-no-files-found: error 235 | 236 | - name: Upload Notarized Package (Installer) 237 | uses: actions/upload-artifact@v4 238 | with: 239 | name: lume-notarized-installer 240 | path: ./libs/lume/${{ steps.build_notarize.outputs.pkg_path }} 241 | if-no-files-found: error 242 | 243 | - name: Create Release 244 | if: startsWith(github.ref, 'refs/tags/lume-v') 245 | uses: softprops/action-gh-release@v1 246 | with: 247 | files: | 248 | ./libs/lume/${{ steps.build_notarize.outputs.tarball_path }} 249 | ./libs/lume/${{ steps.build_notarize.outputs.pkg_path }} 250 | ./libs/lume/.release/lume-darwin.tar.gz 251 | ./libs/lume/.release/lume-darwin.pkg.tar.gz 252 | ./libs/lume/.release/lume.tar.gz 253 | ./libs/lume/.release/lume.pkg.tar.gz 254 | body: | 255 | ${{ steps.generate_checksums.outputs.checksums }} 256 | 257 | ### Installation with script 258 | 259 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" 260 | ``` 261 | 
generate_release_notes: true 262 | make_latest: true 263 | ``` -------------------------------------------------------------------------------- /scripts/playground-docker.sh: -------------------------------------------------------------------------------- ```bash 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Colors for output 6 | GREEN='\033[0;32m' 7 | BLUE='\033[0;34m' 8 | RED='\033[0;31m' 9 | YELLOW='\033[1;33m' 10 | NC='\033[0m' # No Color 11 | 12 | # Print with color 13 | print_info() { 14 | echo -e "${BLUE}==> $1${NC}" 15 | } 16 | 17 | print_success() { 18 | echo -e "${GREEN}==> $1${NC}" 19 | } 20 | 21 | print_error() { 22 | echo -e "${RED}==> $1${NC}" 23 | } 24 | 25 | print_warning() { 26 | echo -e "${YELLOW}==> $1${NC}" 27 | } 28 | 29 | echo "🚀 Launching Cua Computer-Use Agent UI..." 30 | 31 | # Check if Docker is installed 32 | if ! command -v docker &> /dev/null; then 33 | print_error "Docker is not installed!" 34 | echo "" 35 | echo "To use Cua with Docker containers, you need to install Docker first:" 36 | echo "" 37 | echo "📦 Install Docker:" 38 | echo " • macOS: Download Docker Desktop from https://docker.com/products/docker-desktop" 39 | echo " • Windows: Download Docker Desktop from https://docker.com/products/docker-desktop" 40 | echo " • Linux: Follow instructions at https://docs.docker.com/engine/install/" 41 | echo "" 42 | echo "After installing Docker, run this script again." 43 | exit 1 44 | fi 45 | 46 | # Check if Docker daemon is running 47 | if ! docker info &> /dev/null; then 48 | print_error "Docker is installed but not running!" 49 | echo "" 50 | echo "Please start Docker Desktop and try again." 51 | exit 1 52 | fi 53 | 54 | print_success "Docker is installed and running!" 
55 | 56 | # Save the original working directory 57 | ORIGINAL_DIR="$(pwd)" 58 | 59 | DEMO_DIR="$HOME/.cua" 60 | mkdir -p "$DEMO_DIR" 61 | 62 | 63 | # Check if we're already in the cua repository 64 | # Look for the specific trycua identifier in pyproject.toml 65 | if [[ -f "pyproject.toml" ]] && grep -q "[email protected]" "pyproject.toml"; then 66 | print_success "Already in Cua repository - using current directory" 67 | REPO_DIR="$ORIGINAL_DIR" 68 | USE_EXISTING_REPO=true 69 | else 70 | # Directories used by the script when not in repo 71 | REPO_DIR="$DEMO_DIR/cua" 72 | USE_EXISTING_REPO=false 73 | fi 74 | 75 | # Function to clean up on exit 76 | cleanup() { 77 | cd "$ORIGINAL_DIR" 2>/dev/null || true 78 | } 79 | trap cleanup EXIT 80 | 81 | echo "" 82 | echo "Choose your Cua setup:" 83 | echo "1) ☁️ Cua Cloud Sandbox (works on any system)" 84 | echo "2) 🖥️ Local macOS VMs (requires Apple Silicon Mac + macOS 15+)" 85 | echo "3) 🖥️ Local Windows VMs (requires Windows 10 / 11)" 86 | echo "" 87 | read -p "Enter your choice (1, 2, or 3): " CHOICE 88 | 89 | if [[ "$CHOICE" == "1" ]]; then 90 | # Cua Cloud Sandbox setup 91 | echo "" 92 | print_info "Setting up Cua Cloud Sandbox..." 
93 | echo "" 94 | 95 | # Check if existing .env.local already has CUA_API_KEY 96 | REPO_ENV_FILE="$REPO_DIR/.env.local" 97 | CURRENT_ENV_FILE="$ORIGINAL_DIR/.env.local" 98 | 99 | CUA_API_KEY="" 100 | 101 | # First check current directory 102 | if [[ -f "$CURRENT_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$CURRENT_ENV_FILE"; then 103 | EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$CURRENT_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs) 104 | if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then 105 | CUA_API_KEY="$EXISTING_CUA_KEY" 106 | fi 107 | fi 108 | 109 | # Then check repo directory if not found in current dir 110 | if [[ -z "$CUA_API_KEY" ]] && [[ -f "$REPO_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$REPO_ENV_FILE"; then 111 | EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$REPO_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs) 112 | if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then 113 | CUA_API_KEY="$EXISTING_CUA_KEY" 114 | fi 115 | fi 116 | 117 | # If no valid API key found, prompt for one 118 | if [[ -z "$CUA_API_KEY" ]]; then 119 | echo "To use Cua Cloud Sandbox, you need to:" 120 | echo "1. Sign up at https://trycua.com" 121 | echo "2. Create a Cloud Sandbox" 122 | echo "3. Generate an Api Key" 123 | echo "" 124 | read -p "Enter your Cua Api Key: " CUA_API_KEY 125 | 126 | if [[ -z "$CUA_API_KEY" ]]; then 127 | print_error "Cua Api Key is required for Cloud Sandbox." 128 | exit 1 129 | fi 130 | else 131 | print_success "Found existing CUA API key" 132 | fi 133 | 134 | USE_CLOUD=true 135 | COMPUTER_TYPE="cloud" 136 | 137 | elif [[ "$CHOICE" == "2" ]]; then 138 | # Local macOS VM setup 139 | echo "" 140 | print_info "Setting up local macOS VMs..." 
141 | 142 | # Check for Apple Silicon Mac 143 | if [[ $(uname -s) != "Darwin" || $(uname -m) != "arm64" ]]; then 144 | print_error "Local macOS VMs require an Apple Silicon Mac (M1/M2/M3/M4)." 145 | echo "💡 Consider using Cua Cloud Sandbox instead (option 1)." 146 | exit 1 147 | fi 148 | 149 | # Check for macOS 15 (Sequoia) or newer 150 | OSVERSION=$(sw_vers -productVersion) 151 | if [[ $(echo "$OSVERSION 15.0" | tr " " "\n" | sort -V | head -n 1) != "15.0" ]]; then 152 | print_error "Local macOS VMs require macOS 15 (Sequoia) or newer. You have $OSVERSION." 153 | echo "💡 Consider using Cua Cloud Sandbox instead (option 1)." 154 | exit 1 155 | fi 156 | 157 | USE_CLOUD=false 158 | COMPUTER_TYPE="macos" 159 | 160 | elif [[ "$CHOICE" == "3" ]]; then 161 | # Local Windows VM setup 162 | echo "" 163 | print_info "Setting up local Windows VMs..." 164 | 165 | # Check if we're on Windows 166 | if [[ $(uname -s) != MINGW* && $(uname -s) != CYGWIN* && $(uname -s) != MSYS* ]]; then 167 | print_error "Local Windows VMs require Windows 10 or 11." 168 | echo "💡 Consider using Cua Cloud Sandbox instead (option 1)." 169 | echo "" 170 | echo "🔗 If you are using WSL, refer to the blog post to get started: https://www.trycua.com/blog/windows-sandbox" 171 | exit 1 172 | fi 173 | 174 | USE_CLOUD=false 175 | COMPUTER_TYPE="windows" 176 | 177 | else 178 | print_error "Invalid choice. Please run the script again and choose 1, 2, or 3." 179 | exit 1 180 | fi 181 | 182 | print_success "All checks passed! 🎉" 183 | 184 | # Create demo directory and handle repository 185 | if [[ "$USE_EXISTING_REPO" == "true" ]]; then 186 | print_info "Using existing repository in current directory" 187 | cd "$REPO_DIR" 188 | else 189 | # Clone or update the repository 190 | if [[ ! -d "$REPO_DIR" ]]; then 191 | print_info "Cloning Cua repository..." 192 | cd "$DEMO_DIR" 193 | git clone https://github.com/trycua/cua.git 194 | else 195 | print_info "Updating Cua repository..." 
196 | cd "$REPO_DIR" 197 | git pull origin main 198 | fi 199 | 200 | cd "$REPO_DIR" 201 | fi 202 | 203 | # Create .env.local file with API keys 204 | ENV_FILE="$REPO_DIR/.env.local" 205 | if [[ ! -f "$ENV_FILE" ]]; then 206 | cat > "$ENV_FILE" << EOF 207 | # Uncomment and add your API keys here 208 | # OPENAI_API_KEY=your_openai_api_key_here 209 | # ANTHROPIC_API_KEY=your_anthropic_api_key_here 210 | CUA_API_KEY=your_cua_api_key_here 211 | EOF 212 | print_success "Created .env.local file with API key placeholders" 213 | else 214 | print_success "Found existing .env.local file - keeping your current settings" 215 | fi 216 | 217 | if [[ "$USE_CLOUD" == "true" ]]; then 218 | # Add CUA API key to .env.local if not already present 219 | if ! grep -q "CUA_API_KEY" "$ENV_FILE"; then 220 | echo "CUA_API_KEY=$CUA_API_KEY" >> "$ENV_FILE" 221 | print_success "Added CUA_API_KEY to .env.local" 222 | elif grep -q "CUA_API_KEY=your_cua_api_key_here" "$ENV_FILE"; then 223 | # Update placeholder with actual key 224 | sed -i.bak "s/CUA_API_KEY=your_cua_api_key_here/CUA_API_KEY=$CUA_API_KEY/" "$ENV_FILE" 225 | print_success "Updated CUA_API_KEY in .env.local" 226 | fi 227 | fi 228 | 229 | # Build the Docker image if it doesn't exist 230 | print_info "Checking Docker image..." 231 | if ! docker image inspect cua-dev-image &> /dev/null; then 232 | print_info "Building Docker image (this may take a while)..." 233 | ./scripts/run-docker-dev.sh build 234 | else 235 | print_success "Docker image already exists" 236 | fi 237 | 238 | # Install Lume if needed for local VMs 239 | if [[ "$USE_CLOUD" == "false" && "$COMPUTER_TYPE" == "macos" ]]; then 240 | if ! command -v lume &> /dev/null; then 241 | print_info "Installing Lume CLI..." 242 | curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh | bash 243 | 244 | # Add lume to PATH for this session if it's not already there 245 | if ! 
command -v lume &> /dev/null; then 246 | export PATH="$PATH:$HOME/.local/bin" 247 | fi 248 | fi 249 | 250 | # Pull the macOS CUA image if not already present 251 | if ! lume ls | grep -q "macos-sequoia-cua"; then 252 | # Check available disk space 253 | IMAGE_SIZE_GB=30 254 | AVAILABLE_SPACE_KB=$(df -k $HOME | tail -1 | awk '{print $4}') 255 | AVAILABLE_SPACE_GB=$(($AVAILABLE_SPACE_KB / 1024 / 1024)) 256 | 257 | echo "📊 The macOS CUA image will use approximately ${IMAGE_SIZE_GB}GB of disk space." 258 | echo " You currently have ${AVAILABLE_SPACE_GB}GB available on your system." 259 | 260 | # Prompt for confirmation 261 | read -p " Continue? [y]/n: " CONTINUE 262 | CONTINUE=${CONTINUE:-y} 263 | 264 | if [[ $CONTINUE =~ ^[Yy]$ ]]; then 265 | print_info "Pulling macOS CUA image (this may take a while)..." 266 | 267 | # Use caffeinate on macOS to prevent system sleep during the pull 268 | if command -v caffeinate &> /dev/null; then 269 | print_info "Using caffeinate to prevent system sleep during download..." 270 | caffeinate -i lume pull macos-sequoia-cua:latest 271 | else 272 | lume pull macos-sequoia-cua:latest 273 | fi 274 | else 275 | print_error "Installation cancelled." 276 | exit 1 277 | fi 278 | fi 279 | 280 | # Check if the VM is running 281 | print_info "Checking if the macOS CUA VM is running..." 282 | VM_RUNNING=$(lume ls | grep "macos-sequoia-cua" | grep "running" || echo "") 283 | 284 | if [ -z "$VM_RUNNING" ]; then 285 | print_info "Starting the macOS CUA VM in the background..." 286 | lume run macos-sequoia-cua:latest & 287 | # Wait a moment for the VM to initialize 288 | sleep 5 289 | print_success "VM started successfully." 290 | else 291 | print_success "macOS CUA VM is already running." 
292 | fi 293 | fi 294 | 295 | # Create a convenience script to run the demo 296 | cat > "$DEMO_DIR/start_ui.sh" << EOF 297 | #!/bin/bash 298 | cd "$REPO_DIR" 299 | ./scripts/run-docker-dev.sh run agent_ui_examples.py 300 | EOF 301 | chmod +x "$DEMO_DIR/start_ui.sh" 302 | 303 | print_success "Setup complete!" 304 | 305 | if [[ "$USE_CLOUD" == "true" ]]; then 306 | echo "☁️ Cua Cloud Sandbox setup complete!" 307 | else 308 | echo "🖥️ Cua Local VM setup complete!" 309 | fi 310 | 311 | echo "📝 Edit $ENV_FILE to update your API keys" 312 | echo "🖥️ Start the playground by running: $DEMO_DIR/start_ui.sh" 313 | 314 | # Start the demo automatically 315 | echo 316 | print_info "Starting the Cua Computer-Use Agent UI..." 317 | echo "" 318 | 319 | print_success "Cua Computer-Use Agent UI is now running at http://localhost:7860/" 320 | echo 321 | echo "🌐 Open your browser and go to: http://localhost:7860/" 322 | echo 323 | "$DEMO_DIR/start_ui.sh" 324 | ``` -------------------------------------------------------------------------------- /tests/test_mcp_server_streaming.py: -------------------------------------------------------------------------------- ```python 1 | import asyncio 2 | import importlib.util 3 | import sys 4 | import types 5 | from pathlib import Path 6 | 7 | import pytest 8 | 9 | 10 | def _install_stub_module(name: str, module: types.ModuleType, registry: dict[str, types.ModuleType | None]) -> None: 11 | registry[name] = sys.modules.get(name) 12 | sys.modules[name] = module 13 | 14 | 15 | @pytest.fixture 16 | def server_module(): 17 | stubbed_modules: dict[str, types.ModuleType | None] = {} 18 | 19 | # Stub MCP Context primitives 20 | mcp_module = types.ModuleType("mcp") 21 | mcp_module.__path__ = [] # mark as package 22 | 23 | mcp_server_module = types.ModuleType("mcp.server") 24 | mcp_server_module.__path__ = [] 25 | 26 | fastmcp_module = types.ModuleType("mcp.server.fastmcp") 27 | 28 | class _StubContext: 29 | async def yield_message(self, *args, **kwargs): 
30 | return None 31 | 32 | async def yield_tool_call(self, *args, **kwargs): 33 | return None 34 | 35 | async def yield_tool_output(self, *args, **kwargs): 36 | return None 37 | 38 | def report_progress(self, *_args, **_kwargs): 39 | return None 40 | 41 | def info(self, *_args, **_kwargs): 42 | return None 43 | 44 | def error(self, *_args, **_kwargs): 45 | return None 46 | 47 | class _StubImage: 48 | def __init__(self, format: str, data: bytes): 49 | self.format = format 50 | self.data = data 51 | 52 | class _StubFastMCP: 53 | def __init__(self, name: str): 54 | self.name = name 55 | self._tools: dict[str, types.FunctionType] = {} 56 | 57 | def tool(self, *args, **kwargs): 58 | def decorator(func): 59 | self._tools[func.__name__] = func 60 | return func 61 | 62 | return decorator 63 | 64 | def run(self): 65 | return None 66 | 67 | fastmcp_module.Context = _StubContext 68 | fastmcp_module.FastMCP = _StubFastMCP 69 | fastmcp_module.Image = _StubImage 70 | 71 | _install_stub_module("mcp", mcp_module, stubbed_modules) 72 | _install_stub_module("mcp.server", mcp_server_module, stubbed_modules) 73 | _install_stub_module("mcp.server.fastmcp", fastmcp_module, stubbed_modules) 74 | 75 | # Stub Computer module to avoid heavy dependencies 76 | computer_module = types.ModuleType("computer") 77 | 78 | class _StubInterface: 79 | async def screenshot(self) -> bytes: # pragma: no cover - default stub 80 | return b"" 81 | 82 | class _StubComputer: 83 | def __init__(self, *args, **kwargs): 84 | self.interface = _StubInterface() 85 | 86 | async def run(self): # pragma: no cover - default stub 87 | return None 88 | 89 | class _StubVMProviderType: 90 | CLOUD = "cloud" 91 | LOCAL = "local" 92 | 93 | computer_module.Computer = _StubComputer 94 | computer_module.VMProviderType = _StubVMProviderType 95 | 96 | _install_stub_module("computer", computer_module, stubbed_modules) 97 | 98 | # Stub agent module so server can import ComputerAgent 99 | agent_module = types.ModuleType("agent") 100 | 
101 | class _StubComputerAgent: 102 | def __init__(self, *args, **kwargs): 103 | pass 104 | 105 | async def run(self, *_args, **_kwargs): # pragma: no cover - default stub 106 | if False: # pragma: no cover 107 | yield {} 108 | return 109 | 110 | agent_module.ComputerAgent = _StubComputerAgent 111 | 112 | _install_stub_module("agent", agent_module, stubbed_modules) 113 | 114 | module_name = "mcp_server_server_under_test" 115 | module_path = Path("libs/python/mcp-server/mcp_server/server.py").resolve() 116 | spec = importlib.util.spec_from_file_location(module_name, module_path) 117 | server_module = importlib.util.module_from_spec(spec) 118 | assert spec and spec.loader 119 | spec.loader.exec_module(server_module) 120 | 121 | server_instance = getattr(server_module, "server", None) 122 | if server_instance is not None and hasattr(server_instance, "_tools"): 123 | for name, func in server_instance._tools.items(): 124 | setattr(server_module, name, func) 125 | 126 | try: 127 | yield server_module 128 | finally: 129 | sys.modules.pop(module_name, None) 130 | for name, original in stubbed_modules.items(): 131 | if original is None: 132 | sys.modules.pop(name, None) 133 | else: 134 | sys.modules[name] = original 135 | 136 | 137 | class FakeContext: 138 | def __init__(self) -> None: 139 | self.events: list[tuple] = [] 140 | self.progress_updates: list[float] = [] 141 | 142 | def info(self, message: str) -> None: 143 | self.events.append(("info", message)) 144 | 145 | def error(self, message: str) -> None: 146 | self.events.append(("error", message)) 147 | 148 | def report_progress(self, value: float) -> None: 149 | self.progress_updates.append(value) 150 | 151 | async def yield_message(self, *, role: str, content): 152 | timestamp = asyncio.get_running_loop().time() 153 | self.events.append(("message", role, content, timestamp)) 154 | 155 | async def yield_tool_call(self, *, name: str | None, call_id: str, input): 156 | timestamp = asyncio.get_running_loop().time() 157 | 
self.events.append(("tool_call", name, call_id, input, timestamp)) 158 | 159 | async def yield_tool_output(self, *, call_id: str, output, is_error: bool = False): 160 | timestamp = asyncio.get_running_loop().time() 161 | self.events.append(("tool_output", call_id, output, is_error, timestamp)) 162 | 163 | 164 | def test_run_cua_task_streams_partial_results(server_module): 165 | async def _run_test(): 166 | class FakeAgent: 167 | script = [] 168 | 169 | def __init__(self, *args, **kwargs): 170 | pass 171 | 172 | async def run(self, messages): # type: ignore[override] 173 | for factory, delay in type(self).script: 174 | yield factory(messages) 175 | if delay: 176 | await asyncio.sleep(delay) 177 | 178 | FakeAgent.script = [ 179 | ( 180 | lambda _messages: { 181 | "output": [ 182 | { 183 | "type": "message", 184 | "role": "assistant", 185 | "content": [ 186 | {"type": "output_text", "text": "First chunk"} 187 | ], 188 | } 189 | ] 190 | }, 191 | 0.0, 192 | ), 193 | ( 194 | lambda _messages: { 195 | "output": [ 196 | { 197 | "type": "tool_use", 198 | "id": "call_1", 199 | "name": "computer", 200 | "input": {"action": "click"}, 201 | }, 202 | { 203 | "type": "computer_call_output", 204 | "call_id": "call_1", 205 | "output": [ 206 | {"type": "text", "text": "Tool completed"} 207 | ], 208 | }, 209 | ] 210 | }, 211 | 0.05, 212 | ), 213 | ] 214 | 215 | class FakeInterface: 216 | def __init__(self) -> None: 217 | self.calls = 0 218 | 219 | async def screenshot(self) -> bytes: 220 | self.calls += 1 221 | return b"final-image" 222 | 223 | fake_interface = FakeInterface() 224 | server_module.global_computer = types.SimpleNamespace(interface=fake_interface) 225 | server_module.ComputerAgent = FakeAgent # type: ignore[assignment] 226 | 227 | ctx = FakeContext() 228 | task = asyncio.create_task(server_module.run_cua_task(ctx, "open settings")) 229 | 230 | await asyncio.sleep(0.01) 231 | assert not task.done(), "Task should still be running to simulate long operation" 232 | 
message_events = [event for event in ctx.events if event[0] == "message"] 233 | assert message_events, "Expected message event before task completion" 234 | 235 | text_result, image = await task 236 | 237 | assert "First chunk" in text_result 238 | assert "Tool completed" in text_result 239 | assert image.data == b"final-image" 240 | assert fake_interface.calls == 1 241 | 242 | tool_call_events = [event for event in ctx.events if event[0] == "tool_call"] 243 | tool_output_events = [event for event in ctx.events if event[0] == "tool_output"] 244 | assert tool_call_events and tool_output_events 245 | assert tool_call_events[0][2] == "call_1" 246 | assert tool_output_events[0][1] == "call_1" 247 | 248 | asyncio.run(_run_test()) 249 | 250 | 251 | def test_run_multi_cua_tasks_reports_progress(server_module, monkeypatch): 252 | async def _run_test(): 253 | class FakeAgent: 254 | script = [] 255 | 256 | def __init__(self, *args, **kwargs): 257 | pass 258 | 259 | async def run(self, messages): # type: ignore[override] 260 | for factory, delay in type(self).script: 261 | yield factory(messages) 262 | if delay: 263 | await asyncio.sleep(delay) 264 | 265 | FakeAgent.script = [ 266 | ( 267 | lambda messages: { 268 | "output": [ 269 | { 270 | "type": "message", 271 | "role": "assistant", 272 | "content": [ 273 | { 274 | "type": "output_text", 275 | "text": f"Result for {messages[0].get('content')}", 276 | } 277 | ], 278 | } 279 | ] 280 | }, 281 | 0.0, 282 | ) 283 | ] 284 | 285 | server_module.ComputerAgent = FakeAgent # type: ignore[assignment] 286 | 287 | class FakeInterface: 288 | async def screenshot(self) -> bytes: 289 | return b"progress-image" 290 | 291 | server_module.global_computer = types.SimpleNamespace(interface=FakeInterface()) 292 | 293 | ctx = FakeContext() 294 | 295 | results = await server_module.run_multi_cua_tasks(ctx, ["a", "b", "c"]) 296 | 297 | assert len(results) == 3 298 | assert results[0][0] == "Result for a" 299 | assert ctx.progress_updates[0] == 
pytest.approx(0.0) 300 | assert ctx.progress_updates[-1] == pytest.approx(1.0) 301 | assert len(ctx.progress_updates) == 6 302 | 303 | asyncio.run(_run_test()) ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/cloud/provider.py: -------------------------------------------------------------------------------- ```python 1 | """Cloud VM provider implementation using CUA Public API. 2 | 3 | Implements the following public API endpoints: 4 | 5 | - GET /v1/vms 6 | - POST /v1/vms/:name/start 7 | - POST /v1/vms/:name/stop 8 | - POST /v1/vms/:name/restart 9 | """ 10 | 11 | import logging 12 | from typing import Dict, List, Optional, Any 13 | 14 | from ..base import BaseVMProvider, VMProviderType 15 | from ..types import ListVMsResponse, MinimalVM 16 | 17 | # Setup logging 18 | logger = logging.getLogger(__name__) 19 | 20 | import asyncio 21 | import aiohttp 22 | from urllib.parse import urlparse 23 | import os 24 | 25 | 26 | DEFAULT_API_BASE = os.getenv("CUA_API_BASE", "https://api.cua.ai") 27 | 28 | class CloudProvider(BaseVMProvider): 29 | """Cloud VM Provider implementation.""" 30 | def __init__( 31 | self, 32 | api_key: str, 33 | verbose: bool = False, 34 | api_base: Optional[str] = None, 35 | **kwargs, 36 | ): 37 | """ 38 | Args: 39 | api_key: API key for authentication 40 | name: Name of the VM 41 | verbose: Enable verbose logging 42 | """ 43 | assert api_key, "api_key required for CloudProvider" 44 | self.api_key = api_key 45 | self.verbose = verbose 46 | self.api_base = (api_base or DEFAULT_API_BASE).rstrip("/") 47 | 48 | @property 49 | def provider_type(self) -> VMProviderType: 50 | return VMProviderType.CLOUD 51 | 52 | async def __aenter__(self): 53 | return self 54 | 55 | async def __aexit__(self, exc_type, exc_val, exc_tb): 56 | pass 57 | 58 | async def get_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 59 | """Get VM information by querying the VM status endpoint. 
60 | 61 |         - Build hostname via get_ip(name) → "{name}.containers.cloud.trycua.com" 62 |         - Probe https://{hostname}:8443/status with a short timeout 63 |         - If JSON contains a "status" field, return it; otherwise infer 64 |         - Fallback to DNS resolve check to distinguish unknown vs not_found 65 |         """ 66 |         hostname = await self.get_ip(name=name) 67 | 68 |         # Try HTTPS probe to the computer-server status endpoint (8443) 69 |         try: 70 |             timeout = aiohttp.ClientTimeout(total=3) 71 |             async with aiohttp.ClientSession(timeout=timeout) as session: 72 |                 url = f"https://{hostname}:8443/status" 73 |                 async with session.get(url, allow_redirects=False) as resp: 74 |                     status_code = resp.status 75 |                     vm_status: str 76 |                     vm_os_type: Optional[str] = None 77 |                     if status_code == 200: 78 |                         try: 79 |                             data = await resp.json(content_type=None) 80 |                             vm_status = str(data.get("status", "ok")) 81 |                             vm_os_type = str(data["os_type"]) if data.get("os_type") is not None else None 82 |                         except Exception: 83 |                             vm_status = "unknown" 84 |                     elif status_code < 500: 85 |                         vm_status = "unknown" 86 |                     else: 87 |                         vm_status = "unknown" 88 |                     return { 89 |                         "name": name, 90 |                         "status": "running" if vm_status == "ok" else vm_status, 91 |                         "api_url": f"https://{hostname}:8443", 92 |                         "os_type": vm_os_type, 93 |                     } 94 |         except Exception: 95 |             return {"name": name, "status": "not_found", "api_url": f"https://{hostname}:8443"} 96 | 97 |     async def list_vms(self) -> ListVMsResponse: 98 |         url = f"{self.api_base}/v1/vms" 99 |         headers = { 100 |             "Authorization": f"Bearer {self.api_key}", 101 |             "Accept": "application/json", 102 |         } 103 |         async with aiohttp.ClientSession() as session: 104 |             async with session.get(url, headers=headers) as resp: 105 |                 if resp.status == 200: 106 |                     try: 107 |                         data = await resp.json(content_type=None) 108 |                     except Exception: 109 |                         text = await resp.text() 110 |                         logger.error(f"Failed to parse list_vms JSON: {text}") 111 |                         return [] 112 |                     if isinstance(data, list): 113 |                         # Enrich with convenience URLs when possible.
114 | enriched: List[Dict[str, Any]] = [] 115 | for item in data: 116 | vm = dict(item) if isinstance(item, dict) else {} 117 | name = vm.get("name") 118 | password = vm.get("password") 119 | if isinstance(name, str) and name: 120 | host = f"{name}.containers.cloud.trycua.com" 121 | # api_url: always set if missing 122 | if not vm.get("api_url"): 123 | vm["api_url"] = f"https://{host}:8443" 124 | # vnc_url: only when password available 125 | if not vm.get("vnc_url") and isinstance(password, str) and password: 126 | vm[ 127 | "vnc_url" 128 | ] = f"https://{host}/vnc.html?autoconnect=true&password={password}" 129 | enriched.append(vm) 130 | return enriched # type: ignore[return-value] 131 | logger.warning("Unexpected response for list_vms; expected list") 132 | return [] 133 | elif resp.status == 401: 134 | logger.error("Unauthorized: invalid CUA API key for list_vms") 135 | return [] 136 | else: 137 | text = await resp.text() 138 | logger.error(f"list_vms failed: HTTP {resp.status} - {text}") 139 | return [] 140 | 141 | async def run_vm(self, name: str, image: Optional[str] = None, run_opts: Optional[Dict[str, Any]] = None, storage: Optional[str] = None) -> Dict[str, Any]: 142 | """Start a VM via public API. 
Returns a minimal status.""" 143 | url = f"{self.api_base}/v1/vms/{name}/start" 144 | headers = { 145 | "Authorization": f"Bearer {self.api_key}", 146 | "Accept": "application/json", 147 | } 148 | async with aiohttp.ClientSession() as session: 149 | async with session.post(url, headers=headers) as resp: 150 | if resp.status in (200, 201, 202, 204): 151 | return {"name": name, "status": "starting"} 152 | elif resp.status == 404: 153 | return {"name": name, "status": "not_found"} 154 | elif resp.status == 401: 155 | return {"name": name, "status": "unauthorized"} 156 | else: 157 | text = await resp.text() 158 | return {"name": name, "status": "error", "message": text} 159 | 160 | async def stop_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 161 | """Stop a VM via public API.""" 162 | url = f"{self.api_base}/v1/vms/{name}/stop" 163 | headers = { 164 | "Authorization": f"Bearer {self.api_key}", 165 | "Accept": "application/json", 166 | } 167 | async with aiohttp.ClientSession() as session: 168 | async with session.post(url, headers=headers) as resp: 169 | if resp.status in (200, 202): 170 | # Spec says 202 with {"status":"stopping"} 171 | body_status: Optional[str] = None 172 | try: 173 | data = await resp.json(content_type=None) 174 | body_status = data.get("status") if isinstance(data, dict) else None 175 | except Exception: 176 | body_status = None 177 | return {"name": name, "status": body_status or "stopping"} 178 | elif resp.status == 404: 179 | return {"name": name, "status": "not_found"} 180 | elif resp.status == 401: 181 | return {"name": name, "status": "unauthorized"} 182 | else: 183 | text = await resp.text() 184 | return {"name": name, "status": "error", "message": text} 185 | 186 | async def restart_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 187 | """Restart a VM via public API.""" 188 | url = f"{self.api_base}/v1/vms/{name}/restart" 189 | headers = { 190 | "Authorization": f"Bearer {self.api_key}", 191 | 
"Accept": "application/json", 192 |         } 193 |         async with aiohttp.ClientSession() as session: 194 |             async with session.post(url, headers=headers) as resp: 195 |                 if resp.status in (200, 202): 196 |                     # Spec says 202 with {"status":"restarting"} 197 |                     body_status: Optional[str] = None 198 |                     try: 199 |                         data = await resp.json(content_type=None) 200 |                         body_status = data.get("status") if isinstance(data, dict) else None 201 |                     except Exception: 202 |                         body_status = None 203 |                     return {"name": name, "status": body_status or "restarting"} 204 |                 elif resp.status == 404: 205 |                     return {"name": name, "status": "not_found"} 206 |                 elif resp.status == 401: 207 |                     return {"name": name, "status": "unauthorized"} 208 |                 else: 209 |                     text = await resp.text() 210 |                     return {"name": name, "status": "error", "message": text} 211 | 212 |     async def update_vm(self, name: str, update_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]: 213 |         logger.warning("CloudProvider.update_vm is not implemented via public API") 214 |         return {"name": name, "status": "unchanged", "message": "update_vm not supported by public API"} 215 | 216 |     async def get_ip(self, name: Optional[str] = None, storage: Optional[str] = None, retry_delay: int = 2) -> str: 217 |         """ 218 |         Return the VM's IP address as '{container_name}.containers.cloud.trycua.com'. 219 |         The hostname is derived from the provided 'name' argument (the VM name requested 220 |         by the caller); raises ValueError if 'name' is None. 221 |         Note: 'retry_delay' is currently unused — the hostname is computed locally with no retries.
222 | """ 223 | if name is None: 224 | raise ValueError("VM name is required for CloudProvider.get_ip") 225 | return f"{name}.containers.cloud.trycua.com" 226 | ``` -------------------------------------------------------------------------------- /libs/lume/scripts/install.sh: -------------------------------------------------------------------------------- ```bash 1 | #!/bin/bash 2 | set -e 3 | 4 | # Lume Installer 5 | # This script installs Lume to your system 6 | 7 | # Define colors for output 8 | BOLD=$(tput bold) 9 | NORMAL=$(tput sgr0) 10 | RED=$(tput setaf 1) 11 | GREEN=$(tput setaf 2) 12 | BLUE=$(tput setaf 4) 13 | YELLOW=$(tput setaf 3) 14 | 15 | # Check if running as root or with sudo 16 | if [ "$(id -u)" -eq 0 ] || [ -n "$SUDO_USER" ]; then 17 | echo "${RED}Error: Do not run this script with sudo or as root.${NORMAL}" 18 | echo "If you need to install to a system directory, create it first with proper permissions:" 19 | echo " sudo mkdir -p /desired/directory && sudo chown $(whoami) /desired/directory" 20 | echo "Then run the installer normally:" 21 | echo " ./install.sh --install-dir=/desired/directory" 22 | exit 1 23 | fi 24 | 25 | # Default installation directory (user-specific, doesn't require sudo) 26 | DEFAULT_INSTALL_DIR="$HOME/.local/bin" 27 | INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" 28 | 29 | # GitHub info 30 | GITHUB_REPO="trycua/cua" 31 | LATEST_RELEASE_URL="https://api.github.com/repos/$GITHUB_REPO/releases/latest" 32 | 33 | # Option to skip background service setup (default: install it) 34 | INSTALL_BACKGROUND_SERVICE=true 35 | 36 | # Default port for lume serve (default: 7777) 37 | LUME_PORT=7777 38 | 39 | # Parse command line arguments 40 | while [ "$#" -gt 0 ]; do 41 | case "$1" in 42 | --install-dir) 43 | INSTALL_DIR="$2" 44 | shift 45 | ;; 46 | --port) 47 | LUME_PORT="$2" 48 | shift 49 | ;; 50 | --no-background-service) 51 | INSTALL_BACKGROUND_SERVICE=false 52 | ;; 53 | --help) 54 | echo "${BOLD}${BLUE}Lume Installer${NORMAL}" 
55 |       echo "Usage: $0 [OPTIONS]" 56 |       echo "" 57 |       echo "Options:" 58 |       echo "  --install-dir DIR        Install to the specified directory (default: $DEFAULT_INSTALL_DIR)" 59 |       echo "  --port PORT              Specify the port for lume serve (default: 7777)" 60 |       echo "  --no-background-service  Do not setup the Lume background service (LaunchAgent)" 61 |       echo "  --help                   Display this help message" 62 |       echo "" 63 |       echo "Examples:" 64 |       echo "  $0                                # Install to $DEFAULT_INSTALL_DIR and setup background service" 65 |       echo "  $0 --install-dir /usr/local/bin   # Install to system directory (may require root privileges)" 66 |       echo "  $0 --port 7778                    # Use port 7778 instead of the default 7777" 67 |       echo "  $0 --no-background-service        # Install without setting up the background service" 68 |       echo "  INSTALL_DIR=/opt/lume $0          # Install to /opt/lume (legacy env var support)" 69 |       exit 0 70 |       ;; 71 |     *) 72 |       echo "${RED}Unknown option: $1${NORMAL}" 73 |       echo "Use --help for usage information" 74 |       exit 1 75 |       ;; 76 |   esac 77 |   shift 78 | done 79 | 80 | echo "${BOLD}${BLUE}Lume Installer${NORMAL}" 81 | echo "This script will install Lume to your system." 82 | 83 | # Check if we're running with appropriate permissions 84 | check_permissions() { 85 |   # System directories that typically require root privileges 86 |   SYSTEM_DIRS=("/usr/local/bin" "/usr/bin" "/bin" "/opt") 87 | 88 |   NEEDS_ROOT=false 89 |   for DIR in "${SYSTEM_DIRS[@]}"; do 90 |     if [[ "$INSTALL_DIR" == "$DIR"* ]] && [ !
-w "$INSTALL_DIR" ]; then 91 | NEEDS_ROOT=true 92 | break 93 | fi 94 | done 95 | 96 | if [ "$NEEDS_ROOT" = true ]; then 97 | echo "${YELLOW}Warning: Installing to $INSTALL_DIR may require root privileges.${NORMAL}" 98 | echo "Consider these alternatives:" 99 | echo " • Install to a user-writable location: $0 --install-dir=$HOME/.local/bin" 100 | echo " • Create the directory with correct permissions first:" 101 | echo " sudo mkdir -p $INSTALL_DIR && sudo chown $(whoami) $INSTALL_DIR" 102 | echo "" 103 | 104 | # Check if we already have write permission (might have been set up previously) 105 | if [ ! -w "$INSTALL_DIR" ] && [ ! -w "$(dirname "$INSTALL_DIR")" ]; then 106 | echo "${RED}Error: You don't have write permission to $INSTALL_DIR${NORMAL}" 107 | echo "Please choose a different installation directory or ensure you have the proper permissions." 108 | exit 1 109 | fi 110 | fi 111 | } 112 | 113 | # Detect OS and architecture 114 | detect_platform() { 115 | OS=$(uname -s | tr '[:upper:]' '[:lower:]') 116 | ARCH=$(uname -m) 117 | 118 | if [ "$OS" != "darwin" ]; then 119 | echo "${RED}Error: Currently only macOS is supported.${NORMAL}" 120 | exit 1 121 | fi 122 | 123 | if [ "$ARCH" != "arm64" ]; then 124 | echo "${RED}Error: Lume only supports macOS on Apple Silicon (ARM64).${NORMAL}" 125 | exit 1 126 | fi 127 | 128 | PLATFORM="darwin-arm64" 129 | echo "Detected platform: ${BOLD}$PLATFORM${NORMAL}" 130 | } 131 | 132 | # Create temporary directory 133 | create_temp_dir() { 134 | TEMP_DIR=$(mktemp -d) 135 | echo "Using temporary directory: $TEMP_DIR" 136 | 137 | # Make sure we clean up on exit 138 | trap 'rm -rf "$TEMP_DIR"' EXIT 139 | } 140 | 141 | # Download the latest release 142 | download_release() { 143 | echo "Downloading latest Lume release..." 
144 | 145 | # Use the direct download link with the non-versioned symlink 146 | DOWNLOAD_URL="https://github.com/$GITHUB_REPO/releases/latest/download/lume.tar.gz" 147 | echo "Downloading from: $DOWNLOAD_URL" 148 | 149 | # Download the tarball 150 | if command -v curl &> /dev/null; then 151 | curl -L --progress-bar "$DOWNLOAD_URL" -o "$TEMP_DIR/lume.tar.gz" 152 | 153 | # Verify the download was successful 154 | if [ ! -s "$TEMP_DIR/lume.tar.gz" ]; then 155 | echo "${RED}Error: Failed to download Lume.${NORMAL}" 156 | echo "The download URL may be incorrect or the file may not exist." 157 | exit 1 158 | fi 159 | 160 | # Verify the file is a valid archive 161 | if ! tar -tzf "$TEMP_DIR/lume.tar.gz" > /dev/null 2>&1; then 162 | echo "${RED}Error: The downloaded file is not a valid tar.gz archive.${NORMAL}" 163 | echo "Let's try the alternative URL..." 164 | 165 | # Try alternative URL 166 | ALT_DOWNLOAD_URL="https://github.com/$GITHUB_REPO/releases/latest/download/lume-$PLATFORM.tar.gz" 167 | echo "Downloading from alternative URL: $ALT_DOWNLOAD_URL" 168 | curl -L --progress-bar "$ALT_DOWNLOAD_URL" -o "$TEMP_DIR/lume.tar.gz" 169 | 170 | # Check again 171 | if ! tar -tzf "$TEMP_DIR/lume.tar.gz" > /dev/null 2>&1; then 172 | echo "${RED}Error: Could not download a valid Lume archive.${NORMAL}" 173 | echo "Please try installing Lume manually from: https://github.com/$GITHUB_REPO/releases/latest" 174 | exit 1 175 | fi 176 | fi 177 | else 178 | echo "${RED}Error: curl is required but not installed.${NORMAL}" 179 | exit 1 180 | fi 181 | } 182 | 183 | # Extract and install 184 | install_binary() { 185 | echo "Extracting archive..." 186 | tar -xzf "$TEMP_DIR/lume.tar.gz" -C "$TEMP_DIR" 187 | 188 | echo "Installing to $INSTALL_DIR..." 
189 | 190 | # Create install directory if it doesn't exist 191 | mkdir -p "$INSTALL_DIR" 192 | 193 | # Move the binary to the installation directory 194 | mv "$TEMP_DIR/lume" "$INSTALL_DIR/" 195 | 196 | # Make the binary executable 197 | chmod +x "$INSTALL_DIR/lume" 198 | 199 | echo "${GREEN}Installation complete!${NORMAL}" 200 | echo "Lume has been installed to ${BOLD}$INSTALL_DIR/lume${NORMAL}" 201 | 202 | # Check if the installation directory is in PATH 203 | if [ -n "${PATH##*$INSTALL_DIR*}" ]; then 204 | SHELL_NAME=$(basename "$SHELL") 205 | echo "${YELLOW}Warning: $INSTALL_DIR is not in your PATH.${NORMAL}" 206 | case "$SHELL_NAME" in 207 | zsh) 208 | echo "To add it, run:" 209 | echo " echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.zprofile" 210 | ;; 211 | bash) 212 | echo "To add it, run:" 213 | echo " echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.bash_profile" 214 | ;; 215 | fish) 216 | echo "To add it, run:" 217 | echo " echo 'fish_add_path $INSTALL_DIR' >> ~/.config/fish/config.fish" 218 | ;; 219 | *) 220 | echo "Add $INSTALL_DIR to your PATH in your shell profile file." 221 | ;; 222 | esac 223 | fi 224 | } 225 | 226 | # Main installation flow 227 | main() { 228 | check_permissions 229 | detect_platform 230 | create_temp_dir 231 | download_release 232 | install_binary 233 | 234 | echo "" 235 | echo "${GREEN}${BOLD}Lume has been successfully installed!${NORMAL}" 236 | echo "Run ${BOLD}lume${NORMAL} to get started." 237 | 238 | if [ "$INSTALL_BACKGROUND_SERVICE" = true ]; then 239 | # --- Setup background service (LaunchAgent) for Lume --- 240 | SERVICE_NAME="com.trycua.lume_daemon" 241 | PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" 242 | LUME_BIN="$INSTALL_DIR/lume" 243 | 244 | echo "" 245 | echo "Setting up LaunchAgent to run lume daemon on login..." 
        # Create LaunchAgents directory if it doesn't exist
        mkdir -p "$HOME/Library/LaunchAgents"

        # Unload existing service if present
        if [ -f "$PLIST_PATH" ]; then
            echo "Existing LaunchAgent found. Unloading..."
            launchctl unload "$PLIST_PATH" 2>/dev/null || true
        fi

        # Create the plist file.
        # The heredoc delimiter is deliberately unquoted so $SERVICE_NAME,
        # $LUME_BIN, $LUME_PORT and $HOME expand when the plist is written.
        cat <<EOF > "$PLIST_PATH"
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>$SERVICE_NAME</string>
    <key>ProgramArguments</key>
    <array>
        <string>$LUME_BIN</string>
        <string>serve</string>
        <string>--port</string>
        <string>$LUME_PORT</string>
    </array>
    <key>RunAtLoad</key>
    <true/>
    <key>KeepAlive</key>
    <true/>
    <key>WorkingDirectory</key>
    <string>$HOME</string>
    <key>EnvironmentVariables</key>
    <dict>
        <key>PATH</key>
        <string>/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$HOME/.local/bin</string>
        <key>HOME</key>
        <string>$HOME</string>
    </dict>
    <key>StandardOutPath</key>
    <string>/tmp/lume_daemon.log</string>
    <key>StandardErrorPath</key>
    <string>/tmp/lume_daemon.error.log</string>
    <key>ProcessType</key>
    <string>Interactive</string>
    <key>SessionType</key>
    <string>Aqua</string>
</dict>
</plist>
EOF

        # Set permissions: plist must be world-readable; pre-create the log
        # files so the daemon can append to them from the start.
        chmod 644 "$PLIST_PATH"
        touch /tmp/lume_daemon.log /tmp/lume_daemon.error.log
        chmod 644 /tmp/lume_daemon.log /tmp/lume_daemon.error.log

        # Load the LaunchAgent (unload first so a re-install reloads cleanly)
        echo "Loading LaunchAgent..."
        launchctl unload "$PLIST_PATH" 2>/dev/null || true
        launchctl load "$PLIST_PATH"

        echo "${GREEN}Lume daemon LaunchAgent installed and loaded.
It will start automatically on login!${NORMAL}" 307 | echo "To check status: launchctl list | grep $SERVICE_NAME" 308 | echo "To view logs: tail -f /tmp/lume_daemon.log" 309 | echo "" 310 | echo "To remove the lume daemon service, run:" 311 | echo " launchctl unload \"$PLIST_PATH\"" 312 | echo " rm \"$PLIST_PATH\"" 313 | else 314 | SERVICE_NAME="com.trycua.lume_daemon" 315 | PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" 316 | if [ -f "$PLIST_PATH" ]; then 317 | echo "Removing existing Lume background service (LaunchAgent)..." 318 | launchctl unload "$PLIST_PATH" 2>/dev/null || true 319 | rm "$PLIST_PATH" 320 | echo "Lume background service (LaunchAgent) removed." 321 | else 322 | echo "Skipping Lume background service (LaunchAgent) setup as requested (use --no-background-service)." 323 | fi 324 | fi 325 | } 326 | 327 | # Run the installation 328 | main 329 | ``` -------------------------------------------------------------------------------- /blog/hack-the-north.md: -------------------------------------------------------------------------------- ```markdown 1 | # What happens when hackathon judging is a public benchmark (Hack the North edition) 2 | 3 | *Written by Francesco Bonacci — Reviewed by Parth Patel (HUD W25) — Sept 25, 2025* 4 | 5 | ## Prologue 6 | 7 | Hack the North ran Sept 12–14 at the University of Waterloo. Official count this year: **1,778 hackers**, and a [Guinness World Record for the most people building interlocking plastic brick sculptures simultaneously](https://uwaterloo.ca/news/eweal-making-hackathons-fun-again-breaking-guinness-world-record). 8 | 9 | Our team arrived from Europe and the US one day before the hackathon, after a summer scattered post–YC X25, waiting for our O-1 visas. **HUD**’s founders Parth and Jay flew in from SF to help us run evaluations, and Michael and Parth from **Ollama** joined as co-sponsors. 
10 | 11 | Our plan was ambitious: run the **first state-of-the-art Computer-Use Agents track**, score it on a public benchmark, and give the top performer a guaranteed YC interview. (Interview ≠ offer. YC didn’t judge.) 12 | 13 | The rest, as they say, was a 36h story worth telling—and a playbook worth sharing for anyone thinking about running or sponsoring this type of hackathon track. 14 | 15 |  16 | 17 | ## The sign-up problem we had to invent 18 | 19 | We joined as a sponsor at the last minute, thanks to a push from our friend @Michael Chiang at Ollama—Waterloo alum, naturally. It’s kind of an open secret that UWaterloo turns out some of the sharpest hackers around (*no pun intended, HackMIT*). It was a bit of a scramble, but also great timing—our Agent framework had just finished a major refactor, with support for **100+ VLM configurations** now live. Naturally, we wanted to stress-test it at scale—and see whether teams could come up with SOTA-level setups. *This wasn’t a blank-slate, build-whatever-you-want kind of track.* 20 | 21 | From day one, though, we knew we’d have to fight for sign-ups. This was a niche track, and a guaranteed YC interview alone wouldn’t be enough to pull people in. 22 | 23 | Unfortunately, Hack the North (HTN) didn’t offer an interest form to help us estimate demand, which made capacity planning tricky—especially with early-stage infra. Stress-testing takes foresight, and multimodal language model usage is still costly (~1.5× to 3–4× the price of comparable text-only models). 24 | 25 | On top of that, we were discouraged from external promotion on [lu.ma](http://lu.ma). So we spun up our own sign-up page at **trycua.com/hackathon** and built ad-hoc Discord channels to share track details. We emphasized—repeatedly—that only students already accepted to Hack the North should register. 26 | 27 | *(Moral: the “measure-zero effect”—no matter how many times you say it, some people won’t see it. 
Plenty of invalid sign-ups still slipped through.)* 28 | 29 | Even so, having your own form is absolutely worth it: it gives you an **early funnel**, surfaces demand signals ahead of time, and—crucially—**lets you require platform sign-up before kickoff**. In our case, Hack the North didn’t provide Devpost access until the very end, so our form was the only way to build a working roster. 30 | 31 | Only a small trickle of sign-ups came through by the time the event kicked off—too few to plan around, but clearly the right kind of crowd. Several were already familiar with computer-use agents; one was even interning at Shopify, working on this space. 32 | 33 | ## At the Sponsor Booth 34 | 35 | Day 0 on campus made the difference. We arrived a couple of hours early to collect swag shipments (around 1,200 stickers of our new **Cua-la** mascot, plus t-shirts and hats—always plan ~1.5× the estimated number of hackers!). After walking the sponsor floor and explaining the track at our booth, ~40 hackers signed up. 36 | 37 | **Moral:** sponsor booths are still the most effective way to recruit for a track. 38 | 39 | **Suggestions to maximize booth time (for HTN this is only ~24 of the total 36 hours):** 40 | 41 | - **Be unmistakable.** Run a mini-challenge and a visible giveaway. We offered 5 × $200 Anthropic credits as a lightning raffle and constantly advertised in HTN Slack. Shout-out to our neighbors at **Mintlify**, who dressed their teammate as a mint plant - memorable and effective. 42 | - **Create multiple touchpoints.** Hand out flyers and QR codes, and ask nearby booths to cross-refer. Big thanks to the YC team for flyer space and student connections - and to Michael (Ollama) for pointing visitors our way. 43 | - **Never leave the booth empty.** Keep someone at the booth at all times and rotate shifts. With four founding engineers on-site, coverage was easy. 
Even after hacking kicked off, the booth stayed a point of reference - and even then multiple participants DM’d us asking where to meet up. 44 | - **Students are organic DevRel.** Our runner-up, Adam, hung out with us at the booth, pulling more people in. Peer-to-peer energy creates the network effect you need! 45 | 46 |  47 | 48 | *(Our Founding Engineer, Morgan, hangs out with students at the stand, while Adam (runner-up) hacks on the side.)* 49 | 50 | ## 02:30 a.m. is still prime time at a hackathon 51 | 52 | Hack the North gives sponsors a 30-minute API Workshop during the early hours of the event—a perfect moment to shift from talking to building. 53 | 54 | Our slot landed at **2:30 a.m.** (*perks of the cheapest sponsor tier*). Thirty students showed up, energy surprisingly high. James, our new Founding DevRel Engineer, led the session and nailed it. 55 | 56 | **Our track rules were simple:** 57 | 58 | 1. Build a Computer-Use Agent with the [Cua framework](https://github.com/trycua/cua) 59 | 2. Benchmark the agent on [HUD](https://www.hud.so) 60 | 3. Use [OSWorld-Tiny](https://huggingface.co/datasets/ddupont/OSWorld-Tiny-Public): a 14-task distillation of the full benchmark (~360 tasks, >1h) 61 | 62 | **Suggestions:** 63 | 64 | - **Leave something tangible.** We provided a Jupyter Notebook teams could run immediately. 65 | - **Narrow scope, strong starts.** The more focused the challenge, the more **robust starting points** you should provide. 66 | - **Want the details?** [Here’s the notebook we left participants](https://github.com/trycua/cua/blob/main/notebooks/sota_hackathon.ipynb). 67 | 68 |  69 | 70 | *(Our CUA Workshop at 2:30 AM.)* 71 | 72 | ## Making it possible to focus on the work 73 | 74 | If you’re an OSS framework, it’s tempting to have hackers self-host on laptops. **Don’t.** You’ll spend the workshop debugging setups instead of reviewing ideas. 75 | 76 | **Lesson learned:** within hours, we shifted to **cloud-only Sandboxes**. 
Payoff: consistent environments, faster starts, far less tech support. 77 | 78 | We provided: 79 | 80 | - **Credits:** $200 Cua Cloud + $200 HUD per team (manual top-ups for visible progress) 81 | - **LLMs/VLMs:** Anthropic assigned $50 per participant—tight for VLM iteration—so we added capped access under our org 82 | - **Pre-kickoff provisioning:** Platform sign-up auto-created projects, keys, and sandboxes 83 | 84 | **Takeaway:** every minute not spent on setup is a minute gained for iterating. 85 | 86 | ## 12 Hours in the Hackathon 87 | 88 | **After the workshop buzz.** Morning interest was high, but Docker setup + requiring focus on a single track thinned the crowd. Most sponsor prizes are broad (“use our product and you qualify”), letting students stack tracks. Ours required commitment. Upside: those who stayed shipped sharper, higher-quality submissions. 89 | 90 | **The bell curve of submissions.** Most entries used *claude-sonnet-4-20250514*—proof that docs and public leaderboards ([OSWorld](https://os-world.github.io/#benchmark)) guide choices. Results clustered around the safe pick, with fewer pushing boundaries. 91 | 92 | **Who went beyond the baseline.** A few tried multi-agent/tool graphs. One standout—[**cuala**](https://github.com/YeIIcw/cuala)—was a clean reference: deterministic actions, verifiable state changes, callbacks for saving images and trajectories. 93 | 94 | **Bottom line:** Early excitement is easy; keeping teams engaged requires reducing friction and offering multiple entry points. 95 | 96 | ### What broke (and why) 97 | 98 | We skipped a full end-to-end **Cua × HUD** dry-run. It showed. 99 | 100 | - Hackers ran out of inference credits. Desktop tasks are token-heavy. A full OSWorld run (200 max steps) for *computer-use-preview* (OpenAI Operator API) can cost >$600. Serious attempts: ~400k tokens × 14 tasks. 101 | - Python version/build mismatches surfaced, requiring debug time across both OSS repos. 
102 | - Our Cua framework lacked a **Response Agent** to complete evaluation loops. Some runs stalled until patched. 103 | 104 | ## Scoring and Results 105 | 106 | ### Participation & Outcomes 107 | 108 | - ~**30** hackers gave the track a serious try; **5** crossed the finish line 109 | - All submissions were **solo**, mostly undergrads 110 | - Judging: OSWorld-Tiny on HUD, with Cua + HUD reruns to verify scores 111 | - Final leaderboard: [HUD Leaderboard](https://www.hud.so/leaderboards/ddupont/OSWorld-Tiny-Public) 112 | 113 |  114 | 115 | *(Leaderboard on HUD)* 116 | 117 | ### Winners 118 | 119 | **🥇 Winner — Ram** 120 | - Devpost: https://devpost.com/software/sota-computer-use-agent-challenge 121 | - Code: https://github.com/Ram-Raghav-S/cua/tree/ram 122 | - Score: 68.3% 123 | 124 | **🥈 Runner-up — Aryan** 125 | - Devpost: https://devpost.com/software/loopdeloop-computer-use-agent-sota-attempt 126 | - Code: https://github.com/Tumph/cua 127 | - Score: 55.9% 128 | 129 | **🥉 Special Mention — Adam** 130 | - Devpost: https://devpost.com/software/cuala 131 | - Code: https://github.com/YeIIcw/cuala 132 | - Score: 42.1% 133 | 134 |  135 | 136 | *(Our finalists before the award ceremony)* 137 | 138 | ## What We’d Keep 139 | 140 | - **Sponsor Hack the North again** 141 | - **Keep a visible, staffed booth** 142 | - **Publish a compact FAQ** 143 | - **Simple, transparent scoring** 144 | 145 | ## What We’d Change 146 | 147 | - **Run a full Cua × HUD dry-run under load** 148 | - **Offer multiple on-ramps (evals, creative, RL)** 149 | - **Keep a private eval set for judging** 150 | - **Default to cloud sandboxes** 151 | - **Handle ops earlier (swag, signage, QR codes)** 152 | - **Reward generalization, not lucky runs** 153 | 154 | ## Closing Thoughts 155 | 156 | Our first outing as sponsors wasn’t perfect, but it gave us a working playbook: **provision cloud early, keep scoring simple, always dry-run infra, and make the booth unforgettable**. 
157 | 158 | If more hackathon tracks leaned on **public benchmarks**, weekends like this would produce fewer demos-for-show and more measurable progress. 159 | 160 | **P.S.** Huge thanks to the Ollama and HUD teams for co-sponsoring the track, and to our YC Partner Diana for offering a **guaranteed YC interview** as first prize. 161 | 162 | Whether you’re a hacker who wants to participate, or a company looking to sponsor, let’s talk — we’re especially excited to support benchmark-first hackathon tracks in the Bay Area this year. 163 | 164 |  165 | 166 | *(HTN Closing Ceremony — Cua Track Winner Announcement)* ``` -------------------------------------------------------------------------------- /libs/typescript/computer/src/interface/base.ts: -------------------------------------------------------------------------------- ```typescript 1 | /** 2 | * Base interface for computer control. 3 | */ 4 | 5 | import pino from 'pino'; 6 | import WebSocket from 'ws'; 7 | import type { ScreenSize } from '../types'; 8 | 9 | export type MouseButton = 'left' | 'middle' | 'right'; 10 | 11 | export interface CursorPosition { 12 | x: number; 13 | y: number; 14 | } 15 | 16 | export interface AccessibilityNode { 17 | role: string; 18 | title?: string; 19 | value?: string; 20 | description?: string; 21 | bounds?: { 22 | x: number; 23 | y: number; 24 | width: number; 25 | height: number; 26 | }; 27 | children?: AccessibilityNode[]; 28 | } 29 | 30 | /** 31 | * Base class for computer control interfaces. 
 */
export abstract class BaseComputerInterface {
  protected ipAddress: string;
  protected username: string;
  protected password: string;
  // Set by disconnect(); suppresses the auto-reconnect scheduled from
  // connect()'s 'close' handler.
  protected closed = false;
  // Serializes sendCommand() calls: each command chains onto this promise.
  protected commandLock: Promise<unknown> = Promise.resolve();
  protected ws: WebSocket;
  protected apiKey?: string;
  protected vmName?: string;

  protected logger = pino({ name: 'computer.interface-base' });

  constructor(
    ipAddress: string,
    username = 'lume',
    password = 'lume',
    apiKey?: string,
    vmName?: string
  ) {
    this.ipAddress = ipAddress;
    this.username = username;
    this.password = password;
    this.apiKey = apiKey;
    this.vmName = vmName;

    // Initialize WebSocket with headers if needed (cloud auth: both the API
    // key and the VM name must be present for the headers to be sent)
    const headers: { [key: string]: string } = {};
    if (this.apiKey && this.vmName) {
      headers['X-API-Key'] = this.apiKey;
      headers['X-VM-Name'] = this.vmName;
    }

    // Create the WebSocket instance
    this.ws = new WebSocket(this.wsUri, { headers });
  }

  /**
   * Get the WebSocket URI for connection.
   * Subclasses can override this to customize the URI.
   *
   * With an API key the secure scheme/port (wss/8443) is used, otherwise
   * plain ws on port 8000; an explicit `host:port` in ipAddress takes
   * precedence over the default port.
   */
  protected get wsUri(): string {
    const protocol = this.apiKey ? 'wss' : 'ws';

    // Check if ipAddress already includes a port
    if (this.ipAddress.includes(':')) {
      return `${protocol}://${this.ipAddress}/ws`;
    }

    // Otherwise, append the default port
    const port = this.apiKey ? '8443' : '8000';
    return `${protocol}://${this.ipAddress}:${port}/ws`;
  }

  /**
   * Wait for interface to be ready.
   * @param timeout Maximum time to wait in seconds
   * @throws Error if interface is not ready within timeout
   */
  async waitForReady(timeout = 60): Promise<void> {
    const startTime = Date.now();

    // Retry connect() (connection + auth handshake) once a second until the
    // deadline passes.
    while (Date.now() - startTime < timeout * 1000) {
      try {
        await this.connect();
        return;
      } catch (error) {
        console.log(error);
        // Wait a bit before retrying
        this.logger.error(
          `Error connecting to websocket: ${JSON.stringify(error)}`
        );
        await new Promise((resolve) => setTimeout(resolve, 1000));
      }
    }

    throw new Error(`Interface not ready after ${timeout} seconds`);
  }

  /**
   * Authenticate with the WebSocket server.
   * This should be called immediately after the WebSocket connection is established.
   *
   * No-op unless both an API key and a VM name were provided; otherwise sends
   * an `authenticate` command and resolves/rejects on the server's reply.
   */
  private async authenticate(): Promise<void> {
    if (!this.apiKey || !this.vmName) {
      // No authentication needed
      return;
    }

    this.logger.info('Performing authentication handshake...');
    const authMessage = {
      command: 'authenticate',
      params: {
        api_key: this.apiKey,
        container_name: this.vmName,
      },
    };

    return new Promise<void>((resolve, reject) => {
      // One-shot reply handler; detached on success or parse error.
      const authHandler = (data: WebSocket.RawData) => {
        try {
          const authResult = JSON.parse(data.toString());
          if (!authResult.success) {
            const errorMsg = authResult.error || 'Authentication failed';
            this.logger.error(`Authentication failed: ${errorMsg}`);
            // NOTE(review): the handler is not removed on this branch; the
            // ws.close() below makes that mostly harmless, but worth confirming.
            this.ws.close();
            reject(new Error(`Authentication failed: ${errorMsg}`));
          } else {
            this.logger.info('Authentication successful');
            this.ws.off('message', authHandler);
            resolve();
          }
        } catch (error) {
          this.ws.off('message', authHandler);
          reject(error);
        }
      };

      this.ws.on('message', authHandler);
      this.ws.send(JSON.stringify(authMessage));
    });
} 154 | 155 | /** 156 | * Connect to the WebSocket server. 157 | */ 158 | public async connect(): Promise<void> { 159 | // If the WebSocket is already open, check if we need to authenticate 160 | if (this.ws.readyState === WebSocket.OPEN) { 161 | this.logger.info( 162 | 'Websocket is open, ensuring authentication is complete.' 163 | ); 164 | return this.authenticate(); 165 | } 166 | 167 | // If the WebSocket is closed or closing, reinitialize it 168 | if ( 169 | this.ws.readyState === WebSocket.CLOSED || 170 | this.ws.readyState === WebSocket.CLOSING 171 | ) { 172 | this.logger.info('Websocket is closed. Reinitializing connection.'); 173 | const headers: { [key: string]: string } = {}; 174 | if (this.apiKey && this.vmName) { 175 | headers['X-API-Key'] = this.apiKey; 176 | headers['X-VM-Name'] = this.vmName; 177 | } 178 | this.ws = new WebSocket(this.wsUri, { headers }); 179 | return this.authenticate(); 180 | } 181 | 182 | // Connect and authenticate 183 | return new Promise((resolve, reject) => { 184 | const onOpen = async () => { 185 | try { 186 | // Always authenticate immediately after connection 187 | await this.authenticate(); 188 | resolve(); 189 | } catch (error) { 190 | reject(error); 191 | } 192 | }; 193 | 194 | // If already connecting, wait for it to complete then authenticate 195 | if (this.ws.readyState === WebSocket.CONNECTING) { 196 | this.ws.addEventListener('open', onOpen, { once: true }); 197 | this.ws.addEventListener('error', (error) => reject(error), { 198 | once: true, 199 | }); 200 | return; 201 | } 202 | 203 | // Set up event handlers 204 | this.ws.on('open', onOpen); 205 | 206 | this.ws.on('error', (error: Error) => { 207 | reject(error); 208 | }); 209 | 210 | this.ws.on('close', () => { 211 | if (!this.closed) { 212 | // Attempt to reconnect 213 | setTimeout(() => this.connect(), 1000); 214 | } 215 | }); 216 | }); 217 | } 218 | 219 | /** 220 | * Send a command to the WebSocket server. 
   */
  public async sendCommand(
    command: string,
    params: { [key: string]: unknown } = {}
  ): Promise<{ [key: string]: unknown }> {
    // Create a new promise for this specific command
    const commandPromise = new Promise<{ [key: string]: unknown }>(
      (resolve, reject) => {
        // Chain it to the previous commands
        const executeCommand = async (): Promise<{
          [key: string]: unknown;
        }> => {
          if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
            await this.connect();
          }

          return new Promise<{ [key: string]: unknown }>(
            (innerResolve, innerReject) => {
              // Settles on the first message received after the send.
              // NOTE(review): this assumes the server replies in order and
              // never pushes unsolicited messages; commandLock keeps only one
              // handler live at a time — confirm the server side honors this.
              const messageHandler = (data: WebSocket.RawData) => {
                try {
                  const response = JSON.parse(data.toString());
                  if (response.error) {
                    innerReject(new Error(response.error));
                  } else {
                    innerResolve(response);
                  }
                } catch (error) {
                  innerReject(error);
                }
                // Always detach: each command consumes exactly one message.
                this.ws.off('message', messageHandler);
              };

              this.ws.on('message', messageHandler);
              const wsCommand = { command, params };
              this.ws.send(JSON.stringify(wsCommand));
            }
          );
        };

        // Add this command to the lock chain so commands run strictly one
        // after another; then(resolve, reject) forwards the outcome to the
        // caller while keeping the chain itself non-rejecting.
        this.commandLock = this.commandLock.then(() =>
          executeCommand().then(resolve, reject)
        );
      }
    );

    return commandPromise;
  }

  /**
   * Check if the WebSocket is connected.
   */
  public isConnected(): boolean {
    return this.ws && this.ws.readyState === WebSocket.OPEN;
  }

  /**
   * Close the interface connection.
   */
  disconnect(): void {
    // Mark as intentionally closed so connect()'s 'close' handler does not
    // schedule a reconnect.
    this.closed = true;
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      this.ws.close();
    } else if (this.ws && this.ws.readyState === WebSocket.CONNECTING) {
      // If still connecting, terminate the connection attempt
      this.ws.terminate();
    }
  }

  /**
   * Force close the interface connection.
   * By default, this just calls close(), but subclasses can override
   * to provide more forceful cleanup.
   */
  forceClose(): void {
    this.disconnect();
  }

  // Mouse Actions
  abstract mouseDown(
    x?: number,
    y?: number,
    button?: MouseButton
  ): Promise<void>;
  abstract mouseUp(x?: number, y?: number, button?: MouseButton): Promise<void>;
  abstract leftClick(x?: number, y?: number): Promise<void>;
  abstract rightClick(x?: number, y?: number): Promise<void>;
  abstract doubleClick(x?: number, y?: number): Promise<void>;
  abstract moveCursor(x: number, y: number): Promise<void>;
  abstract dragTo(
    x: number,
    y: number,
    button?: MouseButton,
    duration?: number
  ): Promise<void>;
  abstract drag(
    path: Array<[number, number]>,
    button?: MouseButton,
    duration?: number
  ): Promise<void>;

  // Keyboard Actions
  abstract keyDown(key: string): Promise<void>;
  abstract keyUp(key: string): Promise<void>;
  abstract typeText(text: string): Promise<void>;
  abstract pressKey(key: string): Promise<void>;
  abstract hotkey(...keys: string[]): Promise<void>;

  // Scrolling Actions
  abstract scroll(x: number, y: number): Promise<void>;
  abstract scrollDown(clicks?: number): Promise<void>;
  abstract scrollUp(clicks?: number): Promise<void>;

  // Screen Actions
  abstract screenshot(): Promise<Buffer>;
  abstract getScreenSize(): Promise<ScreenSize>;
  abstract
getCursorPosition(): Promise<CursorPosition>; 338 | 339 | // Clipboard Actions 340 | abstract copyToClipboard(): Promise<string>; 341 | abstract setClipboard(text: string): Promise<void>; 342 | 343 | // File System Actions 344 | abstract fileExists(path: string): Promise<boolean>; 345 | abstract directoryExists(path: string): Promise<boolean>; 346 | abstract listDir(path: string): Promise<string[]>; 347 | abstract readText(path: string): Promise<string>; 348 | abstract writeText(path: string, content: string): Promise<void>; 349 | abstract readBytes(path: string): Promise<Buffer>; 350 | abstract writeBytes(path: string, content: Buffer): Promise<void>; 351 | abstract deleteFile(path: string): Promise<void>; 352 | abstract createDir(path: string): Promise<void>; 353 | abstract deleteDir(path: string): Promise<void>; 354 | abstract runCommand(command: string): Promise<[string, string]>; 355 | 356 | // Accessibility Actions 357 | abstract getAccessibilityTree(): Promise<AccessibilityNode>; 358 | abstract toScreenCoordinates(x: number, y: number): Promise<[number, number]>; 359 | abstract toScreenshotCoordinates( 360 | x: number, 361 | y: number 362 | ): Promise<[number, number]>; 363 | } 364 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/integrations/hud/proxy.py: -------------------------------------------------------------------------------- ```python 1 | """HUD ComputerAgent wrapper and Fake AsyncOpenAI client. 2 | 3 | Provides FakeAsyncOpenAI that adapts our ComputerAgent to the OpenAI Responses 4 | interface needed by HUD's OperatorAgent. It implements only `responses.create` 5 | and returns an OpenAI Response object with `id` and `output` fields, where `output` is a list of 6 | OpenAI-like response blocks. We intentionally only support a single-step call 7 | by consuming the first yielded result from `ComputerAgent.run()`. 
8 | """ 9 | 10 | import traceback 11 | import time 12 | import uuid 13 | from typing import Any, Dict, List, Optional 14 | 15 | from agent.agent import ComputerAgent as BaseComputerAgent 16 | from agent.callbacks import PromptInstructionsCallback 17 | from hud.tools.computer.settings import computer_settings 18 | from PIL import Image 19 | from hud.agents import OperatorAgent 20 | 21 | # OpenAI Responses typed models (required) 22 | from openai.types.responses import ( 23 | Response, 24 | ResponseInputParam, 25 | ResponseOutputItem, 26 | ResponseComputerToolCall, 27 | ResponseOutputMessage, 28 | ResponseOutputText, 29 | ResponseReasoningItem, 30 | ResponseUsage, 31 | ) 32 | 33 | def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]: 34 | """Map our agent output items to OpenAI ResponseOutputItem typed models. 35 | 36 | Only a subset is supported: computer_call, assistant message (text), and reasoning. 37 | Unknown types are ignored. 38 | """ 39 | blocks: List[ResponseOutputItem] = [] 40 | for item in output_items or []: 41 | t = item.get("type") 42 | if t == "computer_call": 43 | comp = ResponseComputerToolCall.model_validate({ 44 | "id": item.get("id") or f"cu_{uuid.uuid4().hex}", 45 | "type": "computer_call", 46 | "call_id": item["call_id"], 47 | "action": item["action"], 48 | "pending_safety_checks": item.get("pending_safety_checks", []), 49 | "status": "completed", 50 | }) 51 | blocks.append(comp) 52 | # we will exit early here as the responses api only supports a single step 53 | break 54 | elif t == "message" and item.get("role") == "assistant": 55 | content_blocks: List[ResponseOutputText] = [] 56 | for c in item.get("content", []) or []: 57 | content_blocks.append( 58 | ResponseOutputText.model_validate({ 59 | "type": "output_text", 60 | "text": c["text"], 61 | "annotations": [], 62 | }) 63 | ) 64 | if content_blocks: 65 | msg = ResponseOutputMessage.model_validate({ 66 | "id": item.get("id") or 
f"msg_{uuid.uuid4()}", 67 | "type": "message", 68 | "role": "assistant", 69 | "status": "completed", 70 | "content": [ct.model_dump() for ct in content_blocks], 71 | }) 72 | blocks.append(msg) 73 | elif t == "reasoning": 74 | reasoning = ResponseReasoningItem.model_validate({ 75 | "id": item.get("id") or f"rsn_{uuid.uuid4()}", 76 | "type": "reasoning", 77 | "summary": item["summary"], 78 | }) 79 | blocks.append(reasoning) 80 | # Unhandled types are ignored 81 | return blocks 82 | 83 | def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]: 84 | out: List[Dict[str, Any]] = [] 85 | for it in list(items): 86 | if hasattr(it, "model_dump"): 87 | out.append(it.model_dump()) # type: ignore[attr-defined] 88 | elif isinstance(it, dict): 89 | out.append(it) 90 | else: 91 | # Strict: rely on default __dict__ if present 92 | out.append(dict(it)) # may raise if not mapping 93 | return out 94 | 95 | class FakeAsyncOpenAI: 96 | """Minimal fake OpenAI client with only `responses.create` implemented. 97 | 98 | It uses a provided `ComputerAgent` instance to produce a single-step 99 | response compatible with HUD's OperatorAgent loop. 
class FakeAsyncOpenAI:
    """Minimal fake OpenAI client with only `responses.create` implemented.

    It uses a provided `ComputerAgent` instance to produce a single-step
    response compatible with HUD's OperatorAgent loop.
    """

    def __init__(self, computer_agent: BaseComputerAgent) -> None:
        # The wrapped agent that actually produces each single-step response.
        self._agent = computer_agent
        # Mirrors the `client.responses.create(...)` shape of the real SDK.
        self.responses = self._Responses(self)

    class _Responses:
        def __init__(self, parent: "FakeAsyncOpenAI") -> None:
            # Caches for cross-call context when using previous_response_id
            # blocks_cache: block id -> input/output block (typed model or dict)
            self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {}
            # context_cache: response id -> ordered block ids for that turn
            self.context_cache: Dict[str, List[str]] = {}
            self.agent = parent._agent

        async def create(
            self,
            *,
            model: str,
            input: ResponseInputParam,
            tools: Optional[List[Dict[str, Any]]] = None,
            instructions: Optional[str] = None,
            previous_response_id: Optional[str] = None,
            max_retries: int = 5,
            **_: Any,
        ) -> Any:
            # Runs one ComputerAgent step and shapes it as an OpenAI Response.
            # On Response validation failure the whole step (agent run
            # included) is retried up to max_retries times.
            for attempt in range(max_retries):
                # Prepend cached blocks from previous_response_id to input
                full_input = input
                if previous_response_id is not None:
                    # NOTE(review): an unknown previous_response_id raises
                    # KeyError here — presumably callers only pass ids this
                    # method returned earlier; confirm against OperatorAgent.
                    prev_block_ids = self.context_cache[previous_response_id]
                    prev_blocks = [self.blocks_cache[b_id] for b_id in prev_block_ids]
                    full_input = _to_plain_dict_list(prev_blocks + input)

                # Pre-pend instructions message
                effective_input = full_input
                if instructions:
                    effective_input = [{
                        "role": "user",
                        "content": instructions,
                    }] + full_input

                # Run a single iteration of the ComputerAgent
                agent_result: Optional[Dict[str, Any]] = None
                async for result in self.agent.run(effective_input):  # type: ignore[arg-type]
                    agent_result = result
                    break
                assert agent_result is not None, "Agent failed to produce result"

                output = _map_agent_output_to_openai_blocks(agent_result["output"])
                usage = agent_result["usage"]

                # Cache conversation context using the last response id
                block_ids: List[str] = []
                blocks_to_cache = full_input + output
                for b in blocks_to_cache:
                    # Plain dicts have no `id` attribute, so they fall back to
                    # a repr-hash temporary id; typed blocks keep their own id.
                    bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
                    self.blocks_cache[bid] = b  # type: ignore[assignment]
                    block_ids.append(bid)
                response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
                self.context_cache[response_id] = block_ids

                try:
                    return Response.model_validate({
                        "id": response_id,
                        "created_at": time.time(),
                        "object": "response",
                        "model": model,
                        "output": output,
                        "parallel_tool_calls": False,
                        "tool_choice": "auto",
                        "tools": [],
                        "previous_response_id": previous_response_id,
                        "usage": ResponseUsage.model_validate({
                            "input_tokens": usage.get("input_tokens", 0),
                            "output_tokens": usage.get("output_tokens", 0),
                            "total_tokens": usage.get("total_tokens", 0),
                            "input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }),
                            "output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }),
                        }),
                    })
                except Exception as e:
                    # Validation failed: log, and only surface the error once
                    # the final retry is exhausted.
                    print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
                    if attempt == max_retries - 1:
                        print(traceback.format_exc())
                        raise e


# ---------------------------------------------------------------------------
# Proxy OperatorAgent (moved from __init__.py)
# ---------------------------------------------------------------------------


class ProxyOperatorAgent(OperatorAgent):
    """OperatorAgent that proxies model calls through our ComputerAgent.

    Accepts the same config keys we pass via hud.run_dataset `agent_config`:
    - model: str | None
    - allowed_tools: list[str] | None
    Additional kwargs are forwarded to OperatorAgent (if any are supported).
198 | """ 199 | 200 | def __init__( 201 | self, 202 | *, 203 | model: str | None = None, 204 | allowed_tools: list[str] | None = None, 205 | trajectory_dir: str | dict | None = None, 206 | # === ComputerAgent kwargs === 207 | tools: list[Any] | None = None, 208 | custom_loop: Any | None = None, 209 | only_n_most_recent_images: int | None = None, 210 | callbacks: list[Any] | None = None, 211 | instructions: str | None = None, 212 | verbosity: int | None = None, 213 | max_retries: int | None = 3, 214 | screenshot_delay: float | int = 0.5, 215 | use_prompt_caching: bool | None = False, 216 | max_trajectory_budget: float | dict | None = None, 217 | telemetry_enabled: bool | None = True, 218 | **kwargs: Any, 219 | ) -> None: 220 | model = model or "computer-use-preview" 221 | allowed_tools = allowed_tools or ["openai_computer"] 222 | 223 | computer_shim = { 224 | 'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)), 225 | 'environment': 'linux', 226 | 'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT) 227 | } 228 | # Build tools ensuring the computer_shim is included 229 | agent_tools: list[Any] = [computer_shim] 230 | if tools: 231 | agent_tools.extend(tools) 232 | 233 | # Build callbacks, injecting prompt instructions if provided 234 | agent_callbacks = list(callbacks or []) 235 | if instructions: 236 | agent_callbacks.append(PromptInstructionsCallback(instructions)) 237 | 238 | computer_agent = BaseComputerAgent( 239 | model=model, 240 | tools=agent_tools, 241 | custom_loop=custom_loop, 242 | only_n_most_recent_images=only_n_most_recent_images, 243 | callbacks=agent_callbacks, 244 | verbosity=verbosity, 245 | trajectory_dir=trajectory_dir, 246 | max_retries=max_retries, 247 | screenshot_delay=screenshot_delay, 248 | use_prompt_caching=use_prompt_caching, 249 | max_trajectory_budget=max_trajectory_budget, 250 | telemetry_enabled=telemetry_enabled, 251 
| ) 252 | model_client = FakeAsyncOpenAI(computer_agent) 253 | 254 | super().__init__( 255 | model_client=model_client, # type: ignore[arg-type] 256 | model=model, 257 | allowed_tools=allowed_tools, 258 | **kwargs, 259 | ) 260 | 261 | __all__ = [ 262 | "FakeAsyncOpenAI", 263 | "ProxyOperatorAgent", 264 | ] 265 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/callbacks/logging.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Logging callback for ComputerAgent that provides configurable logging of agent lifecycle events. 3 | """ 4 | 5 | import json 6 | import logging 7 | from typing import Dict, List, Any, Optional, Union 8 | from .base import AsyncCallbackHandler 9 | 10 | 11 | def sanitize_image_urls(data: Any) -> Any: 12 | """ 13 | Recursively search for 'image_url' keys and set their values to '[omitted]'. 14 | 15 | Args: 16 | data: Any data structure (dict, list, or primitive type) 17 | 18 | Returns: 19 | A deep copy of the data with all 'image_url' values replaced with '[omitted]' 20 | """ 21 | if isinstance(data, dict): 22 | # Create a copy of the dictionary 23 | sanitized = {} 24 | for key, value in data.items(): 25 | if key == "image_url": 26 | sanitized[key] = "[omitted]" 27 | else: 28 | # Recursively sanitize the value 29 | sanitized[key] = sanitize_image_urls(value) 30 | return sanitized 31 | 32 | elif isinstance(data, list): 33 | # Recursively sanitize each item in the list 34 | return [sanitize_image_urls(item) for item in data] 35 | 36 | else: 37 | # For primitive types (str, int, bool, None, etc.), return as-is 38 | return data 39 | 40 | 41 | class LoggingCallback(AsyncCallbackHandler): 42 | """ 43 | Callback handler that logs agent lifecycle events with configurable verbosity. 
44 | 45 | Logging levels: 46 | - DEBUG: All events including API calls, message preprocessing, and detailed outputs 47 | - INFO: Major lifecycle events (start/end, messages, outputs) 48 | - WARNING: Only warnings and errors 49 | - ERROR: Only errors 50 | """ 51 | 52 | def __init__(self, logger: Optional[logging.Logger] = None, level: int = logging.INFO): 53 | """ 54 | Initialize the logging callback. 55 | 56 | Args: 57 | logger: Logger instance to use. If None, creates a logger named 'agent.ComputerAgent' 58 | level: Logging level (logging.DEBUG, logging.INFO, etc.) 59 | """ 60 | self.logger = logger or logging.getLogger('agent.ComputerAgent') 61 | self.level = level 62 | 63 | # Set up logger if it doesn't have handlers 64 | if not self.logger.handlers: 65 | handler = logging.StreamHandler() 66 | formatter = logging.Formatter( 67 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 68 | ) 69 | handler.setFormatter(formatter) 70 | self.logger.addHandler(handler) 71 | self.logger.setLevel(level) 72 | 73 | def _update_usage(self, usage: Dict[str, Any]) -> None: 74 | """Update total usage statistics.""" 75 | def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None: 76 | for key, value in source.items(): 77 | if isinstance(value, dict): 78 | if key not in target: 79 | target[key] = {} 80 | add_dicts(target[key], value) 81 | else: 82 | if key not in target: 83 | target[key] = 0 84 | target[key] += value 85 | add_dicts(self.total_usage, usage) 86 | 87 | async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None: 88 | """Called before the run starts.""" 89 | self.total_usage = {} 90 | 91 | async def on_usage(self, usage: Dict[str, Any]) -> None: 92 | """Called when usage information is received.""" 93 | self._update_usage(usage) 94 | 95 | async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None: 96 | """Called after the run ends.""" 97 | def 
format_dict(d, indent=0): 98 | lines = [] 99 | prefix = f" - {' ' * indent}" 100 | for key, value in d.items(): 101 | if isinstance(value, dict): 102 | lines.append(f"{prefix}{key}:") 103 | lines.extend(format_dict(value, indent + 1)) 104 | elif isinstance(value, float): 105 | lines.append(f"{prefix}{key}: ${value:.4f}") 106 | else: 107 | lines.append(f"{prefix}{key}: {value}") 108 | return lines 109 | 110 | formatted_output = "\n".join(format_dict(self.total_usage)) 111 | self.logger.info(f"Total usage:\n{formatted_output}") 112 | 113 | async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 114 | """Called before LLM processing starts.""" 115 | if self.logger.isEnabledFor(logging.INFO): 116 | self.logger.info(f"LLM processing started with {len(messages)} messages") 117 | if self.logger.isEnabledFor(logging.DEBUG): 118 | sanitized_messages = [sanitize_image_urls(msg) for msg in messages] 119 | self.logger.debug(f"LLM input messages: {json.dumps(sanitized_messages, indent=2)}") 120 | return messages 121 | 122 | async def on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 123 | """Called after LLM processing ends.""" 124 | if self.logger.isEnabledFor(logging.DEBUG): 125 | sanitized_messages = [sanitize_image_urls(msg) for msg in messages] 126 | self.logger.debug(f"LLM output: {json.dumps(sanitized_messages, indent=2)}") 127 | return messages 128 | 129 | async def on_computer_call_start(self, item: Dict[str, Any]) -> None: 130 | """Called when a computer call starts.""" 131 | action = item.get("action", {}) 132 | action_type = action.get("type", "unknown") 133 | action_args = {k: v for k, v in action.items() if k != "type"} 134 | 135 | # INFO level logging for the action 136 | self.logger.info(f"Computer: {action_type}({action_args})") 137 | 138 | # DEBUG level logging for full details 139 | if self.logger.isEnabledFor(logging.DEBUG): 140 | self.logger.debug(f"Computer call started: {json.dumps(action, indent=2)}") 
141 | 142 | async def on_computer_call_end(self, item: Dict[str, Any], result: Any) -> None: 143 | """Called when a computer call ends.""" 144 | if self.logger.isEnabledFor(logging.DEBUG): 145 | action = item.get("action", "unknown") 146 | self.logger.debug(f"Computer call completed: {json.dumps(action, indent=2)}") 147 | if result: 148 | sanitized_result = sanitize_image_urls(result) 149 | self.logger.debug(f"Computer call result: {json.dumps(sanitized_result, indent=2)}") 150 | 151 | async def on_function_call_start(self, item: Dict[str, Any]) -> None: 152 | """Called when a function call starts.""" 153 | name = item.get("name", "unknown") 154 | arguments = item.get("arguments", "{}") 155 | 156 | # INFO level logging for the function call 157 | self.logger.info(f"Function: {name}({arguments})") 158 | 159 | # DEBUG level logging for full details 160 | if self.logger.isEnabledFor(logging.DEBUG): 161 | self.logger.debug(f"Function call started: {name}") 162 | 163 | async def on_function_call_end(self, item: Dict[str, Any], result: Any) -> None: 164 | """Called when a function call ends.""" 165 | # INFO level logging for function output (similar to function_call_output) 166 | if result: 167 | # Handle both list and direct result formats 168 | if isinstance(result, list) and len(result) > 0: 169 | output = result[0].get("output", str(result)) if isinstance(result[0], dict) else str(result[0]) 170 | else: 171 | output = str(result) 172 | 173 | # Truncate long outputs 174 | if len(output) > 100: 175 | output = output[:100] + "..." 
176 | 177 | self.logger.info(f"Output: {output}") 178 | 179 | # DEBUG level logging for full details 180 | if self.logger.isEnabledFor(logging.DEBUG): 181 | name = item.get("name", "unknown") 182 | self.logger.debug(f"Function call completed: {name}") 183 | if result: 184 | self.logger.debug(f"Function call result: {json.dumps(result, indent=2)}") 185 | 186 | async def on_text(self, item: Dict[str, Any]) -> None: 187 | """Called when a text message is encountered.""" 188 | # Get the role to determine if it's Agent or User 189 | role = item.get("role", "unknown") 190 | content_items = item.get("content", []) 191 | 192 | # Process content items to build display text 193 | text_parts = [] 194 | for content_item in content_items: 195 | content_type = content_item.get("type", "output_text") 196 | if content_type == "output_text": 197 | text_content = content_item.get("text", "") 198 | if not text_content.strip(): 199 | text_parts.append("[empty]") 200 | else: 201 | # Truncate long text and add ellipsis 202 | if len(text_content) > 2048: 203 | text_parts.append(text_content[:2048] + "...") 204 | else: 205 | text_parts.append(text_content) 206 | else: 207 | # Non-text content, show as [type] 208 | text_parts.append(f"[{content_type}]") 209 | 210 | # Join all text parts 211 | display_text = ''.join(text_parts) if text_parts else "[empty]" 212 | 213 | # Log with appropriate level and format 214 | if role == "assistant": 215 | self.logger.info(f"Agent: {display_text}") 216 | elif role == "user": 217 | self.logger.info(f"User: {display_text}") 218 | else: 219 | # Fallback for unknown roles, use debug level 220 | if self.logger.isEnabledFor(logging.DEBUG): 221 | self.logger.debug(f"Text message ({role}): {display_text}") 222 | 223 | async def on_api_start(self, kwargs: Dict[str, Any]) -> None: 224 | """Called when an API call is about to start.""" 225 | if self.logger.isEnabledFor(logging.DEBUG): 226 | model = kwargs.get("model", "unknown") 227 | self.logger.debug(f"API call 
starting for model: {model}") 228 | # Log sanitized messages if present 229 | if "messages" in kwargs: 230 | sanitized_messages = sanitize_image_urls(kwargs["messages"]) 231 | self.logger.debug(f"API call messages: {json.dumps(sanitized_messages, indent=2)}") 232 | elif "input" in kwargs: 233 | sanitized_input = sanitize_image_urls(kwargs["input"]) 234 | self.logger.debug(f"API call input: {json.dumps(sanitized_input, indent=2)}") 235 | 236 | async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None: 237 | """Called when an API call has completed.""" 238 | if self.logger.isEnabledFor(logging.DEBUG): 239 | model = kwargs.get("model", "unknown") 240 | self.logger.debug(f"API call completed for model: {model}") 241 | self.logger.debug(f"API call result: {json.dumps(sanitize_image_urls(result), indent=2)}") 242 | 243 | async def on_screenshot(self, item: Union[str, bytes], name: str = "screenshot") -> None: 244 | """Called when a screenshot is taken.""" 245 | if self.logger.isEnabledFor(logging.DEBUG): 246 | image_size = len(item) / 1024 247 | self.logger.debug(f"Screenshot captured: {name} {image_size:.2f} KB") ```