This is page 4 of 16. Use http://codebase.md/trycua/cua?page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ └── py.code-workspace ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── windows-sandbox.md ├── CONTRIBUTING.md 
├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ 
└── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── som_examples.py │ ├── utils.py │ └── 
winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ │ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift │ │ │ │ └── Utils.swift │ │ │ ├── 
Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── 
integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ ├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ 
├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py │ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── 
.gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── 
supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /docs/content/docs/libraries/lumier/docker.mdx: -------------------------------------------------------------------------------- ```markdown --- title: Docker --- You can use Lumier through Docker: ### Run a macOS VM (ephemeral) ```bash # Run the container with temporary storage (using pre-built image from Docker Hub) docker run -it --rm \ --name macos-vm \ -p 8006:8006 \ -e VM_NAME=macos-vm \ -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \ -e CPU_CORES=4 \ -e RAM_SIZE=8192 \ trycua/lumier:latest ``` After running the command above, access your macOS VM in a web browser at [http://localhost:8006](http://localhost:8006). <Callout title="Note"> With the basic setup above, your VM will be reset when you stop the container (ephemeral mode). This means any changes you make inside the macOS VM will be lost.
See the section below for how to save your VM state. </Callout> ## Saving Your VM State To save your VM state between sessions (so your changes persist when you stop and restart the container), you'll need to set up a storage location: ```bash # First, create a storage directory if it doesn't exist mkdir -p storage # Then run the container with persistent storage docker run -it --rm \ --name lumier-vm \ -p 8006:8006 \ -v $(pwd)/storage:/storage \ -e VM_NAME=lumier-vm \ -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \ -e CPU_CORES=4 \ -e RAM_SIZE=8192 \ -e HOST_STORAGE_PATH=$(pwd)/storage \ trycua/lumier:latest ``` This command creates a connection between a folder on your Mac (`$(pwd)/storage`) and a folder inside the Docker container (`/storage`). The `-v` flag (volume mount) and the `HOST_STORAGE_PATH` variable work together to ensure your VM data is saved on your host Mac. ## Sharing Files with Your VM To share files between your Mac and the virtual machine, you can set up a shared folder: ```bash # Create both storage and shared folders mkdir -p storage shared # Run with both persistent storage and a shared folder docker run -it --rm \ --name lumier-vm \ -p 8006:8006 \ -v $(pwd)/storage:/storage \ -v $(pwd)/shared:/shared \ -e VM_NAME=lumier-vm \ -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \ -e CPU_CORES=4 \ -e RAM_SIZE=8192 \ -e HOST_STORAGE_PATH=$(pwd)/storage \ -e HOST_SHARED_PATH=$(pwd)/shared \ trycua/lumier:latest ``` With this setup, any files you place in the `shared` folder on your Mac will be accessible from within the macOS VM, and vice versa. ## Automating VM Startup with on-logon.sh You can automatically run scripts when the VM starts up by placing an `on-logon.sh` script in the shared folder's lifecycle directory. This is useful for setting up your VM environment each time it starts. 
```bash # Create the lifecycle directory in your shared folder mkdir -p shared/lifecycle # Create a sample on-logon.sh script cat > shared/lifecycle/on-logon.sh << 'EOF' #!/usr/bin/env bash # Create a file on the desktop echo "Hello from Lumier!" > /Users/lume/Desktop/hello_lume.txt # You can add more commands to execute at VM startup # For example: # - Configure environment variables # - Start applications # - Mount network drives # - Set up development environments EOF # Make the script executable chmod +x shared/lifecycle/on-logon.sh ``` The script will be automatically executed when the VM starts up. It runs in the VM context and has access to: - The `/Users/lume` user directory (home directory in the VM) - The shared folder at `/Volumes/My Shared Files` inside the VM - Any resources available to the VM This feature enables automation of VM setup without modifying the base VM image. ## Configuration Options When running Lumier, you'll need to configure a few things: - **Port forwarding** (`-p 8006:8006`): Makes the VM's VNC interface accessible in your browser. If port 8006 is already in use, you can use a different port like `-p 8007:8006`. - **Environment variables** (`-e`): Configure your VM settings: - `VM_NAME`: A name for your virtual machine - `VERSION`: The macOS image to use - `CPU_CORES`: Number of CPU cores to allocate - `RAM_SIZE`: Memory in MB to allocate - `HOST_STORAGE_PATH`: Path to save VM state (when using persistent storage) - `HOST_SHARED_PATH`: Path to the shared folder (optional) - **Background service**: The `lume serve` service should be running on your host (it starts automatically when you install Lume using Lume's own `install.sh` script).
``` -------------------------------------------------------------------------------- /libs/typescript/agent/src/types.ts: -------------------------------------------------------------------------------- ```typescript // #region Request export type ConnectionType = 'http' | 'https' | 'peer'; export interface AgentClientOptions { timeout?: number; retries?: number; /** Optional CUA API key to send as X-API-Key header for HTTP requests */ apiKey?: string; } // Request types matching the Python proxy API export interface AgentRequest { model: string; input: string | AgentMessage[]; agent_kwargs?: { save_trajectory?: boolean; verbosity?: number; [key: string]: any; }; computer_kwargs?: { os_type?: string; provider_type?: string; [key: string]: any; }; /** * Optional per-request environment variable overrides. * Keys and values are strings and will be forwarded to the backend proxy. */ env?: Record<string, string>; } // #endregion // #region Response // Response types export interface AgentResponse { output: AgentMessage[]; usage: Usage; status: 'completed' | 'failed'; error?: string; } // Usage information export interface Usage { prompt_tokens: number; completion_tokens: number; total_tokens: number; response_cost: number; } // #endregion // #region Messages // Agent message types - can be one of several different message types export type AgentMessage = | UserMessage | AssistantMessage | ReasoningMessage | ComputerCallMessage | ComputerCallOutputMessage | FunctionCallMessage | FunctionCallOutputMessage; // Input message export interface UserMessage { type?: 'message'; role: 'user' | 'system' | 'developer'; content: string | InputContent[]; } // Output message export interface AssistantMessage { type: 'message'; role: 'assistant'; content: OutputContent[]; } // Output reasoning/thinking message export interface ReasoningMessage { type: 'reasoning'; summary: SummaryContent[]; } // Output computer action call export interface ComputerCallMessage { type: 'computer_call'; 
call_id: string; status: 'completed' | 'failed' | 'pending'; action: ComputerAction; } // Output computer action result (always a screenshot) export interface ComputerCallOutputMessage { type: 'computer_call_output'; call_id: string; output: ComputerResultContent; } // Output function call export interface FunctionCallMessage { type: 'function_call'; call_id: string; status: 'completed' | 'failed' | 'pending'; name: string; arguments: string; // JSON dict of kwargs } // Output function call result (always text) export interface FunctionCallOutputMessage { type: 'function_call_output'; call_id: string; output: string; } // #endregion // #region Message Content export interface InputContent { type: 'input_image' | 'input_text'; text?: string; image_url?: string; } export interface OutputContent { type: 'output_text'; text: string; } export interface SummaryContent { type: 'summary_text'; text: string; } export interface ComputerResultContent { type: 'computer_screenshot' | 'input_image'; image_url: string; } // #endregion // #region Actions export type ComputerAction = | ComputerActionOpenAI | ComputerActionAnthropic; // OpenAI Computer Actions export type ComputerActionOpenAI = | ClickAction | DoubleClickAction | DragAction | KeyPressAction | MoveAction | ScreenshotAction | ScrollAction | TypeAction | WaitAction; export interface ClickAction { type: 'click'; button: 'left' | 'right' | 'wheel' | 'back' | 'forward'; x: number; y: number; } export interface DoubleClickAction { type: 'double_click'; button?: 'left' | 'right' | 'wheel' | 'back' | 'forward'; x: number; y: number; } export interface DragAction { type: 'drag'; button?: 'left' | 'right' | 'wheel' | 'back' | 'forward'; path: Array<[number, number]>; } export interface KeyPressAction { type: 'keypress'; keys: string[]; } export interface MoveAction { type: 'move'; x: number; y: number; } export interface ScreenshotAction { type: 'screenshot'; } export interface ScrollAction { type: 'scroll'; scroll_x: number; 
"""
Example usage of the agent library with docstring-based tool definitions.
"""

import asyncio
import logging

from agent import ComputerAgent
from computer import Computer
from computer.helpers import sandboxed


@sandboxed()
def read_file(location: str) -> str:
    """Read contents of a file

    Parameters
    ----------
    location : str
        Path to the file to read

    Returns
    -------
    str
        Contents of the file or error message
    """
    # NOTE(review): this function is wrapped by `sandboxed()`; presumably its
    # body must stay self-contained (builtins only, no module-level names).
    # Confirm against computer.helpers before adding dependencies here.
    try:
        # Explicit encoding: the default is locale-dependent and differs
        # between hosts, which makes the result non-reproducible.
        with open(location, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        # Deliberately broad: failures are reported back as text instead of
        # being raised.
        return f"Error reading file: {e}"


def save_note(content: str, filename: str = "note.txt") -> str:
    """Save content to a note file

    Parameters
    ----------
    content : str
        Content to save to the file
    filename : str, optional
        Name of the file to save to (default is "note.txt")

    Returns
    -------
    str
        Success or error message
    """
    try:
        # Explicit encoding so non-ASCII notes are written identically on
        # every platform (and can be read back by read_file above).
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
        return f"Saved note to {filename}"
    except Exception as e:
        # Deliberately broad: failures are reported back as text instead of
        # being raised.
        return f"Error saving note: {e}"


def calculate(a: int, b: int) -> int:
    """Calculate the sum of two integers

    Parameters
    ----------
    a : int
        First integer
    b : int
        Second integer

    Returns
    -------
    int
        Sum of the two integers
    """
    return a + b


async def main():
    """Example usage of ComputerAgent with different models

    Loads settings from the environment (via a ``.env`` file if present),
    opens a cloud ``Computer``, and runs an interactive prompt loop that
    feeds each line typed by the user to the agent, accumulating the
    conversation in ``history``.

    Raises
    ------
    RuntimeError
        If ``CUA_CONTAINER_NAME`` or ``CUA_API_KEY`` is missing or empty.
    """
    # Example 1: Using Claude with computer and custom tools
    # (the model active below is OpenAI's computer-use-preview; uncomment one
    # of the Anthropic entries to actually run this example with Claude)
    print("=== Example 1: Claude with Computer ===")

    import os

    import dotenv

    dotenv.load_dotenv()

    # Validate with explicit exceptions rather than `assert`: assertions are
    # removed when Python runs with -O, which would let a missing setting
    # through as an empty string.
    settings = {}
    for var in ("CUA_CONTAINER_NAME", "CUA_API_KEY"):
        value = os.getenv(var)
        if not value:
            raise RuntimeError(f"{var} is not set")
        settings[var] = value

    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=settings["CUA_CONTAINER_NAME"],
        api_key=settings["CUA_API_KEY"],
    ) as computer:
        agent = ComputerAgent(
            # Supported models:

            # == OpenAI CUA (computer-use-preview) ==
            model="openai/computer-use-preview",

            # == Anthropic CUA (Claude > 3.5) ==
            # model="anthropic/claude-opus-4-20250514",
            # model="anthropic/claude-sonnet-4-20250514",
            # model="anthropic/claude-3-7-sonnet-20250219",
            # model="anthropic/claude-3-5-sonnet-20241022",

            # == UI-TARS ==
            # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
            # TODO: add local mlx provider
            # model="mlx-community/UI-TARS-1.5-7B-6bit",
            # model="ollama_chat/0000/ui-tars-1.5-7b",

            # == Omniparser + Any LLM ==
            # model="omniparser+..."
            # model="omniparser+anthropic/claude-opus-4-20250514",

            # NOTE(review): read_file / save_note / calculate are defined in
            # this module but not registered here — confirm whether they were
            # meant to be listed alongside `computer`.
            tools=[computer],
            only_n_most_recent_images=3,
            verbosity=logging.INFO,
            trajectory_dir="trajectories",
            use_prompt_caching=True,
            max_trajectory_budget={
                "max_budget": 1.0,
                "raise_error": True,
                "reset_after_each_run": False,
            },
        )

        history = []
        while True:
            try:
                user_input = input("> ")
            except EOFError:
                # stdin was closed (Ctrl-D or exhausted pipe): leave the loop
                # so the `async with` block can shut the computer down
                # instead of dying with a traceback.
                break
            history.append({"role": "user", "content": user_input})

            # Non-streaming usage
            async for result in agent.run(history, stream=False):
                history += result["output"]

                # # Print output
                # for item in result["output"]:
                #     if item["type"] == "message":
                #         print(item["content"][0]["text"])
                #     elif item["type"] == "computer_call":
                #         action = item["action"]
                #         action_type = action["type"]
                #         action_args = {k: v for k, v in action.items() if k != "type"}
                #         print(f"{action_type}({action_args})")
                #     elif item["type"] == "function_call":
                #         action = item["name"]
                #         action_args = item["arguments"]
                #         print(f"{action}({action_args})")
                #     elif item["type"] == "function_call_output":
                #         print("===>", item["output"])
== "__main__": asyncio.run(main()) ``` -------------------------------------------------------------------------------- /blog/trajectory-viewer.md: -------------------------------------------------------------------------------- ```markdown # Trajectory Viewer for Cua *Published on May 13, 2025 by Dillon DuPont* Don’t forget to check out [Part 1: Building your own Computer-Use Operator](build-your-own-operator-on-macos-1) and [Part 2: Using the Agent framework](build-your-own-operator-on-macos-2) for setting up your Cua environment and basic tips and tricks! ## Introduction Okay, so you’ve gotten your environment up and also tested a few agent runs. You’ll likely have encountered cases where your agent was successful at doing some tasks but also places where it got stuck or outright failed. Now what? If you’ve ever wondered exactly what your computer agent is doing and why it sometimes doesn’t do what you expected, then the Trajectory Viewer for Cua is here to help! Whether you’re a seasoned developer or someone who just wants to dive in and see results, this tool makes it easy to explore every step your agent takes on your screen. Plus, if you want to start thinking about generating data to train your own agentic model (we’ll cover training in an upcoming blog, so look forward to it), then our Trajectory Viewer might be for you. ## So, what’s a “trajectory”? Think of a trajectory as a detailed video recording of your agent’s journey: - **Observations**: What did the agent see (the exact screen content) at each point in time? - **Actions**: What clicks, keystrokes, or commands did it perform in response? - **Decisions**: Which options did it choose, and why? Especially for longer and more complex tasks, your agent will make multiple steps, take multiple actions, and make multiple observations. By examining this record, you can pinpoint where things go right, and more importantly, where they go wrong. ## So, what’s Cua’s Trajectory Viewer and why use it? 
The Trajectory Viewer for Cua is a GUI tool that helps you explore saved trajectories generated from your Cua computer agent runs.
Once loaded, the viewer presents: - **Timeline Slider**: Jump to any step in the session - **Screen Preview**: See exactly what the agent saw - **Action Details**: Review clicks, keypresses, and API calls - **Logs & Metadata**: Inspect debug logs or performance stats Use these features to: - Step through each action and observation; understand your agent’s decision-making - Understand why and where your agent failed - Collect insights for improving your instructions, prompts, tasks, agent, etc. The trajectory viewer provides a visual interface for stepping through each action your agent took, making it easy to see what your agent “sees”. ## Getting Started Ready to see your agent in action? Head over to the Trajectory Viewer and load up your first session. Debug smarter, train faster, and stay in control (all within your browser). Happy tinkering and Cua on! Have questions or want to share feedback? Join our community on Discord or open an issue on GitHub. ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx: -------------------------------------------------------------------------------- ```markdown --- title: Composed Agents description: Combine grounding models with any LLM for computer-use capabilities --- Composed agents combine the best of both worlds: specialized grounding models for precise click prediction and powerful LLMs for task planning and reasoning. Use the format `"grounding_model+planning_model"` to create a composed agent with any vision-enabled LiteLLM-compatible model. ## How Composed Agents Work 1. **Planning Phase**: The planning model (LLM) analyzes the task and decides what actions to take (e.g., `click("find the login button")`, `type("username")`) 2. **Grounding Phase**: The grounding model converts element descriptions to precise coordinates 3. 
**Execution**: Actions are performed using the predicted coordinates ## Supported Grounding Models Any model that supports `predict_click()` can be used as the grounding component. See the full list on [Grounding Models](./grounding-models). - OpenCUA: `huggingface-local/xlangai/OpenCUA-{7B,32B}` - GTA1 family: `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` - Holo 1.5 family: `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` - InternVL 3.5 family: `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` - UI‑TARS 1.5: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` (also supports full CU) - OmniParser (OCR): `omniparser` (requires combination with a LiteLLM vision model) - Moondream3: `moondream3` (requires combination with a LiteLLM vision/text model) ## Supported Planning Models Any vision-enabled LiteLLM-compatible model can be used as the planning component: - Any All‑in‑one CUA (planning-capable). See [All‑in‑one CUAs](./computer-use-agents). - Any VLM via LiteLLM providers: `anthropic/*`, `openai/*`, `openrouter/*`, `gemini/*`, `vertex_ai/*`, `huggingface-local/*`, `mlx/*`, etc. 
- Examples: - **Anthropic**: `anthropic/claude-3-5-sonnet-20241022`, `anthropic/claude-opus-4-1-20250805` - **OpenAI**: `openai/gpt-5`, `openai/gpt-o3`, `openai/gpt-4o` - **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision` - **Local models**: Any Hugging Face vision-language model ## Usage Examples ### GTA1 + GPT-5 Use Google's Gemini for planning with specialized grounding: ```python agent = ComputerAgent( "huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5", tools=[computer] ) async for _ in agent.run("Take a screenshot, analyze the UI, and click on the most prominent button"): pass ``` ### GTA1 + Claude 3.5 Sonnet Combine state-of-the-art grounding with powerful reasoning: ```python agent = ComputerAgent( "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022", tools=[computer] ) async for _ in agent.run("Open Firefox, navigate to github.com, and search for 'computer-use'"): pass # Success! 🎉 # - Claude 3.5 Sonnet plans the sequence of actions # - GTA1-7B provides precise click coordinates for each UI element ``` ### UI-TARS + GPT-4o Combine two different vision models for enhanced capabilities: ```python agent = ComputerAgent( "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+openai/gpt-4o", tools=[computer] ) async for _ in agent.run("Help me fill out this form with my personal information"): pass ``` ### Moondream3 + GPT-4o Use the built-in Moondream3 grounding with any planning model. Moondream3 will detect UI elements on the latest screenshot, label them, and provide a user message listing detected element names. 
```python from agent import ComputerAgent from computer import computer agent = ComputerAgent( "moondream3+openai/gpt-4o", tools=[computer] ) async for _ in agent.run("Close the settings window, then open the Downloads folder"): pass ``` ## Benefits of Composed Agents - **Specialized Grounding**: Use models optimized for click prediction accuracy - **Flexible Planning**: Choose any LLM for task reasoning and planning - **Cost Optimization**: Use smaller grounding models with larger planning models only when needed - **Performance**: Leverage the strengths of different model architectures ## Capabilities Composed agents support both capabilities: ```python agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022") # Full computer-use agent capabilities async for _ in agent.run("Complete this online form"): pass # Direct click prediction (uses grounding model only) coords = agent.predict_click("find the submit button") ``` --- For more information on individual model capabilities, see [Computer-Use Agents](./computer-use-agents) and [Grounding Models](./grounding-models). ``` -------------------------------------------------------------------------------- /blog/composite-agents.md: -------------------------------------------------------------------------------- ```markdown # Announcing Cua Agent framework 0.4 and Composite Agents *Published on August 26, 2025 by Dillon DuPont* <img src="./assets/composite-agents.png" alt="Composite Agents"> So you want to build an agent that can use a computer. Great! You've probably discovered that there are now dozens of different AI models that claim they can click GUI buttons and fill out forms. Less great: actually getting them to work together is like trying to coordinate a group project where everyone speaks a different language and has invented seventeen different ways to say "click here". Here's the thing about new GUI models: they're all special snowflakes. 
One model wants you to feed it images and expects coordinates back as percentages from 0 to 1. Another wants absolute pixel coordinates. A third model has invented its own numeral system with `<|loc095|><|loc821|>` tokens inside tool calls. Some models output Python code that calls `pyautogui.click(x, y)`. Others will start hallucinating coordinates if you forget to format all previous messages within a very specific GUI system prompt. This is the kind of problem that makes you wonder if we're building the future of computing or just recreating the Tower of Babel with more GPUs. ## What we fixed Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language. Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-3-5-sonnet-20241022"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to. ```python # This works the same whether you're using Anthropic, OpenAI, or that new model you found on Hugging Face agent = ComputerAgent( model="anthropic/claude-3-5-sonnet-20241022", # or any other supported model tools=[computer] ) ``` The output format is consistent across all providers (OpenAI, Anthropic, Vertex, Hugging Face, OpenRouter, etc.). No more writing different parsers for each model's creative interpretation of how to represent a mouse click. ## Composite Agents: Two Brains Are Better Than One Here's where it gets interesting. We realized that you don't actually need one model to be good at everything. Some models are excellent at understanding what's on the screen—they can reliably identify buttons and text fields and figure out where to click. Other models are great at planning and reasoning but might be a bit fuzzy on the exact pixel coordinates. 
So we let you combine them with a `+` sign: ```python agent = ComputerAgent( # specify the grounding model first, then the planning model model="huggingface-local/HelloKKMe/GTA1-7B+huggingface-local/OpenGVLab/InternVL3_5-8B", tools=[computer] ) ``` This creates a composite agent where one model (the "grounding" model) handles the visual understanding and precise UI interactions, while the other (the "planning" model) handles the high-level reasoning and task orchestration. It's like having a pilot and a navigator, except they're both AI models and they're trying to help you star a GitHub repository. You can even take a model that was never designed for computer use—like GPT-4o—and give it GUI capabilities by pairing it with a specialized vision model: ```python agent = ComputerAgent( model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-4o", tools=[computer] ) ``` ## Example notebook For a full, ready-to-run demo (install deps, local computer using Docker, and a composed agent example), see the notebook: - https://github.com/trycua/cua/blob/models/opencua/notebooks/composite_agents_docker_nb.ipynb ## What's next We're building integration with HUD evals, allowing us to curate and benchmark model combinations. This will help us identify which composite agent pairs work best for different types of tasks, and provide you with tested recommendations rather than just throwing model names at the wall to see what sticks. If you try out version 0.4.x, we'd love to hear how it goes. Join us on Discord to share your results and let us know what model combinations work best for your projects. --- ## Links * **Composite Agent Docs:** [https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) * **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai) Questions or weird edge cases? Ping us on Discord—we’re curious to see what you build. 
``` -------------------------------------------------------------------------------- /docs/content/docs/computer-sdk/computers.mdx: -------------------------------------------------------------------------------- ```markdown --- title: Cua Computers description: Understanding cua computer types and connection methods --- <Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">Jupyter Notebook</a> and <a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">NodeJS project</a> are available for this documentation.</Callout> Before we can automate apps using AI, we need to first connect to a Computer Server to give the AI a safe environment to execute workflows in. Cua Computers are preconfigured virtual machines running the Computer Server. They can be either macOS, Linux, or Windows. They're found in either a cloud-native container, or on your host desktop. ## Cloud Sandbox **Easiest & safest way to get started - works on any host OS** This is a Cloud Sandbox running the Computer Server. Get a container at [trycua.com](https://www.trycua.com/). 
<Tabs items={['Python', 'TypeScript']}> <Tab value="Python"> ```python from computer import Computer computer = Computer( os_type="linux", provider_type="cloud", name="your-sandbox-name", api_key="your-api-key" ) await computer.run() # Connect to the sandbox ``` </Tab> <Tab value="TypeScript"> ```typescript import { Computer, OSType } from '@trycua/computer'; const computer = new Computer({ osType: OSType.LINUX, name: "your-sandbox-name", apiKey: "your-api-key" }); await computer.run(); // Connect to the sandbox ``` </Tab> </Tabs> ## Linux on Docker **Run Linux desktop locally on macOS, Windows, or Linux hosts** Cua provides two Docker images for running Linux desktops: <Tabs items={['XFCE (Lightweight)', 'KASM (Full-Featured)']}> <Tab value="XFCE (Lightweight)"> **Recommended for most use cases** - lightweight XFCE desktop with Firefox 1. Install Docker Desktop or Docker Engine 2. Pull the CUA XFCE image ```bash docker pull --platform=linux/amd64 trycua/cua-xfce:latest ``` 3. Connect with Computer ```python from computer import Computer computer = Computer( os_type="linux", provider_type="docker", image="trycua/cua-xfce:latest", name="my-xfce-container" ) await computer.run() # Launch & connect to Docker sandbox ``` </Tab> <Tab value="KASM (Full-Featured)"> **Full-featured Ubuntu desktop** with additional applications 1. Install Docker Desktop or Docker Engine 2. Build or pull the CUA KASM image ```bash # Option 1: Pull from Docker Hub docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest # Option 2: Build locally cd libs/kasm docker build -t cua-ubuntu:latest . ``` 3. Connect with Computer ```python from computer import Computer computer = Computer( os_type="linux", provider_type="docker", image="trycua/cua-ubuntu:latest", name="my-kasm-container" ) await computer.run() # Launch & connect to Docker sandbox ``` </Tab> </Tabs> ## Windows Sandbox **Windows hosts only - requires Windows 10 Pro/Enterprise or Windows 11** 1. Enable Windows Sandbox 2. 
Install pywinsandbox dependency ```bash pip install -U git+git://github.com/karkason/pywinsandbox.git ``` 3. Connect with Computer ```python from computer import Computer computer = Computer( os_type="windows", provider_type="winsandbox", ephemeral=True # Windows Sandbox is always ephemeral ) await computer.run() # Launch & connect to Windows Sandbox ``` ## macOS VM **macOS hosts only - requires Lume CLI** 1. Install lume cli ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` 2. Start a local cua macOS VM ```bash lume run macos-sequoia-cua:latest ``` 3. Connect with Computer ```python from computer import Computer computer = Computer( os_type="macos", provider_type="lume", name="macos-sequoia-cua:latest" ) await computer.run() # Launch & connect to the sandbox ``` ## Your host desktop You can also have agents control your desktop directly by running Computer Server without any containerization layer. Beware that AI models may perform risky actions. 
```bash pip install cua-computer-server python -m computer_server ``` Connect with: <Tabs items={['Python']}> <Tab value="Python"> ```python computer = Computer(use_host_computer_server=True) await computer.run() # Connect to the host desktop ``` </Tab> </Tabs> ``` -------------------------------------------------------------------------------- /libs/lumier/src/bin/entry.sh: -------------------------------------------------------------------------------- ```bash #!/usr/bin/env bash # Configure SSH to prevent known hosts warnings export SSHPASS_PROMPT= export SSH_ASKPASS=/bin/echo # Set SSH quiet mode via the SSHPASS environment variable export SSHPASS_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR -q" # We'll enable strict error checking AFTER initialization # to prevent premature exits # Source configuration files CONFIG_DIR="/run/config" LIB_DIR="/run/lib" # Source constants if available if [ -f "${CONFIG_DIR}/constants.sh" ]; then source "${CONFIG_DIR}/constants.sh" fi # Import utilities for lib in "${LIB_DIR}"/*.sh; do if [ -f "$lib" ]; then source "$lib" fi done # Set VM_NAME to env or fallback to container name (from --name) if [ -z "${VM_NAME:-}" ]; then VM_NAME="$(cat /etc/hostname)" export VM_NAME fi # Set HOST_STORAGE_PATH to a lume ephemeral storage if not set if [ -z "${HOST_STORAGE_PATH:-}" ]; then HOST_STORAGE_PATH="ephemeral" # Tell user that ephemeral storage is being used echo "Using ephemeral storage. VM state will be lost when macOS cleans up temporary files." 
export HOST_STORAGE_PATH fi # Only check and report mountpoints in debug mode if [ "${LUMIER_DEBUG:-0}" == "1" ]; then if mountpoint -q /storage; then echo "/storage is mounted" fi if mountpoint -q /shared; then echo "/shared is mounted" fi # if mountpoint -q /data; then # echo "/data is mounted" # fi fi # Check if we're running as PID 1 (important for Docker signal handling) if [ $$ -ne 1 ]; then echo "Warning: This script is not running as PID 1 (current PID: $$)." echo "Docker signal handling may not work properly when stopped from Docker Desktop." fi # Log startup info echo "Lumier VM is starting..." # Cleanup function to ensure VM and noVNC proxy shutdown on container stop # Counter for signal handling SIGNAL_COUNT=0 cleanup() { local signal_name=$1 set +e # Don't exit on error in cleanup # Increment signal counter SIGNAL_COUNT=$((SIGNAL_COUNT + 1)) # If this is the first signal, try graceful shutdown if [ $SIGNAL_COUNT -eq 1 ]; then echo "[cleanup] Caught $signal_name signal, shutting down..." # Check if we're in the middle of an image pull if [[ "$PULL_IN_PROGRESS" == "1" ]]; then echo "[cleanup] Interrupted during image pull, skipping VM stop." else echo "[cleanup] Stopping VM..." stop_vm true fi # Attempt to clean up ephemeral storage if it's in the /private/tmp directory if [[ "$HOST_STORAGE_PATH" == "ephemeral" ]]; then # First check if VM actually exists VM_INFO=$(lume_get "$VM_NAME" "$HOST_STORAGE_PATH" "json" "false") # Only try VM deletion if VM exists and not in the middle of a pull if [[ "$PULL_IN_PROGRESS" != "1" && $VM_INFO != *"Virtual machine not found"* ]]; then echo "[cleanup] Cleaning up VM..." 
lume_delete "$VM_NAME" "$HOST_STORAGE_PATH" > /dev/null 2>&1 fi fi else # For multiple signals, force an immediate exit echo "got $SIGNAL_COUNT SIGTERM/SIGINTs, forcefully exiting" fi # If we've received multiple signals, just exit immediately if [ $SIGNAL_COUNT -ge 3 ]; then exit 1 fi # Exit with success for the first signal if [ $SIGNAL_COUNT -eq 1 ]; then exit 0 fi } # Ensure we catch all typical container termination signals trap 'cleanup SIGTERM' SIGTERM trap 'cleanup SIGINT' SIGINT trap 'cleanup SIGHUP' SIGHUP # Now enable strict error handling after initialization set -euo pipefail # Start the VM with error handling if ! start_vm; then echo "ERROR: Failed to start VM!" >&2 exit 1 fi # Start noVNC for VNC access NOVNC_PID="" if [ -n "${VNC_PORT:-}" ] && [ -n "${VNC_PASSWORD:-}" ]; then # Only show this in debug mode if [ "${LUMIER_DEBUG:-0}" == "1" ]; then echo "Starting noVNC proxy with optimized color settings..." fi ${NOVNC_PATH}/utils/novnc_proxy --vnc host.docker.internal:${VNC_PORT} --listen 8006 --web ${NOVNC_PATH} > /dev/null 2>&1 & NOVNC_PID=$! disown $NOVNC_PID echo "noVNC interface available at: http://localhost:8006/vnc.html?password=${VNC_PASSWORD}&autoconnect=true (replace PORT with the port you forwarded to 8006)" fi echo "Lumier is running. Press Ctrl+C to stop." # Instead of tail -f /dev/null, use a wait loop that can be interrupted by signals while true; do # Sleep in small increments to make signal handling more responsive sleep 1 & wait $! # Break the loop if we've received a signal if [ $SIGNAL_COUNT -gt 0 ]; then break fi done ``` -------------------------------------------------------------------------------- /libs/lume/src/Server/Requests.swift: -------------------------------------------------------------------------------- ```swift import ArgumentParser import Foundation import Virtualization struct RunVMRequest: Codable { let noDisplay: Bool? let sharedDirectories: [SharedDirectoryRequest]? let recoveryMode: Bool? 
let storage: String? struct SharedDirectoryRequest: Codable { let hostPath: String let readOnly: Bool? } func parse() throws -> [SharedDirectory] { guard let sharedDirectories = sharedDirectories else { return [] } return try sharedDirectories.map { dir -> SharedDirectory in // Validate that the host path exists and is a directory var isDirectory: ObjCBool = false guard FileManager.default.fileExists(atPath: dir.hostPath, isDirectory: &isDirectory), isDirectory.boolValue else { throw ValidationError( "Host path does not exist or is not a directory: \(dir.hostPath)") } return SharedDirectory( hostPath: dir.hostPath, tag: VZVirtioFileSystemDeviceConfiguration.macOSGuestAutomountTag, readOnly: dir.readOnly ?? false ) } } } struct PullRequest: Codable { let image: String let name: String? var registry: String var organization: String let storage: String? enum CodingKeys: String, CodingKey { case image, name, registry, organization, storage } init(from decoder: Decoder) throws { let container = try decoder.container(keyedBy: CodingKeys.self) image = try container.decode(String.self, forKey: .image) name = try container.decodeIfPresent(String.self, forKey: .name) registry = try container.decodeIfPresent(String.self, forKey: .registry) ?? "ghcr.io" organization = try container.decodeIfPresent(String.self, forKey: .organization) ?? "trycua" storage = try container.decodeIfPresent(String.self, forKey: .storage) } } struct CreateVMRequest: Codable { let name: String let os: String let cpu: Int let memory: String let diskSize: String let display: String let ipsw: String? let storage: String? func parse() throws -> (memory: UInt64, diskSize: UInt64) { return ( memory: try parseSize(memory), diskSize: try parseSize(diskSize) ) } } struct SetVMRequest: Codable { let cpu: Int? let memory: String? let diskSize: String? let display: String? let storage: String? func parse() throws -> (memory: UInt64?, diskSize: UInt64?, display: VMDisplayResolution?) 
{ return ( memory: try memory.map { try parseSize($0) }, diskSize: try diskSize.map { try parseSize($0) }, display: try display.map { guard let resolution = VMDisplayResolution(string: $0) else { throw ValidationError( "Invalid display resolution format: \($0). Expected format: WIDTHxHEIGHT") } return resolution } ) } } struct CloneRequest: Codable { let name: String let newName: String let sourceLocation: String? let destLocation: String? } struct PushRequest: Codable { let name: String // Name of the local VM let imageName: String // Base name for the image in the registry let tags: [String] // List of tags to push var registry: String // Registry URL var organization: String // Organization/user in the registry let storage: String? // Optional VM storage location or direct path var chunkSizeMb: Int // Chunk size // dryRun and reassemble are less common for API, default to false? // verbose is usually handled by server logging enum CodingKeys: String, CodingKey { case name, imageName, tags, registry, organization, storage, chunkSizeMb } // Provide default values for optional fields during decoding init(from decoder: Decoder) throws { let container = try decoder.container(keyedBy: CodingKeys.self) name = try container.decode(String.self, forKey: .name) imageName = try container.decode(String.self, forKey: .imageName) tags = try container.decode([String].self, forKey: .tags) registry = try container.decodeIfPresent(String.self, forKey: .registry) ?? "ghcr.io" organization = try container.decodeIfPresent(String.self, forKey: .organization) ?? "trycua" storage = try container.decodeIfPresent(String.self, forKey: .storage) chunkSizeMb = try container.decodeIfPresent(Int.self, forKey: .chunkSizeMb) ?? 
512 } } ``` -------------------------------------------------------------------------------- /libs/python/agent/benchmarks/contrib.md: -------------------------------------------------------------------------------- ```markdown # Contributing Reference Agent Implementations This guide explains how to add your own reference agent implementations to the benchmark system. ## Adding Reference Agent Implementations ### 1. Implement the ModelProtocol Create a new file in `models/` directory implementing the `ModelProtocol`: ```python from models.base import ModelProtocol from typing import Optional, Tuple from PIL import Image class YourModelName(ModelProtocol): def __init__(self, model_path: str): self.model_path = model_path self._model = None @property def model_name(self) -> str: return self.model_path async def load_model(self) -> None: """Load the model into memory.""" # Your model loading logic here pass async def unload_model(self) -> None: """Unload the model from memory.""" # Your model cleanup logic here pass async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]: """ Predict click coordinates for the given image and instruction. Args: image: PIL Image to analyze instruction: Text instruction describing what to click Returns: Tuple of (x, y) coordinates or None if prediction fails """ # Your prediction logic here return (x, y) # Return predicted coordinates ``` ### 2. Register Your Model Add your model to the `get_available_models()` function in `utils.py`: ```python def get_available_models() -> List[Union[str, ModelProtocol]]: models = [ # Computer Agent SDK providers "huggingface-local/HelloKKMe/GTA1-7B", # Reference implementations GTA1Model("HelloKKMe/GTA1-7B"), YourModelName("path/to/your/model"), # Add your model here ] return models ``` ### 3. 
Test Your Implementation Before submitting, test your model with the interactive tool: ```bash python interactive.py ``` This will help you verify that your model loads correctly and produces reasonable predictions. ## Example: Adding a New Model Here's a complete example of adding a hypothetical "MyVisionModel": 1. **Create `models/my_vision_model.py`:** ```python import torch from transformers import AutoModel, AutoProcessor from models.base import ModelProtocol from typing import Optional, Tuple from PIL import Image class MyVisionModel(ModelProtocol): def __init__(self, model_path: str): self.model_path = model_path self.model = None self.processor = None @property def model_name(self) -> str: return f"MyVisionModel({self.model_path})" async def load_model(self) -> None: """Load the model and processor.""" self.processor = AutoProcessor.from_pretrained(self.model_path) self.model = AutoModel.from_pretrained( self.model_path, torch_dtype=torch.float16, device_map="auto" ) async def unload_model(self) -> None: """Clean up model resources.""" del self.model del self.processor self.model = None self.processor = None torch.cuda.empty_cache() async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]: """Predict click coordinates.""" try: # Preprocess inputs inputs = self.processor( text=instruction, images=image, return_tensors="pt" ) # Run inference with torch.no_grad(): outputs = self.model(**inputs) # Extract coordinates (model-specific logic) x, y = self._extract_coordinates(outputs) return (int(x), int(y)) except Exception as e: print(f"Prediction failed: {e}") return None def _extract_coordinates(self, outputs): """Extract x, y coordinates from model outputs.""" # Your model-specific coordinate extraction logic pass ``` 2. **Update `models/__init__.py`:** ```python from .gta1 import GTA1Model from .my_vision_model import MyVisionModel __all__ = ["GTA1Model", "MyVisionModel"] ``` 3. 
// MARK: - VMConfig

/// Persisted configuration of a single virtual machine.
///
/// Hardware settings are optional and stay `nil` until assigned; the display
/// resolution is always present. The private backing fields are mapped to
/// un-prefixed JSON keys through `CodingKeys`.
struct VMConfig: Codable {

    // MARK: - Properties

    /// Guest operating system identifier.
    let os: String

    private var _cpuCount: Int?
    private var _memorySize: UInt64?
    private var _diskSize: UInt64?
    private var _macAddress: String?
    private var _display: VMDisplayResolution
    private var _hardwareModel: Data?
    private var _machineIdentifier: Data?

    // MARK: - Initialization

    /// Creates a configuration from explicit values.
    ///
    /// - Parameter display: Resolution string understood by
    ///   `VMDisplayResolution(string:)`. When it cannot be parsed the
    ///   configuration falls back to `1024x768`.
    init(
        os: String,
        cpuCount: Int? = nil,
        memorySize: UInt64? = nil,
        diskSize: UInt64? = nil,
        macAddress: String? = nil,
        display: String,
        hardwareModel: Data? = nil,
        machineIdentifier: Data? = nil
    ) throws {
        self.os = os
        self._cpuCount = cpuCount
        self._memorySize = memorySize
        self._diskSize = diskSize
        self._macAddress = macAddress
        // Unparsable input silently falls back to the default resolution.
        self._display =
            VMDisplayResolution(string: display) ?? VMDisplayResolution(string: "1024x768")!
        self._hardwareModel = hardwareModel
        self._machineIdentifier = machineIdentifier
    }

    // MARK: - Accessors

    var display: VMDisplayResolution {
        get { _display }
        set { _display = newValue }
    }

    var cpuCount: Int? {
        get { _cpuCount }
        set { _cpuCount = newValue }
    }

    var memorySize: UInt64? {
        get { _memorySize }
        set { _memorySize = newValue }
    }

    var diskSize: UInt64? {
        get { _diskSize }
        set { _diskSize = newValue }
    }

    var hardwareModel: Data? {
        get { _hardwareModel }
        set { _hardwareModel = newValue }
    }

    var machineIdentifier: Data? {
        get { _machineIdentifier }
        set { _machineIdentifier = newValue }
    }

    var macAddress: String? {
        get { _macAddress }
        set { _macAddress = newValue }
    }

    // MARK: - Mutators

    mutating func setCpuCount(_ count: Int) {
        _cpuCount = count
    }

    mutating func setMemorySize(_ size: UInt64) {
        _memorySize = size
    }

    mutating func setDiskSize(_ size: UInt64) {
        _diskSize = size
    }

    mutating func setHardwareModel(_ hardwareModel: Data) {
        _hardwareModel = hardwareModel
    }

    mutating func setMachineIdentifier(_ machineIdentifier: Data) {
        _machineIdentifier = machineIdentifier
    }

    mutating func setMacAddress(_ newMacAddress: String) {
        self._macAddress = newMacAddress
    }

    mutating func setDisplay(_ newDisplay: VMDisplayResolution) {
        self._display = newDisplay
    }

    // MARK: - Codable

    enum CodingKeys: String, CodingKey {
        case _cpuCount = "cpuCount"
        case _memorySize = "memorySize"
        case _diskSize = "diskSize"
        case macAddress
        case display
        case _hardwareModel = "hardwareModel"
        case _machineIdentifier = "machineIdentifier"
        case os
    }

    /// Decodes a configuration.
    ///
    /// - Throws: `DecodingError.dataCorrupted` when the stored `display`
    ///   string is not a valid resolution, plus any error raised by the
    ///   underlying container.
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)

        os = try container.decode(String.self, forKey: .os)
        _cpuCount = try container.decodeIfPresent(Int.self, forKey: ._cpuCount)
        _memorySize = try container.decodeIfPresent(UInt64.self, forKey: ._memorySize)
        _diskSize = try container.decodeIfPresent(UInt64.self, forKey: ._diskSize)
        _macAddress = try container.decodeIfPresent(String.self, forKey: .macAddress)

        // A malformed value in a config file must surface as a decoding error,
        // not as a crash from force-unwrapping the failable initializer.
        let displayString = try container.decode(String.self, forKey: .display)
        guard let resolution = VMDisplayResolution(string: displayString) else {
            throw DecodingError.dataCorruptedError(
                forKey: .display,
                in: container,
                debugDescription: "Invalid display resolution: \(displayString)"
            )
        }
        _display = resolution

        _hardwareModel = try container.decodeIfPresent(Data.self, forKey: ._hardwareModel)
        _machineIdentifier = try container.decodeIfPresent(
            Data.self, forKey: ._machineIdentifier)
    }

    /// Encodes the configuration; `nil` optionals are omitted and the display
    /// is written in its string form.
    func encode(to encoder: Encoder) throws {
        var container = encoder.container(keyedBy: CodingKeys.self)

        try container.encode(os, forKey: .os)
        try container.encodeIfPresent(_cpuCount, forKey: ._cpuCount)
        try container.encodeIfPresent(_memorySize, forKey: ._memorySize)
        try container.encodeIfPresent(_diskSize, forKey: ._diskSize)
        try container.encodeIfPresent(_macAddress, forKey: .macAddress)
        try container.encode(display.string, forKey: .display)
        try container.encodeIfPresent(_hardwareModel, forKey: ._hardwareModel)
        try container.encodeIfPresent(_machineIdentifier, forKey: ._machineIdentifier)
    }
}
Args: kwargs: Run arguments old_items: Original messages new_items: New messages generated during run Returns: True to continue execution, False to stop """ return True async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Called before messages are sent to the agent loop. Args: messages: List of message dictionaries to preprocess Returns: List of preprocessed message dictionaries """ return messages async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Called after the agent loop returns output. Args: output: List of output message dictionaries to postprocess Returns: List of postprocessed output dictionaries """ return output async def on_computer_call_start(self, item: Dict[str, Any]) -> None: """ Called when a computer call is about to start. Args: item: The computer call item dictionary """ pass async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None: """ Called when a computer call has completed. Args: item: The computer call item dictionary result: The result of the computer call """ pass async def on_function_call_start(self, item: Dict[str, Any]) -> None: """ Called when a function call is about to start. Args: item: The function call item dictionary """ pass async def on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None: """ Called when a function call has completed. Args: item: The function call item dictionary result: The result of the function call """ pass async def on_text(self, item: Dict[str, Any]) -> None: """ Called when a text message is encountered. Args: item: The message item dictionary """ pass async def on_api_start(self, kwargs: Dict[str, Any]) -> None: """ Called when an API call is about to start. Args: kwargs: The kwargs being passed to the API call """ pass async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None: """ Called when an API call has completed. 
Args: kwargs: The kwargs that were passed to the API call result: The result of the API call """ pass async def on_usage(self, usage: Dict[str, Any]) -> None: """ Called when usage information is received. Args: usage: The usage information """ pass async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None: """ Called when a screenshot is taken. Args: screenshot: The screenshot image name: The name of the screenshot """ pass async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None: """ Called when responses are received. Args: kwargs: The kwargs being passed to the agent loop responses: The responses received """ pass ``` -------------------------------------------------------------------------------- /examples/agent_examples.py: -------------------------------------------------------------------------------- ```python """Example demonstrating the ComputerAgent capabilities with the Omni provider.""" import asyncio import logging import traceback import signal from computer import Computer, VMProviderType # Import the unified agent class and types from agent import ComputerAgent # Import utility functions from utils import load_dotenv_files, handle_sigint # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) async def run_agent_example(): """Run example of using the ComputerAgent with different models.""" print("\n=== Example: ComputerAgent with different models ===") try: # Create a local macOS computer computer = Computer( os_type="macos", verbosity=logging.DEBUG, ) # Create a remote Linux computer with Cua # computer = Computer( # os_type="linux", # api_key=os.getenv("CUA_API_KEY"), # name=os.getenv("CUA_CONTAINER_NAME"), # provider_type=VMProviderType.CLOUD, # ) # Create ComputerAgent with new API agent = ComputerAgent( # Supported models: # == OpenAI CUA (computer-use-preview) == model="openai/computer-use-preview", # == Anthropic CUA (Claude > 3.5) == # 
model="anthropic/claude-opus-4-20250514", # model="anthropic/claude-sonnet-4-20250514", # model="anthropic/claude-3-7-sonnet-20250219", # model="anthropic/claude-3-5-sonnet-20241022", # == UI-TARS == # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", # model="mlx/mlx-community/UI-TARS-1.5-7B-6bit", # model="ollama_chat/0000/ui-tars-1.5-7b", # == Omniparser + Any LLM == # model="omniparser+anthropic/claude-opus-4-20250514", # model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M", tools=[computer], only_n_most_recent_images=3, verbosity=logging.DEBUG, trajectory_dir="trajectories", use_prompt_caching=True, max_trajectory_budget=1.0, ) # Example tasks to demonstrate the agent tasks = [ "Look for a repository named trycua/cua on GitHub.", "Check the open issues, open the most recent one and read it.", "Clone the repository in users/lume/projects if it doesn't exist yet.", "Open the repository with an app named Cursor (on the dock, black background and white cube icon).", "From Cursor, open Composer if not already open.", "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.", ] # Use message-based conversation history history = [] for i, task in enumerate(tasks): print(f"\nExecuting task {i+1}/{len(tasks)}: {task}") # Add user message to history history.append({"role": "user", "content": task}) # Run agent with conversation history async for result in agent.run(history, stream=False): # Add agent outputs to history history += result.get("output", []) # Print output for debugging for item in result.get("output", []): if item.get("type") == "message": content = item.get("content", []) for content_part in content: if content_part.get("text"): print(f"Agent: {content_part.get('text')}") elif item.get("type") == "computer_call": action = item.get("action", {}) action_type = action.get("type", "") print(f"Computer Action: {action_type}({action})") elif item.get("type") == "computer_call_output": print("Computer Output: 
/// `ImageLoader` implementation backed by `VZMacOSRestoreImage`.
///
/// Also acts as the `URLSessionDownloadDelegate` for the IPSW download so
/// that progress can be logged while the file is being fetched.
final class DarwinImageLoader: NSObject, ImageLoader, @unchecked Sendable, URLSessionDownloadDelegate
{
    /// Minimum host resources and hardware model required by a restore image.
    struct ImageRequirements: Sendable {
        let hardwareModel: Data
        let minimumSupportedCPUCount: Int
        let minimumSupportedMemorySize: UInt64
    }

    enum ImageError: Error {
        case invalidImage
        case unsupportedConfiguration
        case downloadFailed
    }

    private var lastLoggedProgress: Double = 0.0
    private var progressLogger = ProgressLogger()
    /// One-shot callback that bridges the delegate callbacks to the
    /// continuation awaited in `downloadLatestImage()`.
    private var completionHandler: ((URL?, Error?) -> Void)?

    /// Returns the URL of the latest restore image supported by this host.
    func fetchLatestSupportedURL() async throws -> URL {
        try await withCheckedThrowingContinuation { continuation in
            VZMacOSRestoreImage.fetchLatestSupported { result in
                switch result {
                case .success(let image):
                    continuation.resume(returning: image.url)
                case .failure(let error):
                    continuation.resume(throwing: error)
                }
            }
        }
    }

    /// Loads the restore image at `url` and extracts its requirements.
    ///
    /// - Throws: `ImageError.unsupportedConfiguration` when the image has no
    ///   configuration supported on this host.
    func loadImageRequirements(from url: URL) async throws -> ImageRequirements {
        let image = try await VZMacOSRestoreImage.image(from: url)
        guard let requirements = image.mostFeaturefulSupportedConfiguration else {
            throw ImageError.unsupportedConfiguration
        }

        return ImageRequirements(
            hardwareModel: requirements.hardwareModel.dataRepresentation,
            minimumSupportedCPUCount: requirements.minimumSupportedCPUCount,
            minimumSupportedMemorySize: requirements.minimumSupportedMemorySize
        )
    }

    /// Downloads the latest supported restore image to `latest.ipsw` in the
    /// temporary directory, replacing any previous download.
    ///
    /// - Throws: The transfer error, a file-system error from moving the
    ///   download into place, or `ImageError.downloadFailed` when the session
    ///   reports completion without a file location.
    func downloadLatestImage() async throws -> Path {
        let url = try await fetchLatestSupportedURL()
        let tempDir = FileManager.default.temporaryDirectory
        let downloadPath = tempDir.appendingPathComponent("latest.ipsw")

        // Reset progress logger state
        progressLogger = ProgressLogger(threshold: 0.01)

        return try await withCheckedThrowingContinuation { continuation in
            let session = URLSession(configuration: .default, delegate: self, delegateQueue: nil)
            let task = session.downloadTask(with: url)

            self.completionHandler = { [weak self] location, error in
                // One-shot: a later delegate callback must not resume the
                // continuation a second time.
                self?.completionHandler = nil
                // A session keeps a strong reference to its delegate until it
                // is invalidated; release it once the transfer has ended.
                session.finishTasksAndInvalidate()

                if let error = error {
                    continuation.resume(throwing: error)
                    return
                }
                guard let location = location else {
                    continuation.resume(throwing: ImageError.downloadFailed)
                    return
                }

                do {
                    // Remove existing file if it exists
                    if FileManager.default.fileExists(atPath: downloadPath.path) {
                        try FileManager.default.removeItem(at: downloadPath)
                    }
                    // The temporary file only lives until the delegate method
                    // returns, so it is moved synchronously here.
                    try FileManager.default.moveItem(at: location, to: downloadPath)
                    Logger.info("Download completed and moved to: \(downloadPath.path)")
                    continuation.resume(returning: Path(downloadPath.path))
                } catch {
                    continuation.resume(throwing: error)
                }
            }

            task.resume()
        }
    }

    func urlSession(
        _ session: URLSession,
        downloadTask: URLSessionDownloadTask,
        didWriteData bytesWritten: Int64,
        totalBytesWritten: Int64,
        totalBytesExpectedToWrite: Int64
    ) {
        // The expected size is negative when the server does not announce a
        // length; no meaningful fraction can be computed in that case.
        guard totalBytesExpectedToWrite > 0 else { return }
        let progress = Double(totalBytesWritten) / Double(totalBytesExpectedToWrite)
        progressLogger.logProgress(current: progress, context: "Downloading IPSW")
    }

    func urlSession(
        _ session: URLSession,
        downloadTask: URLSessionDownloadTask,
        didFinishDownloadingTo location: URL
    ) {
        // Call the stored completion handler
        completionHandler?(location, nil)
    }

    func urlSession(_ session: URLSession, task: URLSessionTask, didCompleteWithError error: Error?)
    {
        // Call the stored completion handler with an error if it occurred
        if let error = error {
            completionHandler?(nil, error)
        }
    }
}
"VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Set up Python uses: actions/setup-python@v4 with: python-version: "3.11" - name: Update dependencies to latest versions id: update-deps run: | cd libs/python/computer # Install required package for PyPI API access pip install requests # Create a more robust Python script for PyPI version checking cat > get_latest_versions.py << 'EOF' import requests import json import sys def get_package_version(package_name, fallback="0.1.0"): try: response = requests.get(f'https://pypi.org/pypi/{package_name}/json') print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr) if response.status_code != 200: print(f"API request failed for {package_name}, using fallback version", file=sys.stderr) return fallback data = json.loads(response.text) if 'info' not in data: print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr) return fallback return data['info']['version'] except Exception as e: print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr) return fallback # Get latest versions print(get_package_version('cua-core')) EOF # Execute the script to get the versions VERSIONS=($(python get_latest_versions.py)) LATEST_CORE=${VERSIONS[0]} echo "Latest cua-core version: $LATEST_CORE" # Output the versions for the next job echo "core_version=$LATEST_CORE" >> $GITHUB_OUTPUT # Determine major version for version constraint CORE_MAJOR=$(echo $LATEST_CORE | cut -d. 
# CUA Docker XFCE Container
# Vanilla XFCE desktop with noVNC and computer-server

FROM ubuntu:22.04

# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive

# Set environment variables
ENV HOME=/home/cua
ENV DISPLAY=:1
ENV VNC_PORT=5901
ENV NOVNC_PORT=6901
ENV API_PORT=8000
ENV VNC_RESOLUTION=1024x768
ENV VNC_COL_DEPTH=24

# Install system dependencies first (including sudo)
RUN apt-get update && apt-get install -y \
    # System utilities
    sudo \
    # Desktop environment
    xfce4 \
    xfce4-terminal \
    dbus-x11 \
    # VNC server
    tigervnc-standalone-server \
    tigervnc-common \
    # noVNC dependencies
    python3 \
    python3-pip \
    python3-numpy \
    git \
    net-tools \
    netcat \
    supervisor \
    # Computer-server dependencies
    python3-tk \
    python3-dev \
    gnome-screenshot \
    wmctrl \
    ffmpeg \
    socat \
    xclip \
    # Browser
    wget \
    software-properties-common \
    # Build tools
    build-essential \
    libncursesw5-dev \
    libssl-dev \
    libsqlite3-dev \
    tk-dev \
    libgdbm-dev \
    libc6-dev \
    libbz2-dev \
    libffi-dev \
    zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Remove screensavers and power manager to avoid popups and lock screens.
# "|| true" keeps the build going when some of these packages are not installed.
RUN apt-get remove -y \
    xfce4-power-manager \
    xfce4-power-manager-data \
    xfce4-power-manager-plugins \
    xfce4-screensaver \
    light-locker \
    xscreensaver \
    xscreensaver-data || true

# Create user after sudo is installed (password "cua", passwordless sudo)
RUN useradd -m -s /bin/bash -G sudo cua && \
    echo "cua:cua" | chpasswd && \
    echo "cua ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Install Firefox from Mozilla PPA (snap-free) - inline to avoid script issues.
# NOTE: the echo calls rely on the image's /bin/sh expanding "\n" into newlines.
RUN apt-get update && \
    add-apt-repository -y ppa:mozillateam/ppa && \
    echo 'Package: *\nPin: release o=LP-PPA-mozillateam\nPin-Priority: 1001' > /etc/apt/preferences.d/mozilla-firefox && \
    apt-get update && \
    apt-get install -y firefox && \
    echo 'pref("datareporting.policy.firstRunURL", "");\npref("datareporting.policy.dataSubmissionEnabled", false);\npref("datareporting.healthreport.service.enabled", false);\npref("datareporting.healthreport.uploadEnabled", false);\npref("trailhead.firstrun.branches", "nofirstrun-empty");\npref("browser.aboutwelcome.enabled", false);' > /usr/lib/firefox/browser/defaults/preferences/firefox.js && \
    update-alternatives --install /usr/bin/x-www-browser x-www-browser /usr/bin/firefox 100 && \
    update-alternatives --install /usr/bin/gnome-www-browser gnome-www-browser /usr/bin/firefox 100 && \
    rm -rf /var/lib/apt/lists/*

# Install noVNC. Only the current tree is needed at runtime, so shallow clones
# keep the repository history out of the image layer.
RUN git clone --depth 1 https://github.com/novnc/noVNC.git /opt/noVNC && \
    git clone --depth 1 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html

# Pre-create cache directory with correct ownership before pip install
RUN mkdir -p /home/cua/.cache && \
    chown -R cua:cua /home/cua/.cache

# Install computer-server. --no-cache-dir avoids storing pip's download cache
# (written as root under $HOME) in the image.
RUN pip3 install --no-cache-dir cua-computer-server

# Fix any cache files created by pip
RUN chown -R cua:cua /home/cua/.cache

# Copy startup scripts
COPY src/supervisor/ /etc/supervisor/conf.d/
COPY src/scripts/ /usr/local/bin/

# Make scripts executable
RUN chmod +x /usr/local/bin/*.sh

# Setup VNC: the following steps run as the desktop user so that the files
# they create under $HOME are owned by cua.
USER cua
WORKDIR /home/cua

# Create VNC directory (no password needed with SecurityTypes None)
RUN mkdir -p $HOME/.vnc

# Configure XFCE for first start
RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart

# Copy XFCE config to disable browser launching and welcome screens
COPY --chown=cua:cua src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc
COPY --chown=cua:cua src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml
COPY --chown=cua:cua src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml

# Disable autostart for screensaver, lock screen, and power manager
RUN echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \
    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-screensaver.desktop && \
    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/light-locker.desktop && \
    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-power-manager.desktop && \
    chown -R cua:cua $HOME/.config

# Create storage and shared directories, and Firefox cache directory
RUN mkdir -p $HOME/storage $HOME/shared $HOME/.cache/dconf $HOME/.mozilla/firefox && \
    chown -R cua:cua $HOME/storage $HOME/shared $HOME/.cache $HOME/.mozilla $HOME/.vnc

# Switch back to root so that supervisord is started as root
USER root

# Expose ports
EXPOSE $VNC_PORT $NOVNC_PORT $API_PORT

# Start services via supervisor
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments for the Computer API server.

    Args:
        args: Argument strings to parse. When ``None``, argparse reads the
            process arguments.

    Returns:
        Namespace with ``host``, ``port``, ``log_level``, ``ssl_keyfile``,
        ``ssl_certfile``, ``watchdog``, ``watchdog_interval`` and
        ``no_restart``.
    """
    parser = argparse.ArgumentParser(description="Start the Computer API server")
    parser.add_argument(
        "--host", default="0.0.0.0", help="Host to bind the server to (default: 0.0.0.0)"
    )
    parser.add_argument(
        "--port", type=int, default=8000, help="Port to bind the server to (default: 8000)"
    )
    parser.add_argument(
        "--log-level",
        choices=["debug", "info", "warning", "error", "critical"],
        default="info",
        help="Logging level (default: info)",
    )
    parser.add_argument(
        "--ssl-keyfile",
        type=str,
        help="Path to SSL private key file (enables HTTPS)",
    )
    parser.add_argument(
        "--ssl-certfile",
        type=str,
        help="Path to SSL certificate file (enables HTTPS)",
    )
    parser.add_argument(
        "--watchdog",
        action="store_true",
        help="Enable watchdog monitoring (automatically enabled if CONTAINER_NAME env var is set)",
    )
    parser.add_argument(
        "--watchdog-interval",
        type=int,
        default=30,
        help="Watchdog ping interval in seconds (default: 30)",
    )
    parser.add_argument(
        "--no-restart",
        action="store_true",
        help="Disable automatic server restart in watchdog",
    )

    return parser.parse_args(args)


def main() -> None:
    """Main entry point for the CLI.

    Configures logging, optionally starts the watchdog in a daemon thread and
    then runs the server in the foreground. Exits with status 0 on Ctrl-C and
    status 1 when the server raises.
    """
    args = parse_args()

    # Configure logging
    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # The watchdog is requested either explicitly (--watchdog) or implicitly
    # by CONTAINER_NAME being set, but it is never started on Windows.
    container_name = os.environ.get("CONTAINER_NAME")
    watchdog_requested = args.watchdog or bool(container_name)
    enable_watchdog = watchdog_requested and not sys.platform.startswith("win")

    if watchdog_requested and not enable_watchdog:
        # Do not claim the watchdog is being enabled when the platform check
        # has just turned it off.
        logger.warning("Watchdog requested but not started: it is disabled on Windows")
    elif container_name:
        logger.info(
            f"Container environment detected (CONTAINER_NAME={container_name}), enabling watchdog"
        )
    elif args.watchdog:
        logger.info("Watchdog explicitly enabled via --watchdog flag")

    # Start watchdog if enabled
    if enable_watchdog:
        logger.info(f"Starting watchdog monitoring with {args.watchdog_interval}s interval")

        def run_watchdog_thread():
            """Run watchdog in a separate thread with its own event loop."""
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                # Create CLI args dict for watchdog
                cli_args = {
                    'host': args.host,
                    'port': args.port,
                    'log_level': args.log_level,
                    'ssl_keyfile': args.ssl_keyfile,
                    'ssl_certfile': args.ssl_certfile
                }

                # Create watchdog with restart settings
                from .watchdog import Watchdog
                watchdog = Watchdog(
                    cli_args=cli_args,
                    ping_interval=args.watchdog_interval
                )
                watchdog.restart_enabled = not args.no_restart

                loop.run_until_complete(watchdog.start_monitoring())
            except Exception as e:
                # Keep the traceback: this thread has no other reporting path.
                logger.exception(f"Watchdog error: {e}")
            finally:
                loop.close()

        # Start watchdog in background thread; daemon=True so it never keeps
        # the process alive after the server stops.
        watchdog_thread = threading.Thread(
            target=run_watchdog_thread,
            daemon=True,
            name="watchdog"
        )
        watchdog_thread.start()

    # Create and start the server
    logger.info(f"Starting CUA Computer API server on {args.host}:{args.port}...")

    # Handle SSL configuration: HTTPS needs both the key and the certificate.
    ssl_args = {}
    if args.ssl_keyfile and args.ssl_certfile:
        ssl_args = {
            "ssl_keyfile": args.ssl_keyfile,
            "ssl_certfile": args.ssl_certfile,
        }
        logger.info("HTTPS mode enabled with SSL certificates")
    elif args.ssl_keyfile or args.ssl_certfile:
        logger.warning("Both --ssl-keyfile and --ssl-certfile are required for HTTPS. Running in HTTP mode.")
    else:
        logger.info("HTTP mode (no SSL certificates provided)")

    server = Server(host=args.host, port=args.port, log_level=args.log_level, **ssl_args)

    try:
        server.start()
    except KeyboardInterrupt:
        logger.info("Server stopped by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Error starting server: {e}")
        sys.exit(1)
base64.b64encode(screenshot_bytes).decode('utf-8') async def click(self, x: int, y: int, button: str = "left") -> None: """Click at coordinates with specified button.""" assert self.interface is not None if button == "left": await self.interface.left_click(x, y) elif button == "right": await self.interface.right_click(x, y) else: # Default to left click for unknown buttons await self.interface.left_click(x, y) async def double_click(self, x: int, y: int) -> None: """Double click at coordinates.""" assert self.interface is not None await self.interface.double_click(x, y) async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: """Scroll at coordinates with specified scroll amounts.""" assert self.interface is not None await self.interface.move_cursor(x, y) await self.interface.scroll(scroll_x, scroll_y) async def type(self, text: str) -> None: """Type text.""" assert self.interface is not None await self.interface.type_text(text) async def wait(self, ms: int = 1000) -> None: """Wait for specified milliseconds.""" assert self.interface is not None import asyncio await asyncio.sleep(ms / 1000.0) async def move(self, x: int, y: int) -> None: """Move cursor to coordinates.""" assert self.interface is not None await self.interface.move_cursor(x, y) async def keypress(self, keys: Union[List[str], str]) -> None: """Press key combination.""" assert self.interface is not None if isinstance(keys, str): keys = keys.replace("-", "+").split("+") if len(keys) == 1: await self.interface.press_key(keys[0]) else: # Handle key combinations await self.interface.hotkey(*keys) async def drag(self, path: List[Dict[str, int]]) -> None: """Drag along specified path.""" assert self.interface is not None if not path: return # Start drag from first point start = path[0] await self.interface.mouse_down(start["x"], start["y"]) # Move through path for point in path[1:]: await self.interface.move_cursor(point["x"], point["y"]) # End drag at last point end = path[-1] await 
self.interface.mouse_up(end["x"], end["y"]) async def get_current_url(self) -> str: """Get current URL (for browser environments).""" # This would need to be implemented based on the specific browser interface # For now, return empty string return "" # ==== Anthropic Computer Action Space ==== async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: """Left mouse down at coordinates.""" assert self.interface is not None await self.interface.mouse_down(x, y, button="left") async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: """Left mouse up at coordinates.""" assert self.interface is not None await self.interface.mouse_up(x, y, button="left") ``` -------------------------------------------------------------------------------- /docs/content/docs/computer-sdk/cloud-vm-management.mdx: -------------------------------------------------------------------------------- ```markdown --- title: Cloud VM Management description: Manage your Cua Cloud sandboxes (VMs) via Python SDK or HTTP API --- import { Tab, Tabs } from 'fumadocs-ui/components/tabs'; Use these concise examples to manage your cloud sandboxes. Pick either the Python SDK or plain HTTP (curl) for each action. > You need a CUA Database API key. Set it as an environment variable `CUA_API_KEY`. 
## Status values - `pending` – VM deployment in progress - `running` – VM is active and accessible - `stopped` – VM is stopped but not terminated - `terminated` – VM has been permanently destroyed - `failed` – VM deployment or operation failed --- ## List VMs <Tabs items={["Python", "curl"]}> <Tab value="Python"> ```python import os import asyncio from computer.providers.cloud.provider import CloudProvider async def main(): api_key = os.getenv("CUA_API_KEY") or "your-api-key" # Optional: point to a different API base # os.environ["CUA_API_BASE"] = "https://api.cua.ai" provider = CloudProvider(api_key=api_key, verbose=False) async with provider: vms = await provider.list_vms() for vm in vms: print({ "name": vm["name"], "status": vm["status"], "api_url": vm.get("api_url"), "vnc_url": vm.get("vnc_url"), }) if __name__ == "__main__": asyncio.run(main()) ``` </Tab> <Tab value="curl"> ```bash curl -H "Authorization: Bearer $CUA_API_KEY" \ "https://api.cua.ai/v1/vms" ``` Example response: ```json [ { "name": "s-windows-x4snp46ebf", "status": "running" } ] ``` </Tab> </Tabs> --- ## Start a VM Provide the VM name you want to start. <Tabs items={["Python", "curl"]}> <Tab value="Python"> ```python import os import asyncio from computer.providers.cloud.provider import CloudProvider async def main(): api_key = os.getenv("CUA_API_KEY") or "your-api-key" name = "my-vm-name" # e.g., "m-linux-96lcxd2c2k" provider = CloudProvider(api_key=api_key) async with provider: resp = await provider.run_vm(name) print(resp) # { "name": name, "status": "starting" } if __name__ == "__main__": asyncio.run(main()) ``` </Tab> <Tab value="curl"> ```bash curl -X POST \ -H "Authorization: Bearer $CUA_API_KEY" \ "https://api.cua.ai/v1/vms/my-vm-name/start" -i ``` Example response headers (no body): ```text HTTP/1.1 204 No Content ``` </Tab> </Tabs> --- ## Stop a VM Stops the VM asynchronously. 
<Tabs items={["Python", "curl"]}> <Tab value="Python"> ```python import os import asyncio from computer.providers.cloud.provider import CloudProvider async def main(): api_key = os.getenv("CUA_API_KEY") or "your-api-key" name = "my-vm-name" provider = CloudProvider(api_key=api_key) async with provider: resp = await provider.stop_vm(name) print(resp) # { "name": name, "status": "stopping" } if __name__ == "__main__": asyncio.run(main()) ``` </Tab> <Tab value="curl"> ```bash curl -X POST \ -H "Authorization: Bearer $CUA_API_KEY" \ "https://api.cua.ai/v1/vms/my-vm-name/stop" ``` Example response: ```json { "status": "stopping" } ``` </Tab> </Tabs> --- ## Restart a VM Restarts the VM asynchronously. <Tabs items={["Python", "curl"]}> <Tab value="Python"> ```python import os import asyncio from computer.providers.cloud.provider import CloudProvider async def main(): api_key = os.getenv("CUA_API_KEY") or "your-api-key" name = "my-vm-name" provider = CloudProvider(api_key=api_key) async with provider: resp = await provider.restart_vm(name) print(resp) # { "name": name, "status": "restarting" } if __name__ == "__main__": asyncio.run(main()) ``` </Tab> <Tab value="curl"> ```bash curl -X POST \ -H "Authorization: Bearer $CUA_API_KEY" \ "https://api.cua.ai/v1/vms/my-vm-name/restart" ``` Example response: ```json { "status": "restarting" } ``` </Tab> </Tabs> --- ## Query a VM by name Query the computer-server running on the VM. Useful for checking details like status or OS type. 
<Tabs items={["Python", "curl"]}> <Tab value="Python"> ```python import os import asyncio from computer.providers.cloud.provider import CloudProvider async def main(): api_key = os.getenv("CUA_API_KEY") or "your-api-key" name = "my-vm-name" provider = CloudProvider(api_key=api_key) async with provider: info = await provider.get_vm(name) print(info) if __name__ == "__main__": asyncio.run(main()) ``` </Tab> <Tab value="curl"> ```bash curl "https://my-vm-name.containers.cloud.cua.ai:8443/status" ``` Example response: ```json { "status": "ok", "os_type": "linux", "features": ["agent"] } ``` </Tab> </Tabs> ``` -------------------------------------------------------------------------------- /examples/computer_examples_windows.py: -------------------------------------------------------------------------------- ```python import os import asyncio from pathlib import Path import sys import traceback # Load environment variables from .env file project_root = Path(__file__).parent.parent env_file = project_root / ".env" print(f"Loading environment from: {env_file}") from computer.helpers import sandboxed from dotenv import load_dotenv load_dotenv(env_file) # Add paths to sys.path if needed pythonpath = os.environ.get("PYTHONPATH", "") for path in pythonpath.split(":"): if path and path not in sys.path: sys.path.insert(0, path) # Insert at beginning to prioritize print(f"Added to sys.path: {path}") from computer.computer import Computer from computer.providers.base import VMProviderType from computer.logger import LogLevel # ANSI color codes RED = '\033[91m' RESET = '\033[0m' async def main(): try: print("\n=== Using direct initialization ===") # Create a remote Windows computer with Cua computer = Computer( os_type="windows", api_key=os.getenv("CUA_API_KEY"), name=os.getenv("CONTAINER_NAME") or "", provider_type=VMProviderType.CLOUD, ) try: # Run the computer with default parameters await computer.run() # Create output directory if it doesn't exist output_dir = 
Path("./output") output_dir.mkdir(exist_ok=True) # Keyboard Actions Examples print("\n=== Keyboard Actions ===") await computer.interface.type_text("Hello, World!") await computer.interface.press_key("enter") # Mouse Actions Examples print("\n=== Mouse Actions ===") await computer.interface.move_cursor(100, 100) await computer.interface.left_click() await computer.interface.double_click(400, 400) await computer.interface.right_click(300, 300) print("\n=== RPC ===") await computer.venv_install("demo_venv", ["mss"]) @sandboxed("demo_venv") def greet_and_print(name): from mss import mss import os # get username username = os.getlogin() print(f"Hello from inside the container, {name}!") print("Username:", username) print("Screens:", mss().monitors) # take a screenshot with mss() as sct: filename = sct.shot(mon=-1, output='C:/Users/azureuser/Desktop/fullscreen.png') print(filename) return {"greeted": name, "username": username} # Call with args and kwargs result = await greet_and_print("John Doe") print("Result from sandboxed function:", result) # Command Actions Examples print("\n=== Command Actions ===") result = await computer.interface.run_command("notepad") print("Result from command:", result) screenshot = await computer.interface.screenshot() screenshot_path = output_dir / "screenshot.png" with open(screenshot_path, "wb") as f: f.write(screenshot) print(f"Screenshot saved to: {screenshot_path.absolute()}") # Clipboard Actions Examples print("\n=== Clipboard Actions ===") await computer.interface.set_clipboard("Test clipboard") content = await computer.interface.copy_to_clipboard() print(f"Clipboard content: {content}") # Simple REPL Loop print("\n=== Command REPL ===") print("Enter commands to run on the remote computer.") print("Type 'exit' or 'quit' to leave the REPL.\n") while True: try: # Get command from user command = input("command> ").strip() # Check for exit commands if command.lower() in ['exit', 'quit', '']: if command.lower() in ['exit', 'quit']: 
/// Protocol defining the interface for VNC server operations
@MainActor
protocol VNCService {
    /// URL of the saved VNC session, if any.
    var url: String? { get }
    func start(port: Int, virtualMachine: Any?) async throws
    func stop()
    func openClient(url: String) async throws
}

/// Default implementation of VNCService
@MainActor
final class DefaultVNCService: VNCService {
    private var vncServer: Any?
    private let vmDirectory: VMDirectory

    init(vmDirectory: VMDirectory) {
        self.vmDirectory = vmDirectory
    }

    /// URL stored in the VM directory's session, or nil if it cannot be loaded.
    var url: String? {
        get {
            return try? vmDirectory.loadSession().url
        }
    }

    /// Starts a password-protected VNC server and saves its local URL as the session.
    ///
    /// - Parameters:
    ///   - port: Port to bind; `0` lets the server pick one.
    ///   - virtualMachine: Attached to the server when it is a `VZVirtualMachine`.
    /// - Throws: `VMError.vncPortBindingFailed` when a specific port was requested
    ///   and not obtained, `VMError.internalError` on timeout, or any error from
    ///   saving the session. On every failure the server is stopped again so a
    ///   failed start does not leave a listener behind.
    func start(port: Int, virtualMachine: Any?) async throws {
        let password = Array(PassphraseGenerator().prefix(4)).joined(separator: "-")
        let securityConfiguration = Dynamic._VZVNCAuthenticationSecurityConfiguration(password: password)

        // Create VNC server with specified port
        let server = Dynamic._VZVNCServer(port: port, queue: DispatchQueue.main,
                                          securityConfiguration: securityConfiguration)
        if let vm = virtualMachine as? VZVirtualMachine {
            server.virtualMachine = vm
        }
        server.start()
        vncServer = server

        do {
            let assignedPort = try await waitForAssignedPort(server: server, requestedPort: port)

            // Get the local IP address for the URL - prefer IPv4
            let hostIP = try getLocalIPAddress() ?? "127.0.0.1"
            let url = "vnc://:\(password)@127.0.0.1:\(assignedPort)"  // Use localhost for local connections
            let externalUrl = "vnc://:\(password)@\(hostIP):\(assignedPort)"  // External URL for remote connections

            // NOTE(review): both URLs embed the VNC password and are written to the log.
            Logger.info("VNC server started", metadata: [
                "local": url,
                "external": externalUrl
            ])

            // Save session information with local URL for the client
            let session = VNCSession(url: url)
            try vmDirectory.saveSession(session)
        } catch {
            // Undo the start so a failed call leaves no running server behind.
            server.stop()
            vncServer = nil
            throw error
        }
    }

    /// Polls the server until it reports a non-zero port (both for auto-assign
    /// and specific-port requests): up to 20 checks, 50ms apart (about 1 second).
    private func waitForAssignedPort(server: Dynamic, requestedPort: Int) async throws -> UInt16 {
        let maxAttempts = 20
        let pollIntervalNanoseconds: UInt64 = 50_000_000

        for attempt in 1...maxAttempts {
            if let assignedPort: UInt16 = server.port.asUInt16, assignedPort != 0 {
                // For specific port requests, verify we got the requested port
                if requestedPort != 0 && Int(assignedPort) != requestedPort {
                    throw VMError.vncPortBindingFailed(requested: requestedPort, actual: Int(assignedPort))
                }
                return assignedPort
            }
            if attempt < maxAttempts {
                try await Task.sleep(nanoseconds: pollIntervalNanoseconds)
            }
        }

        // If we've timed out and we requested a specific port, it likely means binding failed
        if requestedPort != 0 {
            throw VMError.vncPortBindingFailed(requested: requestedPort, actual: -1)
        }
        throw VMError.internalError("Timeout waiting for VNC server to start")
    }

    /// Returns the numeric IPv4 address of interface `en0`, or nil when it
    /// cannot be determined.
    private func getLocalIPAddress() throws -> String? {
        var ifaddr: UnsafeMutablePointer<ifaddrs>?
        guard getifaddrs(&ifaddr) == 0 else {
            return nil
        }
        defer { freeifaddrs(ifaddr) }

        var cursor = ifaddr
        while let entry = cursor {
            defer { cursor = entry.pointee.ifa_next }
            let interface = entry.pointee

            // ifa_addr can be NULL for some interfaces; skip those instead of dereferencing.
            guard let addr = interface.ifa_addr,
                  addr.pointee.sa_family == UInt8(AF_INET) else {
                continue
            }
            // Primary interface only
            guard let namePointer = interface.ifa_name,
                  String(cString: namePointer) == "en0" else {
                continue
            }

            var hostname = [CChar](repeating: 0, count: Int(NI_MAXHOST))
            let status = getnameinfo(addr, socklen_t(addr.pointee.sa_len),
                                     &hostname, socklen_t(hostname.count),
                                     nil, 0, NI_NUMERICHOST)
            // A failed lookup leaves the buffer empty; report "unknown" rather than "".
            guard status == 0 else {
                return nil
            }
            return String(cString: hostname, encoding: .utf8)
        }

        return nil
    }

    /// Stops the server (if one is running) and clears the saved session.
    func stop() {
        if let server = vncServer as? Dynamic {
            server.stop()
        }
        vncServer = nil
        vmDirectory.clearSession()
    }

    /// Opens the given URL with `/usr/bin/open`.
    func openClient(url: String) async throws {
        let processRunner = DefaultProcessRunner()
        try processRunner.run(executable: "/usr/bin/open", arguments: [url])
    }
}
// Import the AgentClient from the built library
import AgentClient from '/dist/index.js';

// Cached client; rebuilt whenever the URL field no longer matches.
let client = null;

// Module scope is not global, so expose the handlers used by the inline
// onclick="" attributes.
window.sendMessage = sendMessage;
window.checkHealth = checkHealth;
window.clearOutput = clearOutput;

/** Append a timestamped line to the output textarea and scroll to the end. */
function log(message) {
  const outputBox = document.getElementById('output');
  const stamp = new Date().toLocaleTimeString();
  outputBox.value += `[${stamp}] ${message}\n`;
  outputBox.scrollTop = outputBox.scrollHeight;
}

/**
 * Return a client for the URL currently in the form, creating a new one when
 * there is none yet or the URL changed. Returns null (after logging) when the
 * URL is empty or construction fails.
 */
function getClient() {
  const url = document.getElementById('url').value.trim();
  if (!url) {
    log('ERROR: Please enter a URL');
    return null;
  }

  const reusable = client && client.url === url;
  if (reusable) {
    return client;
  }

  try {
    client = new AgentClient(url);
    client.url = url; // Store URL for comparison
    log(`Created new client for: ${url}`);
  } catch (error) {
    log(`ERROR creating client: ${error.message}`);
    return null;
  }
  return client;
}

/** Validate the form, send the message to the agent and log the response. */
async function sendMessage() {
  const messageInput = document.getElementById('message');
  const modelInput = document.getElementById('model');
  const message = messageInput.value.trim();
  const model = modelInput.value.trim();

  if (!message) {
    log('ERROR: Please enter a message');
    return;
  }
  if (!model) {
    log('ERROR: Please enter a model');
    return;
  }

  const agentClient = getClient();
  if (!agentClient) {
    return;
  }

  try {
    log(`Sending message: "${message}"`);
    log(`Using model: ${model}`);

    const request = { model: model, input: message };

    log('Sending request...');
    const response = await agentClient.responses.create(request);

    log('Response received:');
    log(JSON.stringify(response, null, 2));

    // Clear the message input
    messageInput.value = '';
  } catch (error) {
    log(`ERROR: ${error.message}`);
  }
}

/** Query the agent's health endpoint and log its status. */
async function checkHealth() {
  const agentClient = getClient();
  if (!agentClient) {
    return;
  }

  try {
    log('Checking health...');
    const health = await agentClient.health();
    log(`Health status: ${health.status}`);
  } catch (error) {
    log(`ERROR checking health: ${error.message}`);
  }
}

/** Empty the output textarea. */
function clearOutput() {
  document.getElementById('output').value = '';
}

// Allow sending message with Enter key
document.getElementById('message').addEventListener('keypress', function (e) {
  if (e.key === 'Enter') {
    sendMessage();
  }
});

// Log initial message
log('CUA Agent Client Browser Example loaded');
log('Enter a URL (HTTP/HTTPS or peer://) and model, then send a message');
-190 -222 -230 -265 -37 -41 -70 -108 -70 -142 0 -16 10 -49 23 -73 17 -36 33 -51 79 -73 57 -29 57 -29 107 -12 44 14 63 31 149 128 54 62 122 141 151 177 30 36 57 67 60 70 12 10 157 175 179 204 33 43 31 150 -2 188 -56 64 -151 86 -211 50z"/> <path d="M2245 7400 c-188 -14 -374 -75 -585 -191 -222 -123 -464 -366 -577 -579 -13 -25 -28 -52 -33 -60 -74 -123 -137 -348 -161 -580 -10 -106 1 -310 22 -384 5 -17 9 -44 9 -60 0 -72 116 -366 181 -458 11 -14 19 -29 19 -33 0 -33 296 -355 326 -355 7 0 14 -4 16 -10 5 -17 139 -99 243 -150 106 -52 216 -91 303 -109 98 -20 92 -7 92 -215 0 -176 26 -472 50 -571 5 -22 12 -56 15 -75 8 -44 31 -129 56 -201 10 -31 19 -62 19 -69 0 -8 8 -32 19 -54 10 -23 30 -70 45 -106 76 -182 189 -363 319 -515 296 -344 701 -603 1162 -743 216 -66 521 -126 730 -143 335 -27 467 -31 653 -19 103 6 237 15 297 19 120 8 282 32 415 62 47 10 98 19 113 19 16 0 37 5 48 11 11 5 48 16 82 24 34 7 85 21 112 31 104 36 161 58 201 76 22 10 43 18 47 18 12 0 185 85 263 131 44 25 116 71 159 100 43 30 87 61 99 68 107 74 344 310 444 444 40 53 72 98 72 101 0 2 17 31 38 63 68 104 202 390 202 431 0 10 4 22 9 28 12 12 53 168 80 304 30 149 43 293 48 538 l5 214 33 14 c18 7 53 16 77 20 23 4 48 10 53 14 6 4 28 13 50 19 91 27 214 86 318 152 224 141 416 353 524 580 98 206 129 320 153 562 19 189 -20 467 -92 657 -144 382 -420 674 -811 859 -48 22 -93 41 -101 41 -7 0 -35 8 -62 19 -27 10 -92 29 -144 41 -84 20 -119 23 -325 22 -212 0 -238 -2 -330 -25 -55 -14 -131 -37 -170 -52 -38 -15 -84 -32 -101 -39 -18 -6 -38 -16 -45 -22 -8 -6 -27 -18 -44 -26 -79 -40 -121 -67 -205 -134 -69 -54 -225 -212 -255 -257 -21 -32 -26 -33 -84 -6 -25 12 -64 29 -86 40 -183 84 -514 183 -705 209 -41 6 -91 15 -110 20 -50 13 -318 30 -470 30 -159 0 -363 -16 -450 -35 -36 -8 -87 -17 -115 -20 -48 -7 -178 -36 -240 -55 -84 -26 -222 -71 -240 -79 -11 -4 -47 -19 -80 -31 -77 -30 -162 -66 -198 -85 -32 -17 -67 -20 -67 -6 0 16 -211 230 -274 279 -96 74 -124 92 -237 149 -204 102 -346 139 -569 146 -85 2 -200 1 -255 -3z m396 -331 c163 -33 302 -93 433 
-184 97 -68 232 -206 299 -307 32 -48 70 -94 85 -104 38 -25 155 -24 185 3 28 24 183 99 302 146 180 70 201 77 214 77 8 0 39 8 70 19 77 26 221 57 376 82 111 17 173 20 418 20 159 0 305 -5 325 -10 21 -5 71 -14 112 -21 178 -28 372 -81 590 -161 65 -24 225 -102 279 -137 48 -30 63 -34 118 -34 78 1 105 20 179 131 65 97 213 245 301 303 74 48 228 128 248 128 6 0 25 6 41 14 61 30 229 56 359 56 202 0 365 -39 550 -131 285 -142 521 -410 616 -699 108 -331 69 -692 -109 -995 -79 -134 -217 -274 -366 -369 -63 -40 -221 -116 -242 -116 -8 0 -28 -7 -44 -15 -16 -8 -55 -19 -87 -24 -230 -37 -274 -55 -306 -124 -15 -30 -16 -58 -7 -238 18 -382 -25 -716 -128 -994 -63 -171 -182 -380 -298 -523 -59 -74 -186 -204 -244 -251 -25 -20 -54 -44 -65 -54 -26 -24 -178 -128 -235 -161 -25 -14 -88 -46 -140 -72 -52 -25 -106 -51 -120 -58 -34 -18 -216 -80 -315 -107 -114 -31 -197 -48 -410 -85 -126 -21 -452 -46 -625 -48 -376 -3 -837 62 -1105 155 -16 6 -50 17 -75 24 -72 21 -256 98 -320 135 -8 5 -40 21 -70 36 -63 31 -172 103 -277 181 -199 148 -392 374 -504 588 -118 228 -190 479 -220 775 -11 113 -7 483 7 597 5 42 2 62 -15 96 -37 77 -60 86 -318 127 -29 4 -67 15 -84 24 -18 9 -41 16 -52 16 -10 0 -36 8 -56 18 -20 10 -58 30 -86 43 -139 67 -301 202 -395 329 -150 203 -229 445 -230 705 0 331 117 613 355 850 175 176 364 280 615 339 96 22 103 23 243 25 95 1 154 -4 228 -20z"/> <path d="M3464 5185 c-17 -8 -43 -28 -58 -45 l-26 -32 0 -265 c0 -249 1 -268 20 -298 38 -62 51 -65 244 -65 l175 0 36 34 37 35 -4 283 c-4 378 13 353 -253 362 -108 4 -147 2 -171 -9z"/> <path d="M6174 5171 c-12 -5 -31 -22 -43 -37 -22 -28 -22 -32 -19 -309 l3 -281 25 -31 25 -32 189 0 188 -1 41 40 40 40 -5 253 c-6 260 -10 288 -53 342 -15 18 -29 20 -193 22 -97 1 -187 -2 -198 -6z"/> <path d="M4935 5079 c-199 -25 -341 -112 -454 -278 -49 -71 -134 -238 -151 -296 -7 -22 -21 -59 -31 -83 -11 -23 -19 -50 -19 -60 0 -9 -7 -37 -15 -60 -9 -24 -20 -69 -25 -100 -5 -32 -16 -93 -25 -137 -12 -59 -16 -144 -17 -325 -1 -238 0 -247 25 -321 63 -188 164 -313 318 -394 86 -45 137 -61 274 -85 
236 -42 492 -10 651 81 238 137 348 357 348 699 0 89 -21 335 -34 390 -6 25 -15 70 -20 100 -5 30 -15 71 -21 90 -6 19 -15 51 -19 70 -24 100 -107 282 -186 406 -59 94 -167 193 -265 242 -46 23 -93 42 -104 42 -12 0 -25 4 -30 9 -15 13 -132 19 -200 10z"/> </g> </svg> ``` -------------------------------------------------------------------------------- /docs/src/assets/logo-white.svg: -------------------------------------------------------------------------------- ``` <?xml version="1.0" standalone="no"?> <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 20010904//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd"> <svg version="1.0" xmlns="http://www.w3.org/2000/svg" width="1000.000000pt" height="1000.000000pt" viewBox="0 0 1000.000000 1000.000000" preserveAspectRatio="xMidYMid meet"> <g transform="translate(0.000000,1000.000000) scale(0.100000,-0.100000)" fill="#ffffff" stroke="none"> <path d="M4934 9086 c-40 -14 -62 -33 -80 -69 -22 -42 -21 -994 1 -1037 38 -73 174 -101 243 -50 19 14 43 42 53 62 18 35 19 65 19 510 0 471 0 473 -23 513 -38 69 -133 101 -213 71z"/> <path d="M3702 8472 c-52 -28 -82 -81 -82 -147 0 -67 8 -80 125 -210 44 -49 107 -121 139 -160 165 -196 233 -268 278 -291 58 -29 66 -30 124 -2 67 31 104 86 104 154 0 60 -14 82 -149 235 -42 47 -95 108 -117 135 -23 27 -52 61 -65 75 -13 14 -57 65 -98 112 -41 47 -89 93 -107 102 -42 20 -111 19 -152 -3z"/> <path d="M6145 8472 c-29 -18 -136 -133 -235 -252 -53 -64 -190 -222 -230 -265 -37 -41 -70 -108 -70 -142 0 -16 10 -49 23 -73 17 -36 33 -51 79 -73 57 -29 57 -29 107 -12 44 14 63 31 149 128 54 62 122 141 151 177 30 36 57 67 60 70 12 10 157 175 179 204 33 43 31 150 -2 188 -56 64 -151 86 -211 50z"/> <path d="M2245 7400 c-188 -14 -374 -75 -585 -191 -222 -123 -464 -366 -577 -579 -13 -25 -28 -52 -33 -60 -74 -123 -137 -348 -161 -580 -10 -106 1 -310 22 -384 5 -17 9 -44 9 -60 0 -72 116 -366 181 -458 11 -14 19 -29 19 -33 0 -33 296 -355 326 -355 7 0 14 -4 16 -10 5 -17 139 -99 243 -150 106 -52 216 -91 303 -109 98 -20 92 -7 92 -215 0 -176 
26 -472 50 -571 5 -22 12 -56 15 -75 8 -44 31 -129 56 -201 10 -31 19 -62 19 -69 0 -8 8 -32 19 -54 10 -23 30 -70 45 -106 76 -182 189 -363 319 -515 296 -344 701 -603 1162 -743 216 -66 521 -126 730 -143 335 -27 467 -31 653 -19 103 6 237 15 297 19 120 8 282 32 415 62 47 10 98 19 113 19 16 0 37 5 48 11 11 5 48 16 82 24 34 7 85 21 112 31 104 36 161 58 201 76 22 10 43 18 47 18 12 0 185 85 263 131 44 25 116 71 159 100 43 30 87 61 99 68 107 74 344 310 444 444 40 53 72 98 72 101 0 2 17 31 38 63 68 104 202 390 202 431 0 10 4 22 9 28 12 12 53 168 80 304 30 149 43 293 48 538 l5 214 33 14 c18 7 53 16 77 20 23 4 48 10 53 14 6 4 28 13 50 19 91 27 214 86 318 152 224 141 416 353 524 580 98 206 129 320 153 562 19 189 -20 467 -92 657 -144 382 -420 674 -811 859 -48 22 -93 41 -101 41 -7 0 -35 8 -62 19 -27 10 -92 29 -144 41 -84 20 -119 23 -325 22 -212 0 -238 -2 -330 -25 -55 -14 -131 -37 -170 -52 -38 -15 -84 -32 -101 -39 -18 -6 -38 -16 -45 -22 -8 -6 -27 -18 -44 -26 -79 -40 -121 -67 -205 -134 -69 -54 -225 -212 -255 -257 -21 -32 -26 -33 -84 -6 -25 12 -64 29 -86 40 -183 84 -514 183 -705 209 -41 6 -91 15 -110 20 -50 13 -318 30 -470 30 -159 0 -363 -16 -450 -35 -36 -8 -87 -17 -115 -20 -48 -7 -178 -36 -240 -55 -84 -26 -222 -71 -240 -79 -11 -4 -47 -19 -80 -31 -77 -30 -162 -66 -198 -85 -32 -17 -67 -20 -67 -6 0 16 -211 230 -274 279 -96 74 -124 92 -237 149 -204 102 -346 139 -569 146 -85 2 -200 1 -255 -3z m396 -331 c163 -33 302 -93 433 -184 97 -68 232 -206 299 -307 32 -48 70 -94 85 -104 38 -25 155 -24 185 3 28 24 183 99 302 146 180 70 201 77 214 77 8 0 39 8 70 19 77 26 221 57 376 82 111 17 173 20 418 20 159 0 305 -5 325 -10 21 -5 71 -14 112 -21 178 -28 372 -81 590 -161 65 -24 225 -102 279 -137 48 -30 63 -34 118 -34 78 1 105 20 179 131 65 97 213 245 301 303 74 48 228 128 248 128 6 0 25 6 41 14 61 30 229 56 359 56 202 0 365 -39 550 -131 285 -142 521 -410 616 -699 108 -331 69 -692 -109 -995 -79 -134 -217 -274 -366 -369 -63 -40 -221 -116 -242 -116 -8 0 -28 -7 -44 -15 -16 -8 -55 -19 -87 -24 -230 -37 -274 
-55 -306 -124 -15 -30 -16 -58 -7 -238 18 -382 -25 -716 -128 -994 -63 -171 -182 -380 -298 -523 -59 -74 -186 -204 -244 -251 -25 -20 -54 -44 -65 -54 -26 -24 -178 -128 -235 -161 -25 -14 -88 -46 -140 -72 -52 -25 -106 -51 -120 -58 -34 -18 -216 -80 -315 -107 -114 -31 -197 -48 -410 -85 -126 -21 -452 -46 -625 -48 -376 -3 -837 62 -1105 155 -16 6 -50 17 -75 24 -72 21 -256 98 -320 135 -8 5 -40 21 -70 36 -63 31 -172 103 -277 181 -199 148 -392 374 -504 588 -118 228 -190 479 -220 775 -11 113 -7 483 7 597 5 42 2 62 -15 96 -37 77 -60 86 -318 127 -29 4 -67 15 -84 24 -18 9 -41 16 -52 16 -10 0 -36 8 -56 18 -20 10 -58 30 -86 43 -139 67 -301 202 -395 329 -150 203 -229 445 -230 705 0 331 117 613 355 850 175 176 364 280 615 339 96 22 103 23 243 25 95 1 154 -4 228 -20z"/> <path d="M3464 5185 c-17 -8 -43 -28 -58 -45 l-26 -32 0 -265 c0 -249 1 -268 20 -298 38 -62 51 -65 244 -65 l175 0 36 34 37 35 -4 283 c-4 378 13 353 -253 362 -108 4 -147 2 -171 -9z"/> <path d="M6174 5171 c-12 -5 -31 -22 -43 -37 -22 -28 -22 -32 -19 -309 l3 -281 25 -31 25 -32 189 0 188 -1 41 40 40 40 -5 253 c-6 260 -10 288 -53 342 -15 18 -29 20 -193 22 -97 1 -187 -2 -198 -6z"/> <path d="M4935 5079 c-199 -25 -341 -112 -454 -278 -49 -71 -134 -238 -151 -296 -7 -22 -21 -59 -31 -83 -11 -23 -19 -50 -19 -60 0 -9 -7 -37 -15 -60 -9 -24 -20 -69 -25 -100 -5 -32 -16 -93 -25 -137 -12 -59 -16 -144 -17 -325 -1 -238 0 -247 25 -321 63 -188 164 -313 318 -394 86 -45 137 -61 274 -85 236 -42 492 -10 651 81 238 137 348 357 348 699 0 89 -21 335 -34 390 -6 25 -15 70 -20 100 -5 30 -15 71 -21 90 -6 19 -15 51 -19 70 -24 100 -107 282 -186 406 -59 94 -167 193 -265 242 -46 23 -93 42 -104 42 -12 0 -25 4 -30 9 -15 13 -132 19 -200 10z"/> </g> </svg> ``` -------------------------------------------------------------------------------- /scripts/build.ps1: -------------------------------------------------------------------------------- ``` # PowerShell Build Script for CUA # Exit on error $ErrorActionPreference = "Stop" # Colors for output $RED = "Red" $GREEN = 
"Green" $BLUE = "Blue" # Function to print step information function Print-Step { param([string]$Message) Write-Host "==> $Message" -ForegroundColor $BLUE } # Function to print success message function Print-Success { param([string]$Message) Write-Host "==> Success: $Message" -ForegroundColor $GREEN } # Function to print error message function Print-Error { param([string]$Message) Write-Host "==> Error: $Message" -ForegroundColor $RED } # Get the script's directory and project root $SCRIPT_DIR = Split-Path -Parent $MyInvocation.MyCommand.Path $PROJECT_ROOT = Split-Path -Parent $SCRIPT_DIR # Change to project root Set-Location $PROJECT_ROOT # Load environment variables from .env.local if (Test-Path ".env.local") { Print-Step "Loading environment variables from .env.local..." Get-Content ".env.local" | ForEach-Object { if ($_ -match "^([^#][^=]*?)=(.*)$") { [Environment]::SetEnvironmentVariable($matches[1], $matches[2], "Process") } } Print-Success "Environment variables loaded" } else { Print-Error ".env.local file not found" exit 1 } # Check if conda is available try { conda --version | Out-Null Print-Success "Conda is available" } catch { Print-Error "Conda is not available. Please install Anaconda or Miniconda first." exit 1 } # Create or update conda environment Print-Step "Creating/updating conda environment 'cua' with Python 3.12..." try { # Check if environment exists $envExists = conda env list | Select-String "^cua\s" if ($envExists) { Print-Step "Environment 'cua' already exists. Updating..." conda env update -n cua -f environment.yml --prune } else { Print-Step "Creating new environment 'cua'..." conda create -n cua python=3.12 -y } Print-Success "Conda environment 'cua' ready" } catch { Print-Error "Failed to create/update conda environment" exit 1 } # Activate conda environment Print-Step "Activating conda environment 'cua'..." 
try {
    conda activate cua
    Print-Success "Environment activated"
} catch {
    Print-Error "Failed to activate conda environment 'cua'"
    Print-Step "Please run: conda activate cua"
    Print-Step "Then re-run this script"
    exit 1
}

# Clean up existing environments and cache
Print-Step "Cleaning up existing environments..."
Get-ChildItem -Path . -Recurse -Directory -Name "__pycache__" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force }
Get-ChildItem -Path . -Recurse -Directory -Name ".pytest_cache" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force }
Get-ChildItem -Path . -Recurse -Directory -Name "dist" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force }
Get-ChildItem -Path . -Recurse -Directory -Name "*.egg-info" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force }

# Function to install a package and its dependencies.
#   PackageDir  - directory (relative to the project root) containing pyproject.toml
#   PackageName - display name used in progress messages
#   Extras      - optional extras to install, e.g. "all"
# Returns $true on success; $false when pyproject.toml is missing or pip fails.
function Install-Package {
    param(
        [string]$PackageDir,
        [string]$PackageName,
        [string]$Extras = ""
    )

    Print-Step "Installing $PackageName..."
    Set-Location $PackageDir

    try {
        if (-not (Test-Path "pyproject.toml")) {
            Print-Error "No pyproject.toml found in $PackageDir"
            return $false
        }

        $target = if ($Extras) { ".[$Extras]" } else { "." }

        # Send pip's output to the host so it does not become part of this
        # function's return value; otherwise callers testing the result with
        # -not would see pip's text mixed in with the boolean.
        pip install -e $target | Out-Host

        # $ErrorActionPreference does not apply to native commands, so a pip
        # failure has to be detected through its exit code.
        if ($LASTEXITCODE -ne 0) {
            Print-Error "pip install failed for $PackageName (exit code $LASTEXITCODE)"
            return $false
        }

        return $true
    }
    finally {
        # Always return to the project root, even if something above throws.
        Set-Location $PROJECT_ROOT
    }
}

# Install packages in order of dependency
Print-Step "Installing packages in development mode..."
# Install core first (base package with telemetry support) if (-not (Install-Package "libs/python/core" "core")) { exit 1 } # Install pylume (base dependency) if (-not (Install-Package "libs/python/pylume" "pylume")) { exit 1 } # Install computer with all its dependencies and extras if (-not (Install-Package "libs/python/computer" "computer" "all")) { exit 1 } # Install omniparser if (-not (Install-Package "libs/python/som" "som")) { exit 1 } # Install agent with all its dependencies and extras if (-not (Install-Package "libs/python/agent" "agent" "all")) { exit 1 } # Install computer-server if (-not (Install-Package "libs/python/computer-server" "computer-server")) { exit 1 } # Install mcp-server if (-not (Install-Package "libs/python/mcp-server" "mcp-server")) { exit 1 } # Install development tools from root project Print-Step "Installing development dependencies..." pip install -e ".[dev,test,docs]" # Create a .env file for VS Code to use the virtual environment Print-Step "Creating .env file for VS Code..." $pythonPath = "$PROJECT_ROOT/libs/python/core;$PROJECT_ROOT/libs/python/computer;$PROJECT_ROOT/libs/python/agent;$PROJECT_ROOT/libs/python/som;$PROJECT_ROOT/libs/python/pylume;$PROJECT_ROOT/libs/python/computer-server;$PROJECT_ROOT/libs/python/mcp-server" "PYTHONPATH=$pythonPath" | Out-File -FilePath ".env" -Encoding UTF8 Print-Success "All packages installed successfully!" Print-Step "Your conda environment 'cua' is ready. 
To activate it:" Write-Host " conda activate cua" -ForegroundColor Yellow ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/integrations/hud.mdx: -------------------------------------------------------------------------------- ```markdown --- title: HUD Evals description: Use ComputerAgent with HUD for benchmarking and evaluation --- <Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.</Callout> The HUD integration allows an agent to be benchmarked using the [HUD framework](https://www.hud.so/). Through the HUD integration, the agent controls a computer inside HUD, where tests are run to evaluate the success of each task. ## Installation First, install the required package: ```bash pip install "cua-agent[hud]" ## or install hud-python directly # pip install hud-python==0.4.12 ``` ## Environment Variables Before running any evaluations, you’ll need to set up your environment variables for HUD and your model providers: ```bash # HUD access export HUD_API_KEY="your_hud_api_key" # Model provider keys (at least one required) export OPENAI_API_KEY="your_openai_key" export ANTHROPIC_API_KEY="your_anthropic_key" ``` ## Running a Single Task You can run a single task from a HUD dataset for quick verification. ### Example ```python from agent.integrations.hud import run_single_task await run_single_task( dataset="hud-evals/OSWorld-Verified", # or another HUD dataset model="openai/computer-use-preview+openai/gpt-5-nano", # any supported model string task_id=155, # e.g., reopen last closed tab ) ``` ### Parameters - `task_id` (`int`): Default: `0` Index of the task to run from the dataset. ## Running a Full Dataset To benchmark your agent at scale, you can run an entire dataset (or a subset) in parallel. 
### Example ```python from agent.integrations.hud import run_full_dataset results = await run_full_dataset( dataset="hud-evals/OSWorld-Verified", # can also pass a Dataset or list[dict] model="openai/computer-use-preview", split="train[:3]", # try a few tasks to start max_concurrent=20, # tune to your infra max_steps=50 # safety cap per task ) ``` ### Parameters - `job_name` (`str` | `None`): Optional human-readable name for the evaluation job (shows up in HUD UI). - `max_concurrent` (`int`): Default: `30` Number of tasks to run in parallel. Scale this based on your infra. - `max_steps` (`int`): Default: `50` Safety cap on steps per task to prevent infinite loops. - `split` (`str`): Default: `"train"` Dataset split or subset to run. Uses the [Hugging Face split format](https://huggingface.co/docs/datasets/v1.11.0/splits.html), e.g., `"train[:10]"` for the first 10 tasks. ## Additional Parameters Both single-task and full-dataset runs share a common set of configuration options. These let you fine-tune how the evaluation runs. - `dataset` (`str` | `Dataset` | `list[dict]`): **Required** HUD dataset name (e.g. `"hud-evals/OSWorld-Verified"`), a loaded `Dataset`, or a list of tasks. - `model` (`str`): Default: `"computer-use-preview"` Model string, e.g. `"openai/computer-use-preview+openai/gpt-5-nano"`. Supports composition with `+` (planning + grounding). - `allowed_tools` (`list[str]`): Default: `["openai_computer"]` Restrict which tools the agent may use. - `tools` (`list[Any]`): Extra tool configs to inject. - `custom_loop` (`Callable`): Optional custom agent loop function. If provided, overrides automatic loop selection. - `only_n_most_recent_images` (`int`): Default: `5` for full dataset, `None` for single task. Retain only the last N screenshots in memory. - `callbacks` (`list[Any]`): Hook functions for logging, telemetry, or side effects. - `verbosity` (`int`): Logging level. Set `2` for debugging every call/action. 
- `trajectory_dir` (`str` | `dict`): Save local copies of trajectories for replay/analysis. - `max_retries` (`int`): Default: `3` Number of retries for failed model/tool calls. - `screenshot_delay` (`float` | `int`): Default: `0.5` Delay (seconds) between screenshots to avoid race conditions. - `use_prompt_caching` (`bool`): Default: `False` Cache repeated prompts to reduce API calls. - `max_trajectory_budget` (`float` | `dict`): Limit on trajectory size/budget (e.g., tokens, steps). - `telemetry_enabled` (`bool`): Default: `True` Whether to send telemetry/traces to HUD. - `**kwargs` (`any`): Any additional keyword arguments are passed through to the agent loop or model provider. ## Available Benchmarks HUD provides multiple benchmark datasets for realistic evaluation. 1. **[OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified)** – Benchmark on 369+ real-world desktop tasks across Chrome, LibreOffice, GIMP, VS Code, etc. *Best for*: evaluating full computer-use agents in realistic environments. *Verified variant*: fixes 300+ issues from earlier versions for reliability. **Coming soon:** SheetBench (spreadsheet automation) and other specialized HUD datasets. See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments. ## Tips * **Debugging:** set `verbosity=2` to see every model call and tool action. * **Performance:** lower `screenshot_delay` for faster runs; raise it if you see race conditions. * **Safety:** always set `max_steps` (defaults to 50) to prevent runaway loops. * **Custom tools:** pass extra `tools=[...]` into the agent config if you need tools beyond `openai_computer`. ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/message-format.mdx: -------------------------------------------------------------------------------- ```markdown --- title: Message Format --- This page documents the Python message and response schema used by the Agent SDK. 
It mirrors the structure shown in Chat History and provides precise type definitions you can target in your own code. All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module. ## Response The agent yields response chunks as an async generator of objects with `output` and `usage`. ```python from typing import List, TypedDict class Usage(TypedDict, total=False): prompt_tokens: int completion_tokens: int total_tokens: int response_cost: float # USD cost if available class AgentResponse(TypedDict): output: List["AgentMessage"] usage: Usage ``` ## Messages Agent messages represent the state of the conversation and the agent's actions. ```python from typing import List, Literal, Optional, TypedDict, Union # Union of all message variants AgentMessage = Union[ "UserMessage", "AssistantMessage", "ReasoningMessage", "ComputerCallMessage", "ComputerCallOutputMessage", "FunctionCallMessage", "FunctionCallOutputMessage", ] # Input message (role: user/system/developer) class UserMessage(TypedDict, total=False): type: Literal["message"] # optional for user input role: Literal["user", "system", "developer"] content: Union[str, List["InputContent"]] # Output message (assistant text) class AssistantMessage(TypedDict): type: Literal["message"] role: Literal["assistant"] content: List["OutputContent"] # Output reasoning/thinking message class ReasoningMessage(TypedDict): type: Literal["reasoning"] summary: List["SummaryContent"] # Output computer action call (agent intends to act) class ComputerCallMessage(TypedDict): type: Literal["computer_call"] call_id: str status: Literal["completed", "failed", "pending"] action: "ComputerAction" # Output computer action result (always a screenshot) class ComputerCallOutputMessage(TypedDict): type: Literal["computer_call_output"] call_id: str output: "ComputerResultContent" # Output function call (agent calls a Python tool) class FunctionCallMessage(TypedDict): type: Literal["function_call"] 
call_id: str status: Literal["completed", "failed", "pending"] name: str arguments: str # JSON-serialized kwargs # Output function call result (text) class FunctionCallOutputMessage(TypedDict): type: Literal["function_call_output"] call_id: str output: str ``` ## Message Content These content items appear inside `content` arrays for the message types above. ```python # Input content kinds class InputContent(TypedDict): type: Literal["input_image", "input_text"] text: Optional[str] image_url: Optional[str] # e.g., data URL # Assistant output content class OutputContent(TypedDict): type: Literal["output_text"] text: str # Reasoning/summary output content class SummaryContent(TypedDict): type: Literal["summary_text"] text: str # Computer call outputs (screenshots) class ComputerResultContent(TypedDict): type: Literal["computer_screenshot", "input_image"] image_url: str # data URL (e.g., "data:image/png;base64,....") ``` ## Actions Computer actions represent concrete operations the agent will perform on the computer. Two broad families exist depending on the provider: OpenAI-style and Anthropic-style. ```python # Union of all supported computer actions ComputerAction = Union[ "ClickAction", "DoubleClickAction", "DragAction", "KeyPressAction", "MoveAction", "ScreenshotAction", "ScrollAction", "TypeAction", "WaitAction", # Anthropic variants "LeftMouseDownAction", "LeftMouseUpAction", ] # OpenAI Computer Actions class ClickAction(TypedDict): type: Literal["click"] button: Literal["left", "right", "wheel", "back", "forward"] x: int y: int class DoubleClickAction(TypedDict, total=False): type: Literal["double_click"] button: Literal["left", "right", "wheel", "back", "forward"] x: int y: int class DragAction(TypedDict, total=False): type: Literal["drag"] button: Literal["left", "right", "wheel", "back", "forward"] path: List[tuple[int, int]] # [(x1, y1), (x2, y2), ...] 
class KeyPressAction(TypedDict): type: Literal["keypress"] keys: List[str] # e.g., ["ctrl", "a"] class MoveAction(TypedDict): type: Literal["move"] x: int y: int class ScreenshotAction(TypedDict): type: Literal["screenshot"] class ScrollAction(TypedDict): type: Literal["scroll"] scroll_x: int scroll_y: int x: int y: int class TypeAction(TypedDict): type: Literal["type"] text: str class WaitAction(TypedDict): type: Literal["wait"] # Anthropic Computer Actions class LeftMouseDownAction(TypedDict): type: Literal["left_mouse_down"] x: int y: int class LeftMouseUpAction(TypedDict): type: Literal["left_mouse_up"] x: int y: int ``` ## Notes - The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility. - Computer action outputs are screenshots as data URLs. For security and storage, some serializers may redact or omit large fields in persisted metadata. - The message flow typically alternates between reasoning, actions, screenshots, and concluding assistant text. See [Chat History](./chat-history) for a step-by-step example. ``` -------------------------------------------------------------------------------- /libs/typescript/agent/src/client.ts: -------------------------------------------------------------------------------- ```typescript
import {Peer} from "peerjs";
import type {
  AgentRequest,
  AgentResponse,
  ConnectionType,
  AgentClientOptions,
} from "./types";

/**
 * Client for an agent endpoint reachable over HTTP(S) or a PeerJS data
 * connection. The transport is chosen from the URL scheme passed to the
 * constructor (`http://`, `https://` or `peer://`).
 */
export class AgentClient {
  private url: string;
  private connectionType: ConnectionType;
  private options: AgentClientOptions;
  private peer?: Peer;
  private connection?: any;

  /**
   * @param url     Endpoint URL; must start with http://, https:// or peer://.
   * @param options Overrides for the defaults (`timeout: 30000`, `retries: 3`).
   * @throws Error when the URL scheme is not one of the supported ones.
   */
  constructor(url: string, options: AgentClientOptions = {}) {
    this.url = url;
    this.options = {
      timeout: 30000,
      retries: 3,
      ...options,
    };

    // Determine connection type from URL
    if (url.startsWith("http://") || url.startsWith("https://")) {
      this.connectionType = url.startsWith("https://") ? "https" : "http";
    } else if (url.startsWith("peer://")) {
      this.connectionType = "peer";
    } else {
      throw new Error(
        "Invalid URL format. Must start with http://, https://, or peer://"
      );
    }
  }

  // Main responses API matching the desired usage pattern
  public responses = {
    create: async (request: AgentRequest): Promise<AgentResponse> => {
      return this.sendRequest(request);
    },
  };

  /** Dispatch the request to the transport selected in the constructor. */
  private async sendRequest(request: AgentRequest): Promise<AgentResponse> {
    switch (this.connectionType) {
      case "http":
      case "https":
        return this.sendHttpRequest(request);
      case "peer":
        return this.sendPeerRequest(request);
      default:
        throw new Error(`Unsupported connection type: ${this.connectionType}`);
    }
  }

  /**
   * POST the request as JSON to `<url>/responses`, aborting the fetch after
   * `options.timeout` milliseconds. Sends `X-API-Key` when an API key is set.
   * @throws Error on a non-2xx status or any fetch failure.
   */
  private async sendHttpRequest(request: AgentRequest): Promise<AgentResponse> {
    const controller = new AbortController();
    const timeoutId = setTimeout(
      () => controller.abort(),
      this.options.timeout
    );

    try {
      const headers: Record<string, string> = {
        "Content-Type": "application/json",
      };
      if (this.options.apiKey) {
        headers["X-API-Key"] = this.options.apiKey;
      }

      const response = await fetch(`${this.url}/responses`, {
        method: "POST",
        headers,
        body: JSON.stringify(request),
        signal: controller.signal,
      });

      clearTimeout(timeoutId);

      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }

      const data = await response.json();
      return data as AgentResponse;
    } catch (error) {
      clearTimeout(timeoutId);
      if (error instanceof Error) {
        throw new Error(`Failed to send HTTP request: ${error.message}`);
      }
      throw error;
    }
  }

  /**
   * Send the request over a PeerJS data connection and resolve with the first
   * message received in reply. The first call creates the peer and the
   * connection; later calls reuse the open connection.
   * @throws Error on timeout, peer/connection error, an unparsable reply, or
   *         when a reused connection is not open.
   */
  private async sendPeerRequest(request: AgentRequest): Promise<AgentResponse> {
    // Extract peer ID from peer:// URL
    const peerId = this.url.replace("peer://", "");

    if (!this.peer) {
      // Initialize peer connection with default options as requested
      this.peer = new Peer();

      return new Promise<AgentResponse>((resolve, reject) => {
        // True once this first request has been resolved or rejected.
        let settled = false;

        // A failure while the link is being established tears everything
        // down, so the next call starts from scratch. Leaving `this.peer` set
        // without a usable connection would make every later call reject
        // with "Peer connection not available".
        const fail = (error: Error) => {
          if (settled) {
            return;
          }
          settled = true;
          clearTimeout(timeout);
          void this.disconnect();
          reject(error);
        };

        const timeout = setTimeout(() => {
          fail(new Error("Peer connection timeout"));
        }, this.options.timeout);

        this.peer!.on("open", () => {
          // Connect to the target peer
          const connection = this.peer!.connect(peerId);
          this.connection = connection;

          connection.on("open", () => {
            // Send the request
            connection.send(JSON.stringify(request));
          });

          const handleData = (data: any) => {
            if (settled) {
              return;
            }
            settled = true;
            clearTimeout(timeout);
            // One-shot: detach so replies to later requests on the reused
            // connection are not also delivered to this finished handler.
            connection.off("data", handleData);
            try {
              const response =
                typeof data === "string" ? JSON.parse(data) : data;
              resolve(response as AgentResponse);
            } catch (error) {
              reject(new Error("Failed to parse peer response"));
            }
          };
          connection.on("data", handleData);

          connection.on("error", (error: any) => {
            fail(new Error(`Peer connection error: ${error}`));
          });
        });

        this.peer!.on("error", (error: any) => {
          fail(new Error(`Peer error: ${error}`));
        });
      });
    } else {
      // Reuse existing connection
      return new Promise<AgentResponse>((resolve, reject) => {
        const connection = this.connection;
        if (!connection || !connection.open) {
          reject(new Error("Peer connection not available"));
          return;
        }

        const handleData = (data: any) => {
          clearTimeout(timeout);
          connection.off("data", handleData);
          try {
            const response =
              typeof data === "string" ? JSON.parse(data) : data;
            resolve(response as AgentResponse);
          } catch (error) {
            reject(new Error("Failed to parse peer response"));
          }
        };

        const timeout = setTimeout(() => {
          // Detach the listener so a late reply is not consumed by a request
          // that has already been rejected.
          connection.off("data", handleData);
          reject(new Error("Peer request timeout"));
        }, this.options.timeout);

        // Listen before sending so the reply cannot be missed.
        connection.on("data", handleData);
        connection.send(JSON.stringify(request));
      });
    }
  }

  // Health check method
  /**
   * Report transport health: for peer URLs whether the local peer is open,
   * otherwise the outcome of `GET <url>/health`.
   */
  async health(): Promise<{ status: string }> {
    if (this.connectionType === "peer") {
      return { status: this.peer?.open ? "connected" : "disconnected" };
    }

    try {
      const response = await fetch(`${this.url}/health`);
      if (response.ok) {
        return { status: "healthy" };
      }
      return { status: "unhealthy" };
    } catch {
      return { status: "unreachable" };
    }
  }

  // Clean up resources
  /** Close the data connection and destroy the peer, if they exist. */
  async disconnect(): Promise<void> {
    if (this.connection) {
      this.connection.close();
      this.connection = undefined;
    }
    if (this.peer) {
      this.peer.destroy();
      this.peer = undefined;
    }
  }
}
``` -------------------------------------------------------------------------------- /scripts/build-uv.sh: -------------------------------------------------------------------------------- ```bash #!/bin/bash # Exit on error set -e # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' BLUE='\033[0;34m' YELLOW='\033[1;33m' NC='\033[0m' # No Color # Function to print step information print_step() { echo -e "${BLUE}==> $1${NC}" } # Function to print success message print_success() { echo -e "${GREEN}==> Success: $1${NC}" } # Function to print error message print_error() { echo -e "${RED}==> Error: $1${NC}" >&2 } # Function to print warning message print_warning() { echo -e "${YELLOW}==> Warning: $1${NC}" } # Function to check if UV is installed check_uv() { if command -v uv &> /dev/null; then print_success "UV is already installed" uv --version return 0 else return 1 fi } # Function to install UV install_uv() { print_step "UV not found. Installing UV..." # Detect OS if [[ "$OSTYPE" == "linux-gnu"* ]] || [[ "$OSTYPE" == "darwin"* ]]; then print_step "Installing UV for Unix-like system..." 
curl -LsSf https://astral.sh/uv/install.sh | sh # Add UV to PATH for current session export PATH="$HOME/.cargo/bin:$PATH" # Check if installation was successful if command -v uv &> /dev/null; then print_success "UV installed successfully" uv --version else print_error "UV installation failed" print_step "Please restart your terminal and try again, or install manually:" echo " curl -LsSf https://astral.sh/uv/install.sh | sh" exit 1 fi elif [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]]; then print_error "For Windows, please use PowerShell and run:" echo " powershell -ExecutionPolicy ByPass -c \"irm https://astral.sh/uv/install.ps1 | iex\"" exit 1 else print_error "Unsupported operating system: $OSTYPE" print_step "Please install UV manually from: https://docs.astral.sh/uv/getting-started/installation/" exit 1 fi } # Get the script's directory SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" PROJECT_ROOT="$( cd "${SCRIPT_DIR}/.." && pwd )" # Change to project root cd "$PROJECT_ROOT" # Check if UV is installed, install if not if ! check_uv; then install_uv fi # Load environment variables from .env.local if [ -f .env.local ]; then print_step "Loading environment variables from .env.local..." set -a source .env.local set +a print_success "Environment variables loaded" else print_error ".env.local file not found" exit 1 fi # Clean up existing environments and cache print_step "Cleaning up existing environments..." find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true find . -type d -name ".pytest_cache" -exec rm -rf {} + 2>/dev/null || true find . -type d -name "dist" -exec rm -rf {} + 2>/dev/null || true find . -type d -name ".venv" -exec rm -rf {} + 2>/dev/null || true find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true print_success "Environment cleanup complete" # Install Python 3.12 using UV print_step "Installing Python 3.12 using UV..." 
uv python install 3.12 print_success "Python 3.12 installed" # Create virtual environment using UV print_step "Creating virtual environment with UV..." uv venv .venv --python 3.12 print_success "Virtual environment created" # Activate virtual environment print_step "Activating virtual environment..." source .venv/bin/activate print_success "Virtual environment activated" # Function to install a package and its dependencies using UV install_package() { local package_dir=$1 local package_name=$2 local extras=$3 print_step "Installing ${package_name} with UV..." cd "$package_dir" if [ -f "pyproject.toml" ]; then if [ -n "$extras" ]; then uv pip install -e ".[${extras}]" else uv pip install -e . fi else print_error "No pyproject.toml found in ${package_dir}" return 1 fi cd "$PROJECT_ROOT" } # Install packages in order of dependency print_step "Installing packages in development mode with UV..." # Install core first (base package with telemetry support) install_package "libs/python/core" "core" # Install pylume (base dependency) install_package "libs/python/pylume" "pylume" # Install computer with all its dependencies and extras install_package "libs/python/computer" "computer" "all" # Install omniparser install_package "libs/python/som" "som" # Install agent with all its dependencies and extras install_package "libs/python/agent" "agent" "all" # Install computer-server install_package "libs/python/computer-server" "computer-server" # Install mcp-server install_package "libs/python/mcp-server" "mcp-server" # Install development tools from root project print_step "Installing development dependencies with UV..." uv pip install -e ".[dev,test,docs]" # Create a .env file for VS Code to use the virtual environment print_step "Creating .env file for VS Code..." 
echo "PYTHONPATH=${PROJECT_ROOT}/libs/python/core:${PROJECT_ROOT}/libs/python/computer:${PROJECT_ROOT}/libs/python/agent:${PROJECT_ROOT}/libs/python/som:${PROJECT_ROOT}/libs/python/pylume:${PROJECT_ROOT}/libs/python/computer-server:${PROJECT_ROOT}/libs/python/mcp-server" > .env print_success "All packages installed successfully with UV!" print_step "Your virtual environment is ready. To activate it:" echo " source .venv/bin/activate" print_step "UV provides fast dependency resolution and installation." print_step "You can also use 'uv run' to run commands in the virtual environment without activation." ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/winsandbox/setup_script.ps1: -------------------------------------------------------------------------------- ``` # Setup script for Windows Sandbox CUA Computer provider # This script runs when the sandbox starts Write-Host "Starting CUA Computer setup in Windows Sandbox..." # Function to find the mapped Python installation from pywinsandbox function Find-MappedPython { Write-Host "Looking for mapped Python installation from pywinsandbox..." 
# pywinsandbox maps the host Python installation to the sandbox # Look for mapped shared folders on the desktop (common pywinsandbox pattern) $desktopPath = "C:\Users\WDAGUtilityAccount\Desktop" $sharedFolders = Get-ChildItem -Path $desktopPath -Directory -ErrorAction SilentlyContinue foreach ($folder in $sharedFolders) { # Look for Python executables in shared folders $pythonPaths = @( "$($folder.FullName)\python.exe", "$($folder.FullName)\Scripts\python.exe", "$($folder.FullName)\bin\python.exe" ) foreach ($pythonPath in $pythonPaths) { if (Test-Path $pythonPath) { try { $version = & $pythonPath --version 2>&1 if ($version -match "Python") { Write-Host "Found mapped Python: $pythonPath - $version" return $pythonPath } } catch { continue } } } # Also check subdirectories that might contain Python $subDirs = Get-ChildItem -Path $folder.FullName -Directory -ErrorAction SilentlyContinue foreach ($subDir in $subDirs) { $pythonPath = "$($subDir.FullName)\python.exe" if (Test-Path $pythonPath) { try { $version = & $pythonPath --version 2>&1 if ($version -match "Python") { Write-Host "Found mapped Python in subdirectory: $pythonPath - $version" return $pythonPath } } catch { continue } } } } # Fallback: try common Python commands that might be available $pythonCommands = @("python", "py", "python3") foreach ($cmd in $pythonCommands) { try { $version = & $cmd --version 2>&1 if ($version -match "Python") { Write-Host "Found Python via command '$cmd': $version" return $cmd } } catch { continue } } throw "Could not find any Python installation (mapped or otherwise)" } try { # Step 1: Find the mapped Python installation Write-Host "Step 1: Finding mapped Python installation..." 
$pythonExe = Find-MappedPython Write-Host "Using Python: $pythonExe" # Verify Python works and show version $pythonVersion = & $pythonExe --version 2>&1 Write-Host "Python version: $pythonVersion" # Step 2: Create a dedicated virtual environment in mapped Desktop folder (persistent) Write-Host "Step 2: Creating virtual environment (if needed)..." $cachePath = "C:\Users\WDAGUtilityAccount\Desktop\wsb_cache" $venvPath = "C:\Users\WDAGUtilityAccount\Desktop\wsb_cache\venv" if (!(Test-Path $venvPath)) { Write-Host "Creating venv at: $venvPath" & $pythonExe -m venv $venvPath } else { Write-Host "Venv already exists at: $venvPath" } # Hide the folder to keep Desktop clean try { $item = Get-Item $cachePath -ErrorAction SilentlyContinue if ($item) { if (-not ($item.Attributes -band [IO.FileAttributes]::Hidden)) { $item.Attributes = $item.Attributes -bor [IO.FileAttributes]::Hidden } } } catch { } $venvPython = Join-Path $venvPath "Scripts\python.exe" if (!(Test-Path $venvPython)) { throw "Virtual environment Python not found at $venvPython" } Write-Host "Using venv Python: $venvPython" # Step 3: Install cua-computer-server into the venv Write-Host "Step 3: Installing cua-computer-server..." Write-Host "Upgrading pip..." & $venvPython -m pip install --upgrade pip --quiet Write-Host "Installing cua-computer-server..." & $venvPython -m pip install cua-computer-server Write-Host "cua-computer-server installation completed." # Step 4: Start computer server in background using the venv Python Write-Host "Step 4: Starting computer server in background..." 
Write-Host "Starting computer server with: $venvPython" # Start the computer server in the background $serverProcess = Start-Process -FilePath $venvPython -ArgumentList "-m", "computer_server.main" -WindowStyle Hidden -PassThru Write-Host "Computer server started in background with PID: $($serverProcess.Id)" # Give it a moment to start Start-Sleep -Seconds 3 # Check if the process is still running if (Get-Process -Id $serverProcess.Id -ErrorAction SilentlyContinue) { Write-Host "Computer server is running successfully in background" } else { throw "Computer server failed to start or exited immediately" } } catch { Write-Error "Setup failed: $_" Write-Host "Error details: $($_.Exception.Message)" Write-Host "Stack trace: $($_.ScriptStackTrace)" Write-Host "" Write-Host "Press any key to close this window..." $null = $Host.UI.RawUI.ReadKey("NoEcho,IncludeKeyDown") exit 1 } Write-Host "" Write-Host "Setup completed successfully!" Write-Host "Press any key to close this window..." $null = $Host.UI.RawUI.ReadKey("NoEcho,IncludeKeyDown") ``` -------------------------------------------------------------------------------- /libs/python/som/som/ocr.py: -------------------------------------------------------------------------------- ```python
from typing import List, Dict, Any, Tuple, Union
import logging
import signal
from contextlib import contextmanager
from pathlib import Path
import easyocr
from PIL import Image
import numpy as np
import torch

logger = logging.getLogger(__name__)


class TimeoutException(Exception):
    """Raised inside a ``timeout`` block when the time budget is exceeded."""

    pass


@contextmanager
def timeout(seconds: int):
    """Abort the wrapped block with ``TimeoutException`` after ``seconds``.

    The limit is enforced with ``SIGALRM``, which can only be installed from
    the main thread and only exists on Unix. In any other situation the block
    runs without a time limit and a warning is logged.

    Args:
        seconds: Whole seconds to allow before the alarm fires.

    Raises:
        TimeoutException: From inside the wrapped block when the alarm fires.
    """
    import threading

    in_main_thread = threading.current_thread() is threading.main_thread()
    # Without this check, touching signal.SIGALRM on a platform that lacks it
    # raises AttributeError before the wrapped block even starts.
    has_sigalrm = hasattr(signal, "SIGALRM")

    if in_main_thread and has_sigalrm:

        def timeout_handler(signum, frame):
            raise TimeoutException("OCR process timed out")

        original_handler = signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(seconds)
        try:
            yield
        finally:
            # Cancel any pending alarm, then restore the previous handler.
            signal.alarm(0)
            signal.signal(signal.SIGALRM, original_handler)
    else:
        if in_main_thread:
            logger.warning(
                "SIGALRM is not available on this platform; signal-based timeout disabled"
            )
        else:
            # In a non-main thread, we can't use signal
            logger.warning(
                "Timeout function called from non-main thread; signal-based timeout disabled"
            )
        yield


class OCRProcessor:
    """Class for handling OCR text detection."""

    _shared_reader = None  # Class-level shared reader instance

    def __init__(self):
        """Initialize the OCR processor and pick the best available device."""
        self.reader = None
        # Determine best available device
        self.device = "cpu"
        if torch.cuda.is_available():
            self.device = "cuda"
        elif (
            hasattr(torch, "backends")
            and hasattr(torch.backends, "mps")
            and torch.backends.mps.is_available()
        ):
            self.device = "mps"
        logger.info(f"OCR processor initialized with device: {self.device}")

    def _ensure_reader(self):
        """Ensure EasyOCR reader is initialized.

        Uses a class-level cached reader to avoid reinitializing on every
        instance.

        Raises:
            RuntimeError: If the EasyOCR reader cannot be created.
        """
        # First check if we already have a class-level reader
        if OCRProcessor._shared_reader is not None:
            self.reader = OCRProcessor._shared_reader
            return

        # Otherwise initialize a new one
        if self.reader is None:
            try:
                logger.info("Initializing EasyOCR reader...")
                import easyocr

                # Use GPU if available
                use_gpu = self.device in ["cuda", "mps"]
                self.reader = easyocr.Reader(["en"], gpu=use_gpu)

                # Verify reader initialization
                if self.reader is None:
                    raise ValueError("Failed to initialize EasyOCR reader")

                # Cache the reader at class level
                OCRProcessor._shared_reader = self.reader
                logger.info(f"EasyOCR reader initialized successfully with GPU={use_gpu}")
            except Exception as e:
                logger.error(f"Failed to initialize EasyOCR reader: {str(e)}")
                # Set to a placeholder that will be checked
                self.reader = None
                raise RuntimeError(f"EasyOCR initialization failed: {str(e)}") from e

    def detect_text(
        self, image: Image.Image, confidence_threshold: float = 0.5, timeout_seconds: int = 5
    ) -> List[Dict[str, Any]]:
        """Detect text in an image using EasyOCR.

        Never raises: any failure (reader initialization, OCR error, timeout)
        is logged and reported as an empty list.

        Args:
            image: PIL Image to process
            confidence_threshold: Minimum confidence for text detection
            timeout_seconds: Maximum time to wait for OCR

        Returns:
            List of text detection dictionaries with keys ``type``, ``bbox``
            (``[x1, y1, x2, y2]`` divided by the image width/height),
            ``content``, ``confidence`` (a plain ``float``) and
            ``interactivity``.
        """
        try:
            # Try to initialize reader, catch any exceptions
            try:
                self._ensure_reader()
            except Exception as e:
                logger.error(f"Failed to initialize OCR reader: {str(e)}")
                return []

            # Ensure reader was properly initialized
            if self.reader is None:
                logger.error("OCR reader is None after initialization")
                return []

            # Convert PIL Image to numpy array
            image_np = np.array(image)

            try:
                with timeout(timeout_seconds):
                    results = self.reader.readtext(
                        image_np, paragraph=False, text_threshold=confidence_threshold
                    )
            except TimeoutException:
                logger.warning("OCR timed out")
                return []
            except Exception as e:
                logger.warning(f"OCR failed: {str(e)}")
                return []

            detections = []
            img_width, img_height = image.size

            for box, text, conf in results:
                # Ensure conf is float
                conf_float = float(conf)
                if conf_float < confidence_threshold:
                    continue

                # Convert box format to [x1, y1, x2, y2]
                # Ensure box points are properly typed as float
                x1 = min(float(point[0]) for point in box) / img_width
                y1 = min(float(point[1]) for point in box) / img_height
                x2 = max(float(point[0]) for point in box) / img_width
                y2 = max(float(point[1]) for point in box) / img_height

                detections.append(
                    {
                        "type": "text",
                        "bbox": [x1, y1, x2, y2],
                        "content": text,
                        # Store the converted value, not the raw score object
                        # returned by the reader.
                        "confidence": conf_float,
                        "interactivity": False,  # Text is typically non-interactive
                    }
                )

            return detections
        except Exception as e:
            logger.error(f"Unexpected error in OCR processing: {str(e)}")
            return []
``` -------------------------------------------------------------------------------- /.github/workflows/pypi-publish-mcp-server.yml: -------------------------------------------------------------------------------- ```yaml name: Publish MCP Server Package on: push: tags: - "mcp-server-v*" workflow_dispatch: inputs: version: description: "Version to publish (without v prefix)" required: true default: 
"0.1.0"
  workflow_call:
    inputs:
      version:
        description: "Version to publish"
        required: true
        type: string
    outputs:
      version:
        description: "The version that was published"
        value: ${{ jobs.prepare.outputs.version }}

# Adding permissions at workflow level
permissions:
  contents: write

jobs:
  # Resolves the version to publish and rewrites the cua-agent / cua-computer
  # constraints in libs/python/mcp-server/pyproject.toml to the latest PyPI
  # releases. Exposes version, agent_version and computer_version as outputs.
  prepare:
    runs-on: macos-latest
    outputs:
      version: ${{ steps.get-version.outputs.version }}
      agent_version: ${{ steps.update-deps.outputs.agent_version }}
      computer_version: ${{ steps.update-deps.outputs.computer_version }}
    steps:
      - uses: actions/checkout@v4

      - name: Determine version
        id: get-version
        # Caller-supplied versions are handed to the script through the
        # environment rather than expanded into the script text, so they are
        # treated as data and can never be parsed as shell syntax.
        env:
          DISPATCH_VERSION: ${{ github.event.inputs.version }}
          CALL_VERSION: ${{ inputs.version }}
        run: |
          # GITHUB_EVENT_NAME and GITHUB_REF are default runner variables that
          # carry the same values as the github.event_name / github.ref contexts.
          if [ "$GITHUB_EVENT_NAME" == "push" ]; then
            # Extract version from tag (for package-specific tags)
            if [[ "$GITHUB_REF" =~ ^refs/tags/mcp-server-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
              VERSION=${BASH_REMATCH[1]}
            else
              echo "Invalid tag format for mcp-server"
              exit 1
            fi
          elif [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
            # Use version from workflow dispatch
            VERSION="$DISPATCH_VERSION"
          else
            # Use version from workflow_call
            VERSION="$CALL_VERSION"
          fi
          echo "VERSION=$VERSION"
          echo "version=$VERSION" >> "$GITHUB_OUTPUT"

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      - name: Update dependencies to latest versions
        id: update-deps
        # NOTE(review): the pyproject.toml edits below stay in this job's
        # workspace and no artifact is uploaded here; confirm that the publish
        # workflow applies or receives the same constraints.
        run: |
          cd libs/python/mcp-server

          # Install required package for PyPI API access
          pip install requests

          # Create a Python script for PyPI version checking
          cat > get_latest_versions.py << 'EOF'
          import requests
          import json
          import sys

          # Seconds to wait for PyPI. requests applies no timeout by default,
          # so without this a stalled connection would hang the whole job.
          REQUEST_TIMEOUT = 10

          def get_package_version(package_name, fallback="0.1.0"):
              """Return the latest version of package_name published on PyPI.

              Diagnostics are written to stderr because stdout is captured by
              the calling shell and must contain only the version strings.
              Any failure (network error, timeout, non-200 status, unexpected
              payload) yields fallback instead of raising.
              """
              try:
                  response = requests.get(
                      f'https://pypi.org/pypi/{package_name}/json',
                      timeout=REQUEST_TIMEOUT,
                  )
                  print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr)

                  if response.status_code != 200:
                      print(f"API request failed for {package_name}, using fallback version", file=sys.stderr)
                      return fallback

                  data = json.loads(response.text)

                  if 'info' not in data:
                      print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr)
                      return fallback

                  return data['info']['version']
              except Exception as e:
                  # Deliberately broad: this lookup is best-effort and must
                  # never abort the release on its own.
                  print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr)
                  return fallback

          # Get latest versions
          print(get_package_version('cua-agent'))
          print(get_package_version('cua-computer'))
          EOF

          # Execute the script to get the versions
          # (one version per stdout line: cua-agent first, cua-computer second)
          VERSIONS=($(python get_latest_versions.py))
          LATEST_AGENT=${VERSIONS[0]}
          LATEST_COMPUTER=${VERSIONS[1]}

          echo "Latest cua-agent version: $LATEST_AGENT"
          echo "Latest cua-computer version: $LATEST_COMPUTER"

          # Output the versions for the next job
          echo "agent_version=$LATEST_AGENT" >> "$GITHUB_OUTPUT"
          echo "computer_version=$LATEST_COMPUTER" >> "$GITHUB_OUTPUT"

          # Determine major version for version constraint
          # (the upper bound becomes the next major release, e.g. 0.4.2 -> <1.0.0)
          AGENT_MAJOR=$(echo "$LATEST_AGENT" | cut -d. -f1)
          COMPUTER_MAJOR=$(echo "$LATEST_COMPUTER" | cut -d. -f1)

          NEXT_AGENT_MAJOR=$((AGENT_MAJOR + 1))
          NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1))

          # Update dependencies in pyproject.toml
          if [[ "$OSTYPE" == "darwin"* ]]; then
            # macOS version of sed needs an empty string for -i
            # Update cua-agent with all extras
            sed -i '' "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml
            sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
          else
            # Linux version
            sed -i "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml
            sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
          fi

          # Display the updated dependencies
          echo "Updated dependencies in pyproject.toml:"
          grep -E "cua-agent|cua-computer" pyproject.toml

  # Delegates the build and upload to the shared reusable publish workflow.
  publish:
    needs: prepare
    uses: ./.github/workflows/pypi-reusable-publish.yml
    with:
      package_name: "mcp-server"
      package_dir: "libs/python/mcp-server"
      version: ${{ needs.prepare.outputs.version }}
      is_lume_package: false
      base_package_name: "cua-mcp-server"
    secrets:
      PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}

  # NOTE(review): values appended to GITHUB_ENV are only visible to later
  # steps of the same job. This job has no later steps, so despite the step
  # name nothing here reaches other jobs; consumers should read
  # needs.prepare.outputs.* instead. Kept as-is for backward compatibility.
  set-env-variables:
    needs: [prepare, publish]
    runs-on: macos-latest
    steps:
      - name: Set environment variables for use in other jobs
        run: |
          echo "AGENT_VERSION=${{ needs.prepare.outputs.agent_version }}" >> "$GITHUB_ENV"
          echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> "$GITHUB_ENV"
```