This is page 4 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ └── py.code-workspace ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── windows-sandbox.md ├── 
CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── 
installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── 
som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ │ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift 
│ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ ├── server.py │ │ 
│ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ ├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ 
└── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py │ │ │ └── visualization.py │ │ └── tests │ │ 
└── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh 
│ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /libs/typescript/computer/src/computer/providers/base.ts: -------------------------------------------------------------------------------- ```typescript 1 | import os from "node:os"; 2 | import { Telemetry } from "@trycua/core"; 3 | import pino from "pino"; 4 | import type { OSType } from "../../types"; 5 | import type { BaseComputerConfig, Display, VMProviderType } from "../types"; 6 | 7 | const logger = pino({ name: "computer.provider_base" }); 8 | 9 | /** 10 | * Base Computer class with shared functionality 11 | */ 12 | export abstract class BaseComputer { 13 | protected name: string; 14 | protected osType: OSType; 15 | protected vmProvider?: VMProviderType; 16 | protected telemetry: Telemetry; 17 | 18 | constructor(config: BaseComputerConfig) { 19 | this.name = config.name; 20 | this.osType = config.osType; 21 | this.telemetry = new 
Telemetry(); 22 | this.telemetry.recordEvent("module_init", { 23 | module: "computer", 24 | version: process.env.npm_package_version, 25 | node_version: process.version, 26 | }); 27 | 28 | this.telemetry.recordEvent("computer_initialized", { 29 | os: os.platform(), 30 | os_version: os.version(), 31 | node_version: process.version, 32 | }); 33 | } 34 | 35 | /** 36 | * Get the name of the computer 37 | */ 38 | getName(): string { 39 | return this.name; 40 | } 41 | 42 | /** 43 | * Get the OS type of the computer 44 | */ 45 | getOSType(): OSType { 46 | return this.osType; 47 | } 48 | 49 | /** 50 | * Get the VM provider type 51 | */ 52 | getVMProviderType(): VMProviderType | undefined { 53 | return this.vmProvider; 54 | } 55 | 56 | /** 57 | * Shared method available to all computer types 58 | */ 59 | async disconnect(): Promise<void> { 60 | logger.info(`Disconnecting from ${this.name}`); 61 | // Implementation would go here 62 | } 63 | 64 | /** 65 | * Parse display string into Display object 66 | * @param display Display string in format "WIDTHxHEIGHT" 67 | * @returns Display object 68 | */ 69 | public static parseDisplayString(display: string): Display { 70 | const match = display.match(/^(\d+)x(\d+)$/); 71 | if (!match) { 72 | throw new Error( 73 | `Invalid display format: ${display}. Expected format: WIDTHxHEIGHT`, 74 | ); 75 | } 76 | 77 | return { 78 | width: Number.parseInt(match[1], 10), 79 | height: Number.parseInt(match[2], 10), 80 | }; 81 | } 82 | 83 | /** 84 | * Parse memory string to MB integer. 
85 | * 86 | * Examples: 87 | * "8GB" -> 8192 88 | * "1024MB" -> 1024 89 | * "512" -> 512 90 | * 91 | * @param memoryStr - Memory string to parse 92 | * @returns Memory value in MB 93 | */ 94 | public static parseMemoryString(memoryStr: string): number { 95 | if (!memoryStr) { 96 | return 0; 97 | } 98 | 99 | // Convert to uppercase for case-insensitive matching 100 | const upperStr = memoryStr.toUpperCase().trim(); 101 | 102 | // Extract numeric value and unit 103 | const match = upperStr.match(/^(\d+(?:\.\d+)?)\s*(GB|MB)?$/); 104 | if (!match) { 105 | throw new Error(`Invalid memory format: ${memoryStr}`); 106 | } 107 | 108 | const value = Number.parseFloat(match[1]); 109 | const unit = match[2] || "MB"; // Default to MB if no unit specified 110 | 111 | // Convert to MB 112 | if (unit === "GB") { 113 | return Math.round(value * 1024); 114 | } 115 | return Math.round(value); 116 | } 117 | } 118 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/models/generic.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List, Dict, Any, Optional 2 | 3 | # Hugging Face imports are local to avoid hard dependency at module import 4 | try: 5 | import torch # type: ignore 6 | from transformers import AutoModel, AutoProcessor # type: ignore 7 | HF_AVAILABLE = True 8 | except Exception: 9 | HF_AVAILABLE = False 10 | 11 | 12 | class GenericHFModel: 13 | """Generic Hugging Face vision-language model handler. 14 | Loads an AutoModelForImageTextToText and AutoProcessor and generates text. 15 | """ 16 | 17 | def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: 18 | if not HF_AVAILABLE: 19 | raise ImportError( 20 | "HuggingFace transformers dependencies not found. 
Install with: pip install \"cua-agent[uitars-hf]\"" 21 | ) 22 | self.model_name = model_name 23 | self.device = device 24 | self.model = None 25 | self.processor = None 26 | self.trust_remote_code = trust_remote_code 27 | self._load() 28 | 29 | def _load(self) -> None: 30 | # Load model 31 | self.model = AutoModel.from_pretrained( 32 | self.model_name, 33 | torch_dtype=torch.float16, 34 | device_map=self.device, 35 | attn_implementation="sdpa", 36 | trust_remote_code=self.trust_remote_code, 37 | ) 38 | # Load processor 39 | self.processor = AutoProcessor.from_pretrained( 40 | self.model_name, 41 | min_pixels=3136, 42 | max_pixels=4096 * 2160, 43 | device_map=self.device, 44 | trust_remote_code=self.trust_remote_code, 45 | ) 46 | 47 | def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: 48 | """Generate text for the given HF-format messages. 49 | messages: [{ role, content: [{type:'text'|'image', text|image}] }] 50 | """ 51 | assert self.model is not None and self.processor is not None 52 | # Apply chat template and tokenize 53 | inputs = self.processor.apply_chat_template( 54 | messages, 55 | add_generation_prompt=True, 56 | tokenize=True, 57 | return_dict=True, 58 | return_tensors="pt", 59 | ) 60 | # Move inputs to the same device as model 61 | inputs = inputs.to(self.model.device) 62 | # Generate 63 | with torch.no_grad(): 64 | generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) 65 | # Trim prompt tokens from output 66 | generated_ids_trimmed = [ 67 | out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 68 | ] 69 | # Decode 70 | output_text = self.processor.batch_decode( 71 | generated_ids_trimmed, 72 | skip_special_tokens=True, 73 | clean_up_tokenization_spaces=False, 74 | ) 75 | return output_text[0] if output_text else "" 76 | ``` -------------------------------------------------------------------------------- 
/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: All‑in‑one CUA Models 3 | description: Models that support full computer-use agent capabilities with ComputerAgent.run() 4 | --- 5 | 6 | These models support complete computer-use agent functionality through `ComputerAgent.run()`. They can understand natural language instructions and autonomously perform sequences of actions to complete tasks. 7 | 8 | All agent loops are compatible with any LLM provider supported by LiteLLM. 9 | 10 | See [Running Models Locally](../local-models) for how to use Hugging Face and MLX models on your own machine. 11 | 12 | ## Gemini CUA 13 | 14 | Gemini models with computer-use capabilities: 15 | 16 | - Gemini 2.5 CUA: `gemini-2.5-computer-use-preview-10-2025` 17 | 18 | ```python 19 | agent = ComputerAgent("gemini-2.5-computer-use-preview-10-2025", tools=[computer]) 20 | async for _ in agent.run("Open Firefox and navigate to github.com"): 21 | pass 22 | ``` 23 | 24 | ## Anthropic CUAs 25 | 26 | Claude models with computer-use capabilities: 27 | 28 | - Claude 4.5: `claude-sonnet-4-5-20250929` 29 | - Claude 4.1: `claude-opus-4-1-20250805` 30 | - Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514` 31 | - Claude 3.7: `claude-3-7-sonnet-20250219` 32 | - Claude 3.5: `claude-3-5-sonnet-20241022` 33 | 34 | ```python 35 | agent = ComputerAgent("claude-3-5-sonnet-20241022", tools=[computer]) 36 | async for _ in agent.run("Open Firefox and navigate to github.com"): 37 | pass 38 | ``` 39 | 40 | ## OpenAI CUA Preview 41 | 42 | OpenAI's computer-use preview model: 43 | 44 | - Computer-use-preview: `computer-use-preview` 45 | 46 | ```python 47 | agent = ComputerAgent("openai/computer-use-preview", tools=[computer]) 48 | async for _ in agent.run("Take a screenshot and describe what you see"): 49 | pass 50 | ``` 51 | 52 | ## GLM-4.5V 53 | 54 | Zhipu AI's GLM-4.5V 
vision-language model with computer-use capabilities: 55 | 56 | - `openrouter/z-ai/glm-4.5v` 57 | - `huggingface-local/zai-org/GLM-4.5V` 58 | 59 | ```python 60 | agent = ComputerAgent("openrouter/z-ai/glm-4.5v", tools=[computer]) 61 | async for _ in agent.run("Click on the search bar and type 'hello world'"): 62 | pass 63 | ``` 64 | 65 | ## InternVL 3.5 66 | 67 | InternVL 3.5 family: 68 | - `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` 69 | 70 | ```python 71 | agent = ComputerAgent("huggingface-local/OpenGVLab/InternVL3_5-1B", tools=[computer]) 72 | async for _ in agent.run("Open Firefox and navigate to github.com"): 73 | pass 74 | ``` 75 | 76 | ## UI-TARS 1.5 77 | 78 | Unified vision-language model for computer-use: 79 | 80 | - `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` 81 | - `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint) 82 | 83 | ```python 84 | agent = ComputerAgent("huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", tools=[computer]) 85 | async for _ in agent.run("Open the settings menu and change the theme to dark mode"): 86 | pass 87 | ``` 88 | 89 | --- 90 | 91 | CUAs also support direct click prediction. See [Grounding Models](./grounding-models) for details on `predict_click()`. 92 | 93 | For details on agent loop behavior and usage, see [Agent Loops](../agent-loops). 
94 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/models/qwen2_5_vl.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List, Dict, Any, Optional 2 | 3 | # Hugging Face imports are local to avoid hard dependency at module import 4 | try: 5 | import torch # type: ignore 6 | from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore 7 | HF_AVAILABLE = True 8 | except Exception: 9 | HF_AVAILABLE = False 10 | 11 | 12 | class Qwen2_5_VLModel: 13 | """Qwen2.5-VL Hugging Face vision-language model handler. 14 | Loads an AutoModelForImageTextToText and AutoProcessor and generates text. 15 | """ 16 | 17 | def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: 18 | if not HF_AVAILABLE: 19 | raise ImportError( 20 | "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\"" 21 | ) 22 | self.model_name = model_name 23 | self.device = device 24 | self.model = None 25 | self.processor = None 26 | self.trust_remote_code = trust_remote_code 27 | self._load() 28 | 29 | def _load(self) -> None: 30 | # Load model 31 | self.model = AutoModelForImageTextToText.from_pretrained( 32 | self.model_name, 33 | torch_dtype=torch.bfloat16, 34 | device_map=self.device, 35 | attn_implementation="sdpa", 36 | trust_remote_code=self.trust_remote_code, 37 | ) 38 | # Load processor 39 | self.processor = AutoProcessor.from_pretrained( 40 | self.model_name, 41 | min_pixels=3136, 42 | max_pixels=4096 * 2160, 43 | device_map=self.device, 44 | trust_remote_code=self.trust_remote_code, 45 | ) 46 | 47 | def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: 48 | """Generate text for the given HF-format messages. 
49 | messages: [{ role, content: [{type:'text'|'image', text|image}] }] 50 | """ 51 | assert self.model is not None and self.processor is not None 52 | # Apply chat template and tokenize 53 | inputs = self.processor.apply_chat_template( 54 | messages, 55 | add_generation_prompt=True, 56 | tokenize=True, 57 | return_dict=True, 58 | return_tensors="pt", 59 | ) 60 | # Move inputs to the same device as model 61 | inputs = inputs.to(self.model.device) 62 | # Generate 63 | with torch.no_grad(): 64 | generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) 65 | # Trim prompt tokens from output 66 | generated_ids_trimmed = [ 67 | out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 68 | ] 69 | # Decode 70 | output_text = self.processor.batch_decode( 71 | generated_ids_trimmed, 72 | skip_special_tokens=True, 73 | clean_up_tokenization_spaces=False, 74 | ) 75 | return output_text[0] if output_text else "" 76 | ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/utils.py: -------------------------------------------------------------------------------- ```python 1 | import base64 2 | from typing import Tuple, Optional, Dict, Any 3 | from PIL import Image, ImageDraw 4 | import io 5 | 6 | def decode_base64_image(base64_str: str) -> bytes: 7 | """Decode a base64 string into image bytes.""" 8 | return base64.b64decode(base64_str) 9 | 10 | def encode_base64_image(image_bytes: bytes) -> str: 11 | """Encode image bytes to base64 string.""" 12 | return base64.b64encode(image_bytes).decode('utf-8') 13 | 14 | def bytes_to_image(image_bytes: bytes) -> Image.Image: 15 | """Convert bytes to PIL Image. 
16 | 17 | Args: 18 | image_bytes: Raw image bytes 19 | 20 | Returns: 21 | PIL.Image: The converted image 22 | """ 23 | return Image.open(io.BytesIO(image_bytes)) 24 | 25 | def image_to_bytes(image: Image.Image, format: str = 'PNG') -> bytes: 26 | """Convert PIL Image to bytes.""" 27 | buf = io.BytesIO() 28 | image.save(buf, format=format) 29 | return buf.getvalue() 30 | 31 | def resize_image(image_bytes: bytes, scale_factor: float) -> bytes: 32 | """Resize an image by a scale factor. 33 | 34 | Args: 35 | image_bytes: The original image as bytes 36 | scale_factor: Factor to scale the image by (e.g., 0.5 for half size, 2.0 for double) 37 | 38 | Returns: 39 | bytes: The resized image as bytes 40 | """ 41 | image = bytes_to_image(image_bytes) 42 | if scale_factor != 1.0: 43 | new_size = (int(image.width * scale_factor), int(image.height * scale_factor)) 44 | image = image.resize(new_size, Image.Resampling.LANCZOS) 45 | return image_to_bytes(image) 46 | 47 | def draw_box( 48 | image_bytes: bytes, 49 | x: int, 50 | y: int, 51 | width: int, 52 | height: int, 53 | color: str = "#FF0000", 54 | thickness: int = 2 55 | ) -> bytes: 56 | """Draw a box on an image. 57 | 58 | Args: 59 | image_bytes: The original image as bytes 60 | x: X coordinate of top-left corner 61 | y: Y coordinate of top-left corner 62 | width: Width of the box 63 | height: Height of the box 64 | color: Color of the box in hex format 65 | thickness: Thickness of the box border in pixels 66 | 67 | Returns: 68 | bytes: The modified image as bytes 69 | """ 70 | # Convert bytes to PIL Image 71 | image = bytes_to_image(image_bytes) 72 | 73 | # Create drawing context 74 | draw = ImageDraw.Draw(image) 75 | 76 | # Draw rectangle 77 | draw.rectangle( 78 | [(x, y), (x + width, y + height)], 79 | outline=color, 80 | width=thickness 81 | ) 82 | 83 | # Convert back to bytes 84 | return image_to_bytes(image) 85 | 86 | def get_image_size(image_bytes: bytes) -> Tuple[int, int]: 87 | """Get the dimensions of an image. 
88 | 89 | Args: 90 | image_bytes: The image as bytes 91 | 92 | Returns: 93 | Tuple[int, int]: Width and height of the image 94 | """ 95 | image = bytes_to_image(image_bytes) 96 | return image.size 97 | 98 | def parse_vm_info(vm_info: Dict[str, Any]) -> Optional[Dict[str, Any]]: 99 | """Parse VM info from pylume response.""" 100 | if not vm_info: 101 | return None ``` -------------------------------------------------------------------------------- /examples/computer-example-ts/src/index.ts: -------------------------------------------------------------------------------- ```typescript 1 | import { Computer, OSType } from "@trycua/computer"; 2 | import OpenAI from "openai"; 3 | import { executeAction } from "./helpers"; 4 | 5 | import "dotenv/config"; 6 | 7 | const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); 8 | 9 | const COMPUTER_USE_PROMPT = "Open firefox and go to trycua.com"; 10 | 11 | // Initialize the Computer Connection 12 | const computer = new Computer({ 13 | apiKey: process.env.CUA_API_KEY!, 14 | name: process.env.CUA_CONTAINER_NAME!, 15 | osType: OSType.LINUX, 16 | }); 17 | 18 | await computer.run(); 19 | // Take the initial screenshot 20 | const screenshot = await computer.interface.screenshot(); 21 | const screenshotBase64 = screenshot.toString("base64"); 22 | 23 | // Setup openai config for computer use 24 | const computerUseConfig: OpenAI.Responses.ResponseCreateParamsNonStreaming = { 25 | model: "computer-use-preview", 26 | tools: [ 27 | { 28 | type: "computer_use_preview", 29 | display_width: 1024, 30 | display_height: 768, 31 | environment: "linux", // we're using a linux vm 32 | }, 33 | ], 34 | truncation: "auto", 35 | }; 36 | 37 | // Send initial screenshot to the openai computer use model 38 | let res = await openai.responses.create({ 39 | ...computerUseConfig, 40 | input: [ 41 | { 42 | role: "user", 43 | content: [ 44 | // what we want the ai to do 45 | { type: "input_text", text: COMPUTER_USE_PROMPT }, 46 | // current screenshot 
of the vm 47 | { 48 | type: "input_image", 49 | image_url: `data:image/png;base64,${screenshotBase64}`, 50 | detail: "auto", 51 | }, 52 | ], 53 | }, 54 | ], 55 | }); 56 | 57 | // Loop until there are no more computer use actions. 58 | while (true) { 59 | const computerCalls = res.output.filter((o) => o.type === "computer_call"); 60 | if (computerCalls.length < 1) { 61 | console.log("No more computer calls. Loop complete."); 62 | break; 63 | } 64 | // Get the first call 65 | const call = computerCalls[0]; 66 | const action = call.action; 67 | console.log("Received action from OpenAI Responses API:", action); 68 | let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] = 69 | []; 70 | if (call.pending_safety_checks.length > 0) { 71 | console.log("Safety checks pending:", call.pending_safety_checks); 72 | // In a real implementation, you would want to get user confirmation here 73 | ackChecks = call.pending_safety_checks; 74 | } 75 | 76 | // Execute the action in the container 77 | await executeAction(computer, action); 78 | // Wait for changes to process within the container (1sec) 79 | await new Promise((resolve) => setTimeout(resolve, 1000)); 80 | 81 | // Capture new screenshot 82 | const newScreenshot = await computer.interface.screenshot(); 83 | const newScreenshotBase64 = newScreenshot.toString("base64"); 84 | 85 | // Screenshot back as computer_call_output 86 | 87 | res = await openai.responses.create({ 88 | ...computerUseConfig, 89 | previous_response_id: res.id, 90 | input: [ 91 | { 92 | type: "computer_call_output", 93 | call_id: call.call_id, 94 | acknowledged_safety_checks: ackChecks, 95 | output: { 96 | type: "computer_screenshot", 97 | image_url: `data:image/png;base64,${newScreenshotBase64}`, 98 | }, 99 | }, 100 | ], 101 | }); 102 | } 103 | 104 | process.exit(); 105 | ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/logger.py: 
-------------------------------------------------------------------------------- ```python 1 | """Logging utilities for the Computer module.""" 2 | 3 | import logging 4 | from enum import IntEnum 5 | 6 | 7 | # Keep LogLevel for backward compatibility, but it will be deprecated 8 | class LogLevel(IntEnum): 9 | """Log levels for logging. Deprecated - use standard logging levels instead.""" 10 | 11 | QUIET = 0 # Only warnings and errors 12 | NORMAL = 1 # Info level, standard output 13 | VERBOSE = 2 # More detailed information 14 | DEBUG = 3 # Full debug information 15 | 16 | 17 | # Map LogLevel to standard logging levels for backward compatibility 18 | LOGLEVEL_MAP = { 19 | LogLevel.QUIET: logging.WARNING, 20 | LogLevel.NORMAL: logging.INFO, 21 | LogLevel.VERBOSE: logging.DEBUG, 22 | LogLevel.DEBUG: logging.DEBUG, 23 | } 24 | 25 | 26 | class Logger: 27 | """Logger class for Computer.""" 28 | 29 | def __init__(self, name: str, verbosity: int): 30 | """Initialize the logger. 31 | 32 | Args: 33 | name: The name of the logger. 34 | verbosity: The log level (use standard logging levels like logging.INFO). 35 | For backward compatibility, LogLevel enum values are also accepted. 
36 | """ 37 | self.logger = logging.getLogger(name) 38 | 39 | # Convert LogLevel enum to standard logging level if needed 40 | if isinstance(verbosity, LogLevel): 41 | self.verbosity = LOGLEVEL_MAP.get(verbosity, logging.INFO) 42 | else: 43 | self.verbosity = verbosity 44 | 45 | self._configure() 46 | 47 | def _configure(self): 48 | """Configure the logger based on log level.""" 49 | # Set the logging level directly 50 | self.logger.setLevel(self.verbosity) 51 | 52 | # Log the verbosity level that was set 53 | if self.verbosity <= logging.DEBUG: 54 | self.logger.info("Logger set to DEBUG level") 55 | elif self.verbosity <= logging.INFO: 56 | self.logger.info("Logger set to INFO level") 57 | elif self.verbosity <= logging.WARNING: 58 | self.logger.warning("Logger set to WARNING level") 59 | elif self.verbosity <= logging.ERROR: 60 | self.logger.warning("Logger set to ERROR level") 61 | elif self.verbosity <= logging.CRITICAL: 62 | self.logger.warning("Logger set to CRITICAL level") 63 | 64 | def debug(self, message: str): 65 | """Log a debug message if log level is DEBUG or lower.""" 66 | self.logger.debug(message) 67 | 68 | def info(self, message: str): 69 | """Log an info message if log level is INFO or lower.""" 70 | self.logger.info(message) 71 | 72 | def verbose(self, message: str): 73 | """Log a verbose message between INFO and DEBUG levels.""" 74 | # Since there's no standard verbose level, 75 | # use debug level with [VERBOSE] prefix for backward compatibility 76 | self.logger.debug(f"[VERBOSE] {message}") 77 | 78 | def warning(self, message: str): 79 | """Log a warning message.""" 80 | self.logger.warning(message) 81 | 82 | def error(self, message: str): 83 | """Log an error message.""" 84 | self.logger.error(message) 85 | ``` -------------------------------------------------------------------------------- /docs/content/docs/computer-sdk/sandboxed-python.mdx: -------------------------------------------------------------------------------- ```markdown 1 | 
--- 2 | title: Sandboxed Python 3 | slug: sandboxed-python 4 | --- 5 | 6 | <Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py" target="_blank">Python example</a> is available for this documentation.</Callout> 7 | 8 | You can run Python functions securely inside a sandboxed virtual environment on a remote Cua Computer. This is useful for executing untrusted user code, isolating dependencies, or providing a safe environment for automation tasks. 9 | 10 | ## How It Works 11 | 12 | The `sandboxed` decorator from the Computer SDK wraps a Python function so that it is executed remotely in a specified virtual environment on the target Computer. The function and its arguments are serialized, sent to the remote, and executed in isolation. Results or errors are returned to the caller. 13 | 14 | ## Example Usage 15 | 16 | ```python 17 | from computer import Computer 18 | from computer.helpers import sandboxed 19 | 20 | @sandboxed() 21 | def read_file(location: str) -> str: 22 | """Read contents of a file""" 23 | with open(location, 'r') as f: 24 | return f.read() 25 | 26 | async def main(): 27 | async with Computer(os_type="linux", provider_type="cloud", name="my-container", api_key="...") as computer: 28 | # Call the sandboxed function (runs remotely) 29 | result = await read_file("/etc/hostname") 30 | print(result) 31 | ``` 32 | 33 | ## Installing Python Packages 34 | 35 | You can specify the virtual environment name and target computer: 36 | 37 | ```python 38 | @sandboxed(venv_name="myenv", computer=my_computer, max_retries=5) 39 | def my_function(...): 40 | ... 
41 | ``` 42 | 43 | You can also install packages in the virtual environment using the `venv_install` method: 44 | 45 | ```python 46 | await my_computer.venv_install("myenv", ["requests"]) 47 | ``` 48 | 49 | ## Example: Interacting with macOS Applications 50 | 51 | You can use sandboxed functions to interact with macOS applications on a local Cua Computer (requires `os_type="darwin"`). This is particularly useful for automation tasks that involve GUI applications. 52 | 53 | ```python 54 | # Example: Use sandboxed functions to execute code in a Cua Container 55 | from computer.helpers import sandboxed 56 | 57 | await computer.venv_install("demo_venv", ["macos-pyxa"]) # Install packages in a virtual environment 58 | 59 | @sandboxed("demo_venv") 60 | def greet_and_print(name): 61 | """Get the HTML of the current Safari tab""" 62 | import PyXA 63 | safari = PyXA.Application("Safari") 64 | html = safari.current_document.source() 65 | print(f"Hello from inside the container, {name}!") 66 | return {"greeted": name, "safari_html": html} 67 | 68 | # When a @sandboxed function is called, it will execute in the container 69 | result = await greet_and_print("Cua") 70 | # Result: {"greeted": "Cua", "safari_html": "<html>...</html>"} 71 | # stdout and stderr are also captured and printed / raised 72 | print("Result from sandboxed function:", result) 73 | ``` 74 | 75 | ## Error Handling 76 | 77 | If the remote execution fails, the decorator will retry up to `max_retries` times. If all attempts fail, the last exception is raised locally. 78 | ``` -------------------------------------------------------------------------------- /docs/content/docs/libraries/computer-server/Commands.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Supported Commands 3 | description: List of all commands supported by the Computer Server API (WebSocket and REST). 
4 | --- 5 | 6 | # Commands Reference 7 | 8 | This page lists all supported commands for the Computer Server, available via both WebSocket and REST API endpoints. 9 | 10 | | Command | Description | 11 | |---------------------|--------------------------------------------| 12 | | version | Get protocol and package version info | 13 | | run_command | Run a shell command | 14 | | screenshot | Capture a screenshot | 15 | | get_screen_size | Get the screen size | 16 | | get_cursor_position | Get the current mouse cursor position | 17 | | mouse_down | Mouse button down | 18 | | mouse_up | Mouse button up | 19 | | left_click | Left mouse click | 20 | | right_click | Right mouse click | 21 | | double_click | Double mouse click | 22 | | move_cursor | Move mouse cursor to coordinates | 23 | | drag_to | Drag mouse to coordinates | 24 | | drag | Drag mouse by offset | 25 | | key_down | Keyboard key down | 26 | | key_up | Keyboard key up | 27 | | type_text | Type text | 28 | | press_key | Press a single key | 29 | | hotkey | Press a hotkey combination | 30 | | scroll | Scroll the screen | 31 | | scroll_down | Scroll down | 32 | | scroll_up | Scroll up | 33 | | copy_to_clipboard | Copy text to clipboard | 34 | | set_clipboard | Set clipboard content | 35 | | file_exists | Check if a file exists | 36 | | directory_exists | Check if a directory exists | 37 | | list_dir | List files/directories in a directory | 38 | | read_text | Read text from a file | 39 | | write_text | Write text to a file | 40 | | read_bytes | Read bytes from a file | 41 | | write_bytes | Write bytes to a file | 42 | | get_file_size | Get file size | 43 | | delete_file | Delete a file | 44 | | create_dir | Create a directory | 45 | | delete_dir | Delete a directory | 46 | | get_accessibility_tree | Get accessibility tree (if supported) | 47 | | find_element | Find element in accessibility tree | 48 | | diorama_cmd | Run a diorama command (if supported) | 49 | ``` 
-------------------------------------------------------------------------------- /libs/lume/tests/VNCServiceTests.swift: -------------------------------------------------------------------------------- ```swift 1 | import Foundation 2 | import Testing 3 | @testable import lume 4 | 5 | @Test("VNCService starts correctly") 6 | func testVNCServiceStart() async throws { 7 | let tempDir = try createTempDirectory() 8 | let vmDir = VMDirectory(Path(tempDir.path)) 9 | let service = await MockVNCService(vmDirectory: vmDir) 10 | 11 | // Initial state 12 | let isRunning = await service.isRunning 13 | let url = await service.url 14 | #expect(!isRunning) 15 | #expect(url == nil) 16 | 17 | // Start service 18 | try await service.start(port: 5900, virtualMachine: nil) 19 | #expect(await service.isRunning) 20 | #expect(await service.url?.contains("5900") ?? false) 21 | } 22 | 23 | @Test("VNCService stops correctly") 24 | func testVNCServiceStop() async throws { 25 | let tempDir = try createTempDirectory() 26 | let vmDir = VMDirectory(Path(tempDir.path)) 27 | let service = await MockVNCService(vmDirectory: vmDir) 28 | try await service.start(port: 5900, virtualMachine: nil) 29 | 30 | await service.stop() 31 | let isRunning = await service.isRunning 32 | let url = await service.url 33 | #expect(!isRunning) 34 | #expect(url == nil) 35 | } 36 | 37 | @Test("VNCService handles client operations") 38 | func testVNCServiceClient() async throws { 39 | let tempDir = try createTempDirectory() 40 | let vmDir = VMDirectory(Path(tempDir.path)) 41 | let service = await MockVNCService(vmDirectory: vmDir) 42 | 43 | // Should fail when not started 44 | do { 45 | try await service.openClient(url: "vnc://localhost:5900") 46 | #expect(Bool(false), "Expected openClient to throw when not started") 47 | } catch VMError.vncNotConfigured { 48 | // Expected error 49 | } catch { 50 | #expect(Bool(false), "Expected vncNotConfigured error but got \(error)") 51 | } 52 | 53 | // Start and try client operations 
54 | try await service.start(port: 5900, virtualMachine: nil) 55 | try await service.openClient(url: "vnc://localhost:5900") 56 | #expect(await service.clientOpenCount == 1) 57 | 58 | // Stop and verify client operations fail 59 | await service.stop() 60 | do { 61 | try await service.openClient(url: "vnc://localhost:5900") 62 | #expect(Bool(false), "Expected openClient to throw after stopping") 63 | } catch VMError.vncNotConfigured { 64 | // Expected error 65 | } catch { 66 | #expect(Bool(false), "Expected vncNotConfigured error but got \(error)") 67 | } 68 | } 69 | 70 | @Test("VNCService handles virtual machine attachment") 71 | func testVNCServiceVMAttachment() async throws { 72 | let tempDir = try createTempDirectory() 73 | let vmDir = VMDirectory(Path(tempDir.path)) 74 | let service = await MockVNCService(vmDirectory: vmDir) 75 | let mockVM = "mock_vm" 76 | 77 | try await service.start(port: 5900, virtualMachine: mockVM) 78 | let attachedVM = await service.attachedVM 79 | #expect(attachedVM == mockVM) 80 | } 81 | 82 | private func createTempDirectory() throws -> URL { 83 | let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString) 84 | try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true) 85 | return tempDir 86 | } ``` -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/handlers/factory.py: -------------------------------------------------------------------------------- ```python 1 | import platform 2 | import subprocess 3 | from typing import Tuple, Type 4 | from .base import BaseAccessibilityHandler, BaseAutomationHandler, BaseFileHandler 5 | from computer_server.diorama.base import BaseDioramaHandler 6 | 7 | # Conditionally import platform-specific handlers 8 | system = platform.system().lower() 9 | if system == 'darwin': 10 | from .macos import MacOSAccessibilityHandler, MacOSAutomationHandler 11 | from 
computer_server.diorama.macos import MacOSDioramaHandler 12 | elif system == 'linux': 13 | from .linux import LinuxAccessibilityHandler, LinuxAutomationHandler 14 | elif system == 'windows': 15 | from .windows import WindowsAccessibilityHandler, WindowsAutomationHandler 16 | 17 | from .generic import GenericFileHandler 18 | 19 | class HandlerFactory: 20 | """Factory for creating OS-specific handlers.""" 21 | 22 | @staticmethod 23 | def _get_current_os() -> str: 24 | """Determine the current OS. 25 | 26 | Returns: 27 | str: The OS type ('darwin' for macOS, 'linux' for Linux, or 'windows' for Windows) 28 | 29 | Raises: 30 | RuntimeError: If unable to determine the current OS 31 | """ 32 | try: 33 | # Use platform.system() as primary method 34 | system = platform.system().lower() 35 | if system in ['darwin', 'linux', 'windows']: 36 | return system 37 | 38 | # Fallback to uname if platform.system() doesn't return expected values (Unix-like systems only) 39 | result = subprocess.run(['uname', '-s'], capture_output=True, text=True) 40 | if result.returncode == 0: 41 | return result.stdout.strip().lower() 42 | 43 | raise RuntimeError(f"Unsupported OS: {system}") 44 | except Exception as e: 45 | raise RuntimeError(f"Failed to determine current OS: {str(e)}") 46 | 47 | @staticmethod 48 | def create_handlers() -> Tuple[BaseAccessibilityHandler, BaseAutomationHandler, BaseDioramaHandler, BaseFileHandler]: 49 | """Create and return appropriate handlers for the current OS. 50 | 51 | Returns: 52 | Tuple[BaseAccessibilityHandler, BaseAutomationHandler, BaseDioramaHandler, BaseFileHandler]: A tuple containing 53 | the appropriate accessibility, automation, diorama, and file handlers for the current OS. 
54 | 55 | Raises: 56 | NotImplementedError: If the current OS is not supported 57 | RuntimeError: If unable to determine the current OS 58 | """ 59 | os_type = HandlerFactory._get_current_os() 60 | 61 | if os_type == 'darwin': 62 | return MacOSAccessibilityHandler(), MacOSAutomationHandler(), MacOSDioramaHandler(), GenericFileHandler() 63 | elif os_type == 'linux': 64 | return LinuxAccessibilityHandler(), LinuxAutomationHandler(), BaseDioramaHandler(), GenericFileHandler() 65 | elif os_type == 'windows': 66 | return WindowsAccessibilityHandler(), WindowsAutomationHandler(), BaseDioramaHandler(), GenericFileHandler() 67 | else: 68 | raise NotImplementedError(f"OS '{os_type}' is not supported") 69 | ``` -------------------------------------------------------------------------------- /libs/lume/tests/VM/VMDetailsPrinterTests.swift: -------------------------------------------------------------------------------- ```swift 1 | import Foundation 2 | import Testing 3 | 4 | @testable import lume 5 | 6 | struct VMDetailsPrinterTests { 7 | 8 | @Test func printStatus_whenJSON() throws { 9 | // Given 10 | let vms: [VMDetails] = [ 11 | VMDetails( 12 | name: "name", 13 | os: "os", 14 | cpuCount: 2, 15 | memorySize: 1024, 16 | diskSize: .init(allocated: 24, total: 30), 17 | display: "1024x768", 18 | status: "status", 19 | vncUrl: "vncUrl", 20 | ipAddress: "0.0.0.0", 21 | locationName: "mockLocation") 22 | ] 23 | let jsonEncoder = JSONEncoder() 24 | jsonEncoder.outputFormatting = .prettyPrinted 25 | let expectedOutput = try String(data: jsonEncoder.encode(vms), encoding: .utf8)! 26 | 27 | // When 28 | var printedStatus: String? 29 | try VMDetailsPrinter.printStatus(vms, format: .json, print: { printedStatus = $0 }) 30 | 31 | // Then 32 | // Decode both JSONs and compare the actual data structures 33 | let jsonDecoder = JSONDecoder() 34 | let printedVMs = try jsonDecoder.decode( 35 | [VMDetails].self, from: printedStatus!.data(using: .utf8)!) 
36 | let expectedVMs = try jsonDecoder.decode( 37 | [VMDetails].self, from: expectedOutput.data(using: .utf8)!) 38 | 39 | #expect(printedVMs.count == expectedVMs.count) 40 | for (printed, expected) in zip(printedVMs, expectedVMs) { 41 | #expect(printed.name == expected.name) 42 | #expect(printed.os == expected.os) 43 | #expect(printed.cpuCount == expected.cpuCount) 44 | #expect(printed.memorySize == expected.memorySize) 45 | #expect(printed.diskSize.allocated == expected.diskSize.allocated) 46 | #expect(printed.diskSize.total == expected.diskSize.total) 47 | #expect(printed.status == expected.status) 48 | #expect(printed.vncUrl == expected.vncUrl) 49 | #expect(printed.ipAddress == expected.ipAddress) 50 | } 51 | } 52 | 53 | @Test func printStatus_whenNotJSON() throws { 54 | // Given 55 | let vms: [VMDetails] = [ 56 | VMDetails( 57 | name: "name", 58 | os: "os", 59 | cpuCount: 2, 60 | memorySize: 1024, 61 | diskSize: .init(allocated: 24, total: 30), 62 | display: "1024x768", 63 | status: "status", 64 | vncUrl: "vncUrl", 65 | ipAddress: "0.0.0.0", 66 | locationName: "mockLocation") 67 | ] 68 | 69 | // When 70 | var printedLines: [String] = [] 71 | try VMDetailsPrinter.printStatus(vms, format: .text, print: { printedLines.append($0) }) 72 | 73 | // Then 74 | #expect(printedLines.count == 2) 75 | 76 | let headerParts = printedLines[0].split(whereSeparator: \.isWhitespace) 77 | #expect( 78 | headerParts == [ 79 | "name", "os", "cpu", "memory", "disk", "display", "status", "storage", "shared_dirs", "ip", "vnc", 80 | ]) 81 | 82 | #expect( 83 | printedLines[1].split(whereSeparator: \.isWhitespace).map(String.init) == [ 84 | "name", "os", "2", "0.00G", "24.0B/30.0B", "1024x768", "status", "mockLocation", 85 | "-", 86 | "0.0.0.0", 87 | "vncUrl", 88 | ]) 89 | } 90 | } 91 | ``` -------------------------------------------------------------------------------- /libs/lume/src/Server/HTTP.swift: -------------------------------------------------------------------------------- 
```swift 1 | import Foundation 2 | import Network 3 | 4 | enum HTTPError: Error { 5 | case internalError 6 | } 7 | 8 | struct HTTPRequest { 9 | let method: String 10 | let path: String 11 | let headers: [String: String] 12 | let body: Data? 13 | 14 | init?(data: Data) { 15 | guard let requestString = String(data: data, encoding: .utf8) else { return nil } 16 | let components = requestString.components(separatedBy: "\r\n\r\n") 17 | guard components.count >= 1 else { return nil } 18 | 19 | let headerLines = components[0].components(separatedBy: "\r\n") 20 | guard !headerLines.isEmpty else { return nil } 21 | 22 | // Parse request line 23 | let requestLine = headerLines[0].components(separatedBy: " ") 24 | guard requestLine.count >= 2 else { return nil } 25 | 26 | self.method = requestLine[0] 27 | self.path = requestLine[1] 28 | 29 | // Parse headers 30 | var headers: [String: String] = [:] 31 | for line in headerLines.dropFirst() { 32 | let headerComponents = line.split(separator: ":", maxSplits: 1).map(String.init) 33 | if headerComponents.count == 2 { 34 | headers[headerComponents[0].trimmingCharacters(in: .whitespaces)] = 35 | headerComponents[1].trimmingCharacters(in: .whitespaces) 36 | } 37 | } 38 | self.headers = headers 39 | 40 | // Parse body if present 41 | if components.count > 1 { 42 | self.body = components[1].data(using: .utf8) 43 | } else { 44 | self.body = nil 45 | } 46 | } 47 | } 48 | 49 | struct HTTPResponse { 50 | enum StatusCode: Int { 51 | case ok = 200 52 | case accepted = 202 53 | case badRequest = 400 54 | case notFound = 404 55 | case internalServerError = 500 56 | 57 | var description: String { 58 | switch self { 59 | case .ok: return "OK" 60 | case .accepted: return "Accepted" 61 | case .badRequest: return "Bad Request" 62 | case .notFound: return "Not Found" 63 | case .internalServerError: return "Internal Server Error" 64 | } 65 | } 66 | } 67 | 68 | let statusCode: StatusCode 69 | let headers: [String: String] 70 | let body: Data? 
71 | 72 | init(statusCode: StatusCode, headers: [String: String] = [:], body: Data? = nil) { 73 | self.statusCode = statusCode 74 | self.headers = headers 75 | self.body = body 76 | } 77 | 78 | init(statusCode: StatusCode, body: String) { 79 | self.statusCode = statusCode 80 | self.headers = ["Content-Type": "text/plain"] 81 | self.body = body.data(using: .utf8) 82 | } 83 | 84 | func serialize() -> Data { 85 | var response = "HTTP/1.1 \(statusCode.rawValue) \(statusCode.description)\r\n" 86 | 87 | var headers = self.headers 88 | if let body = body { 89 | headers["Content-Length"] = "\(body.count)" 90 | } 91 | 92 | for (key, value) in headers { 93 | response += "\(key): \(value)\r\n" 94 | } 95 | 96 | response += "\r\n" 97 | 98 | var responseData = response.data(using: .utf8) ?? Data() 99 | if let body = body { 100 | responseData.append(body) 101 | } 102 | 103 | return responseData 104 | } 105 | } 106 | 107 | final class HTTPServer { 108 | let port: UInt16 109 | 110 | init(port: UInt16) { 111 | self.port = port 112 | } 113 | } ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/callbacks/pii_anonymization.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | PII anonymization callback handler using Microsoft Presidio for text and image redaction. 3 | """ 4 | 5 | from typing import List, Dict, Any, Optional, Tuple 6 | from .base import AsyncCallbackHandler 7 | import base64 8 | import io 9 | import logging 10 | 11 | try: 12 | # TODO: Add Presidio dependencies 13 | from PIL import Image 14 | PRESIDIO_AVAILABLE = True 15 | except ImportError: 16 | PRESIDIO_AVAILABLE = False 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | class PIIAnonymizationCallback(AsyncCallbackHandler): 21 | """ 22 | Callback handler that anonymizes PII in text and images using Microsoft Presidio. 23 | 24 | This handler: 25 | 1. 
Anonymizes PII in messages before sending to the agent loop 26 | 2. Deanonymizes PII in tool calls and message outputs after the agent loop 27 | 3. Redacts PII from images in computer_call_output messages 28 | """ 29 | 30 | def __init__( 31 | self, 32 | # TODO: Any extra kwargs if needed 33 | ): 34 | """ 35 | Initialize the PII anonymization callback. 36 | 37 | Args: 38 | anonymize_text: Whether to anonymize text content 39 | anonymize_images: Whether to redact images 40 | entities_to_anonymize: List of entity types to anonymize (None for all) 41 | anonymization_operator: Presidio operator to use ("replace", "mask", "redact", etc.) 42 | image_redaction_color: RGB color for image redaction 43 | """ 44 | if not PRESIDIO_AVAILABLE: 45 | raise ImportError( 46 | "Presidio is not available. Install with: " 47 | "pip install cua-agent[pii-anonymization]" 48 | ) 49 | 50 | # TODO: Implement __init__ 51 | 52 | async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 53 | """ 54 | Anonymize PII in messages before sending to agent loop. 55 | 56 | Args: 57 | messages: List of message dictionaries 58 | 59 | Returns: 60 | List of messages with PII anonymized 61 | """ 62 | anonymized_messages = [] 63 | for msg in messages: 64 | anonymized_msg = await self._anonymize_message(msg) 65 | anonymized_messages.append(anonymized_msg) 66 | 67 | return anonymized_messages 68 | 69 | async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 70 | """ 71 | Deanonymize PII in tool calls and message outputs after agent loop. 
72 | 73 | Args: 74 | output: List of output dictionaries 75 | 76 | Returns: 77 | List of output with PII deanonymized for tool calls 78 | """ 79 | deanonymized_output = [] 80 | for item in output: 81 | # Only deanonymize tool calls and computer_call messages 82 | if item.get("type") in ["computer_call", "computer_call_output"]: 83 | deanonymized_item = await self._deanonymize_item(item) 84 | deanonymized_output.append(deanonymized_item) 85 | else: 86 | deanonymized_output.append(item) 87 | 88 | return deanonymized_output 89 | 90 | async def _anonymize_message(self, message: Dict[str, Any]) -> Dict[str, Any]: 91 | # TODO: Implement _anonymize_message 92 | return message 93 | 94 | async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]: 95 | # TODO: Implement _deanonymize_item 96 | return item 97 | ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Grounding Models 3 | description: Models that support click prediction with ComputerAgent.predict_click() 4 | --- 5 | 6 | These models specialize in UI element grounding and click prediction. They can identify precise coordinates for UI elements based on natural language descriptions, but cannot perform autonomous task planning. 7 | 8 | Use `ComputerAgent.predict_click()` to get coordinates for specific UI elements. 9 | 10 | All models that support `ComputerAgent.run()` also support `ComputerAgent.predict_click()`. See [All‑in‑one CUAs](./computer-use-agents). 
11 | 12 | ### Anthropic CUAs 13 | 14 | - Claude 4.1: `claude-opus-4-1-20250805` 15 | - Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514` 16 | - Claude 3.7: `claude-3-7-sonnet-20250219` 17 | - Claude 3.5: `claude-3-5-sonnet-20241022` 18 | 19 | ### OpenAI CUA Preview 20 | - Computer-use-preview: `computer-use-preview` 21 | 22 | ### UI-TARS 1.5 (Unified VLM with grounding support) 23 | - `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` 24 | - `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint) 25 | 26 | ## Specialized Grounding Models 27 | 28 | These models are optimized specifically for click prediction and UI element grounding: 29 | 30 | ### OpenCUA 31 | - `huggingface-local/xlangai/OpenCUA-{7B,32B}` 32 | 33 | ### GTA1 Family 34 | - `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` 35 | 36 | ### Holo 1.5 Family 37 | - `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` 38 | 39 | ### InternVL 3.5 Family 40 | - `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` 41 | 42 | ### OmniParser (OCR) 43 | 44 | OCR-focused set-of-marks model that requires an LLM for click prediction: 45 | 46 | - `omniparser` (requires combination with any LiteLLM vision model) 47 | 48 | ### Moondream3 (Local Grounding) 49 | 50 | Moondream3 is a powerful small model that can perform UI grounding and click prediction. 
51 | 52 | - `moondream3` 53 | 54 | ## Usage Examples 55 | 56 | ```python 57 | # Using any grounding model for click prediction 58 | agent = ComputerAgent("claude-3-5-sonnet-20241022", tools=[computer]) 59 | 60 | # Predict coordinates for specific elements 61 | login_coords = agent.predict_click("find the login button") 62 | search_coords = agent.predict_click("locate the search text field") 63 | menu_coords = agent.predict_click("find the hamburger menu icon") 64 | 65 | print(f"Login button: {login_coords}") 66 | print(f"Search field: {search_coords}") 67 | print(f"Menu icon: {menu_coords}") 68 | ``` 69 | 70 | ```python 71 | # OmniParser is just for OCR, so it requires an LLM for predict_click 72 | agent = ComputerAgent("omniparser+anthropic/claude-3-5-sonnet-20241022", tools=[computer]) 73 | 74 | # Predict click coordinates using composed agent 75 | coords = agent.predict_click("find the submit button") 76 | print(f"Click coordinates: {coords}") # (450, 320) 77 | 78 | # Note: Cannot use omniparser alone for click prediction 79 | # This will raise an error: 80 | # agent = ComputerAgent("omniparser", tools=[computer]) 81 | # coords = agent.predict_click("find button") # Error! 82 | ``` 83 | 84 | ```python 85 | agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B", tools=[computer]) 86 | 87 | # Predict click coordinates for UI elements 88 | coords = agent.predict_click("find the submit button") 89 | print(f"Click coordinates: {coords}") # (450, 320) 90 | 91 | # Note: GTA1 cannot perform autonomous task planning 92 | # This will raise an error: 93 | # agent.run("Fill out the form and submit it") 94 | ``` 95 | 96 | --- 97 | 98 | For information on combining grounding models with planning capabilities, see [Composed Agents](./composed-agents) and [All‑in‑one CUAs](./computer-use-agents). 
99 | ``` -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/server.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Server interface for Computer API. 3 | Provides a clean API for starting and stopping the server. 4 | """ 5 | 6 | import asyncio 7 | import logging 8 | import uvicorn 9 | from typing import Optional 10 | from fastapi import FastAPI 11 | 12 | from .main import app as fastapi_app 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class Server: 18 | """ 19 | Server interface for Computer API. 20 | 21 | Usage: 22 | from computer_api import Server 23 | 24 | # Synchronous usage 25 | server = Server() 26 | server.start() # Blocks until server is stopped 27 | 28 | # Asynchronous usage 29 | server = Server() 30 | await server.start_async() # Starts server in background 31 | # Do other things 32 | await server.stop() # Stop the server 33 | """ 34 | 35 | def __init__(self, host: str = "0.0.0.0", port: int = 8000, log_level: str = "info", 36 | ssl_keyfile: Optional[str] = None, ssl_certfile: Optional[str] = None): 37 | """ 38 | Initialize the server. 39 | 40 | Args: 41 | host: Host to bind the server to 42 | port: Port to bind the server to 43 | log_level: Logging level (debug, info, warning, error, critical) 44 | ssl_keyfile: Path to SSL private key file (for HTTPS) 45 | ssl_certfile: Path to SSL certificate file (for HTTPS) 46 | """ 47 | self.host = host 48 | self.port = port 49 | self.log_level = log_level 50 | self.ssl_keyfile = ssl_keyfile 51 | self.ssl_certfile = ssl_certfile 52 | self.app = fastapi_app 53 | self._server_task: Optional[asyncio.Task] = None 54 | self._should_exit = asyncio.Event() 55 | 56 | def start(self) -> None: 57 | """ 58 | Start the server synchronously. This will block until the server is stopped. 
59 | """ 60 | uvicorn.run( 61 | self.app, 62 | host=self.host, 63 | port=self.port, 64 | log_level=self.log_level, 65 | ssl_keyfile=self.ssl_keyfile, 66 | ssl_certfile=self.ssl_certfile 67 | ) 68 | 69 | async def start_async(self) -> None: 70 | """ 71 | Start the server asynchronously. This will return immediately and the server 72 | will run in the background. 73 | """ 74 | server_config = uvicorn.Config( 75 | self.app, 76 | host=self.host, 77 | port=self.port, 78 | log_level=self.log_level, 79 | ssl_keyfile=self.ssl_keyfile, 80 | ssl_certfile=self.ssl_certfile 81 | ) 82 | 83 | self._should_exit.clear() 84 | server = uvicorn.Server(server_config) 85 | 86 | # Create a task to run the server 87 | self._server_task = asyncio.create_task(server.serve()) 88 | 89 | # Wait a short time to ensure the server starts 90 | await asyncio.sleep(0.5) 91 | 92 | protocol = "https" if self.ssl_certfile else "http" 93 | logger.info(f"Server started at {protocol}://{self.host}:{self.port}") 94 | 95 | async def stop(self) -> None: 96 | """ 97 | Stop the server if it's running asynchronously. 
98 | """ 99 | if self._server_task and not self._server_task.done(): 100 | # Signal the server to exit 101 | self._should_exit.set() 102 | 103 | # Cancel the server task 104 | self._server_task.cancel() 105 | 106 | try: 107 | await self._server_task 108 | except asyncio.CancelledError: 109 | logger.info("Server stopped") 110 | 111 | self._server_task = None 112 | ``` -------------------------------------------------------------------------------- /libs/lume/src/VM/VMDetailsPrinter.swift: -------------------------------------------------------------------------------- ```swift 1 | import Foundation 2 | 3 | /// Prints VM status information in a formatted table 4 | enum VMDetailsPrinter { 5 | /// Represents a column in the VM status table 6 | private struct Column: Sendable { 7 | let header: String 8 | let width: Int 9 | let getValue: @Sendable (VMDetails) -> String 10 | } 11 | 12 | /// Configuration for all columns in the status table 13 | private static let columns: [Column] = [ 14 | Column(header: "name", width: 34, getValue: { $0.name }), 15 | Column(header: "os", width: 8, getValue: { $0.os }), 16 | Column(header: "cpu", width: 8, getValue: { String($0.cpuCount) }), 17 | Column( 18 | header: "memory", width: 8, 19 | getValue: { 20 | String(format: "%.2fG", Float($0.memorySize) / (1024 * 1024 * 1024)) 21 | }), 22 | Column( 23 | header: "disk", width: 16, 24 | getValue: { 25 | "\($0.diskSize.formattedAllocated)/\($0.diskSize.formattedTotal)" 26 | }), 27 | Column(header: "display", width: 12, getValue: { $0.display }), 28 | Column( 29 | header: "status", width: 16, 30 | getValue: { 31 | $0.status 32 | }), 33 | Column(header: "storage", width: 16, getValue: { $0.locationName }), 34 | Column( 35 | header: "shared_dirs", width: 54, 36 | getValue: { vm in 37 | // Only show shared directories if the VM is running 38 | if vm.status == "running", let dirs = vm.sharedDirectories, !dirs.isEmpty { 39 | return dirs.map { "\($0.hostPath) (\($0.readOnly ? 
"ro" : "rw"))" }.joined(separator: ", ") 40 | } else { 41 | return "-" 42 | } 43 | }), 44 | Column( 45 | header: "ip", width: 16, 46 | getValue: { 47 | $0.ipAddress ?? "-" 48 | }), 49 | Column( 50 | header: "vnc", width: 50, 51 | getValue: { 52 | $0.vncUrl ?? "-" 53 | }), 54 | ] 55 | 56 | /// Prints the status of all VMs in a formatted table 57 | /// - Parameter vms: Array of VM status objects to display 58 | static func printStatus( 59 | _ vms: [VMDetails], format: FormatOption, print: (String) -> Void = { print($0) } 60 | ) throws { 61 | if format == .json { 62 | let jsonEncoder = JSONEncoder() 63 | jsonEncoder.outputFormatting = .prettyPrinted 64 | let jsonData = try jsonEncoder.encode(vms) 65 | let jsonString = String(data: jsonData, encoding: .utf8)! 66 | print(jsonString) 67 | } else { 68 | printHeader(print: print) 69 | vms.forEach({ vm in 70 | printVM(vm, print: print) 71 | }) 72 | } 73 | } 74 | 75 | private static func printHeader(print: (String) -> Void = { print($0) }) { 76 | let paddedHeaders = columns.map { $0.header.paddedToWidth($0.width) } 77 | print(paddedHeaders.joined()) 78 | } 79 | 80 | private static func printVM(_ vm: VMDetails, print: (String) -> Void = { print($0) }) { 81 | let paddedColumns = columns.map { column in 82 | column.getValue(vm).paddedToWidth(column.width) 83 | } 84 | print(paddedColumns.joined()) 85 | } 86 | } 87 | 88 | extension String { 89 | /// Pads the string to the specified width with spaces 90 | /// - Parameter width: Target width for padding 91 | /// - Returns: Padded string 92 | fileprivate func paddedToWidth(_ width: Int) -> String { 93 | padding(toLength: width, withPad: " ", startingAt: 0) 94 | } 95 | } 96 | ``` -------------------------------------------------------------------------------- /libs/lume/src/VM/DarwinVM.swift: -------------------------------------------------------------------------------- ```swift 1 | import Foundation 2 | 3 | /// macOS-specific virtual machine implementation 4 | @MainActor 5 | final 
class DarwinVM: VM { 6 | private let imageLoader: ImageLoader 7 | 8 | init( 9 | vmDirContext: VMDirContext, 10 | virtualizationServiceFactory: @escaping (VMVirtualizationServiceContext) throws -> VMVirtualizationService = { try DarwinVirtualizationService(configuration: $0) }, 11 | vncServiceFactory: @escaping (VMDirectory) -> VNCService = { DefaultVNCService(vmDirectory: $0) }, 12 | imageLoader: ImageLoader 13 | ) { 14 | self.imageLoader = imageLoader 15 | super.init( 16 | vmDirContext: vmDirContext, 17 | virtualizationServiceFactory: virtualizationServiceFactory, 18 | vncServiceFactory: vncServiceFactory 19 | ) 20 | } 21 | 22 | override func getOSType() -> String { 23 | return "macOS" 24 | } 25 | 26 | // MARK: - Installation and Configuration 27 | 28 | override func setup(ipswPath: String, cpuCount: Int, memorySize: UInt64, diskSize: UInt64, display: String) async throws { 29 | let imagePath: Path 30 | if ipswPath == "latest" { 31 | Logger.info("Downloading latest supported Image...") 32 | let downloadedPath = try await self.imageLoader.downloadLatestImage() 33 | imagePath = Path(downloadedPath.path) 34 | } else { 35 | imagePath = Path(ipswPath) 36 | } 37 | 38 | let requirements = try await imageLoader.loadImageRequirements(from: imagePath.url) 39 | try setDiskSize(diskSize) 40 | 41 | let finalCpuCount = max(cpuCount, requirements.minimumSupportedCPUCount) 42 | try setCpuCount(finalCpuCount) 43 | if finalCpuCount != cpuCount { 44 | Logger.info("CPU count overridden due to minimum image requirements", metadata: ["original": "\(cpuCount)", "final": "\(finalCpuCount)"]) 45 | } 46 | 47 | let finalMemorySize = max(memorySize, requirements.minimumSupportedMemorySize) 48 | try setMemorySize(finalMemorySize) 49 | if finalMemorySize != memorySize { 50 | Logger.info("Memory size overridden due to minimum image requirements", metadata: ["original": "\(memorySize)", "final": "\(finalMemorySize)"]) 51 | } 52 | 53 | try updateVMConfig( 54 | vmConfig: try VMConfig( 55 | os: 
getOSType(), 56 | cpuCount: finalCpuCount, 57 | memorySize: finalMemorySize, 58 | diskSize: diskSize, 59 | macAddress: DarwinVirtualizationService.generateMacAddress(), 60 | display: display, 61 | hardwareModel: requirements.hardwareModel, 62 | machineIdentifier: DarwinVirtualizationService.generateMachineIdentifier() 63 | ) 64 | ) 65 | 66 | let service: any VMVirtualizationService = try virtualizationServiceFactory( 67 | try createVMVirtualizationServiceContext( 68 | cpuCount: finalCpuCount, 69 | memorySize: finalMemorySize, 70 | display: display 71 | ) 72 | ) 73 | guard let darwinService = service as? DarwinVirtualizationService else { 74 | throw VMError.internalError("Installation requires DarwinVirtualizationService") 75 | } 76 | 77 | // Create auxiliary storage with hardware model 78 | try darwinService.createAuxiliaryStorage(at: vmDirContext.nvramPath, hardwareModel: requirements.hardwareModel) 79 | 80 | try await darwinService.installMacOS(imagePath: imagePath) { progress in 81 | Logger.info("Installing macOS", metadata: ["progress": "\(Int(progress * 100))%"]) 82 | } 83 | } 84 | } 85 | ``` -------------------------------------------------------------------------------- /scripts/build.sh: -------------------------------------------------------------------------------- ```bash 1 | #!/bin/bash 2 | 3 | # Exit on error 4 | set -e 5 | 6 | # Colors for output 7 | RED='\033[0;31m' 8 | GREEN='\033[0;32m' 9 | BLUE='\033[0;34m' 10 | NC='\033[0m' # No Color 11 | 12 | # Function to print step information 13 | print_step() { 14 | echo -e "${BLUE}==> $1${NC}" 15 | } 16 | 17 | # Function to print success message 18 | print_success() { 19 | echo -e "${GREEN}==> Success: $1${NC}" 20 | } 21 | 22 | # Function to print error message 23 | print_error() { 24 | echo -e "${RED}==> Error: $1${NC}" >&2 25 | } 26 | 27 | # Get the script's directory 28 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 29 | PROJECT_ROOT="$( cd "${SCRIPT_DIR}/.." 
&& pwd )" 30 | 31 | # Change to project root 32 | cd "$PROJECT_ROOT" 33 | 34 | # Load environment variables from .env.local 35 | if [ -f .env.local ]; then 36 | print_step "Loading environment variables from .env.local..." 37 | set -a 38 | source .env.local 39 | set +a 40 | print_success "Environment variables loaded" 41 | else 42 | print_error ".env.local file not found" 43 | exit 1 44 | fi 45 | 46 | # Clean up existing environments and cache 47 | print_step "Cleaning up existing environments..." 48 | find . -type d -name "__pycache__" -exec rm -rf {} + 49 | find . -type d -name ".pytest_cache" -exec rm -rf {} + 50 | find . -type d -name "dist" -exec rm -rf {} + 51 | find . -type d -name ".venv" -exec rm -rf {} + 52 | find . -type d -name "*.egg-info" -exec rm -rf {} + 53 | print_success "Environment cleanup complete" 54 | 55 | # Create and activate virtual environment 56 | print_step "Creating virtual environment..." 57 | python -m venv .venv 58 | source .venv/bin/activate 59 | 60 | # Upgrade pip and install build tools 61 | print_step "Upgrading pip and installing build tools..." 62 | python -m pip install --upgrade pip setuptools wheel 63 | 64 | # Function to install a package and its dependencies 65 | install_package() { 66 | local package_dir=$1 67 | local package_name=$2 68 | local extras=$3 69 | print_step "Installing ${package_name}..." 70 | cd "$package_dir" 71 | 72 | if [ -f "pyproject.toml" ]; then 73 | if [ -n "$extras" ]; then 74 | pip install -e ".[${extras}]" 75 | else 76 | pip install -e . 77 | fi 78 | else 79 | print_error "No pyproject.toml found in ${package_dir}" 80 | return 1 81 | fi 82 | 83 | cd "$PROJECT_ROOT" 84 | } 85 | 86 | # Install packages in order of dependency 87 | print_step "Installing packages in development mode..." 
88 | 89 | # Install core first (base package with telemetry support) 90 | install_package "libs/python/core" "core" 91 | 92 | # Install pylume (base dependency) 93 | install_package "libs/python/pylume" "pylume" 94 | 95 | # Install computer with all its dependencies and extras 96 | install_package "libs/python/computer" "computer" "all" 97 | 98 | # Install omniparser 99 | install_package "libs/python/som" "som" 100 | 101 | # Install agent with all its dependencies and extras 102 | install_package "libs/python/agent" "agent" "all" 103 | 104 | # Install computer-server 105 | install_package "libs/python/computer-server" "computer-server" 106 | 107 | # Install mcp-server 108 | install_package "libs/python/mcp-server" "mcp-server" 109 | 110 | # Install development tools from root project 111 | print_step "Installing development dependencies..." 112 | pip install -e ".[dev,test,docs]" 113 | 114 | # Create a .env file for VS Code to use the virtual environment 115 | print_step "Creating .env file for VS Code..." 116 | echo "PYTHONPATH=${PROJECT_ROOT}/libs/python/core:${PROJECT_ROOT}/libs/python/computer:${PROJECT_ROOT}/libs/python/agent:${PROJECT_ROOT}/libs/python/som:${PROJECT_ROOT}/libs/python/pylume:${PROJECT_ROOT}/libs/python/computer-server:${PROJECT_ROOT}/libs/python/mcp-server" > .env 117 | 118 | print_success "All packages installed successfully!" 119 | print_step "Your virtual environment is ready. To activate it:" 120 | echo " source .venv/bin/activate" 121 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/callbacks/image_retention.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Image retention callback handler that limits the number of recent images in message history. 
3 | """ 4 | 5 | from typing import List, Dict, Any, Optional 6 | from .base import AsyncCallbackHandler 7 | 8 | 9 | class ImageRetentionCallback(AsyncCallbackHandler): 10 | """ 11 | Callback handler that applies image retention policy to limit the number 12 | of recent images in message history to prevent context window overflow. 13 | """ 14 | 15 | def __init__(self, only_n_most_recent_images: Optional[int] = None): 16 | """ 17 | Initialize the image retention callback. 18 | 19 | Args: 20 | only_n_most_recent_images: If set, only keep the N most recent images in message history 21 | """ 22 | self.only_n_most_recent_images = only_n_most_recent_images 23 | 24 | async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 25 | """ 26 | Apply image retention policy to messages before sending to agent loop. 27 | 28 | Args: 29 | messages: List of message dictionaries 30 | 31 | Returns: 32 | List of messages with image retention policy applied 33 | """ 34 | if self.only_n_most_recent_images is None: 35 | return messages 36 | 37 | return self._apply_image_retention(messages) 38 | 39 | def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 40 | """Apply image retention policy to keep only the N most recent images. 41 | 42 | Removes computer_call_output items with image_url and their corresponding computer_call items, 43 | keeping only the most recent N image pairs based on only_n_most_recent_images setting. 
44 | 45 | Args: 46 | messages: List of message dictionaries 47 | 48 | Returns: 49 | Filtered list of messages with image retention applied 50 | """ 51 | if self.only_n_most_recent_images is None: 52 | return messages 53 | 54 | # Gather indices of all computer_call_output messages that contain an image_url 55 | output_indices: List[int] = [] 56 | for idx, msg in enumerate(messages): 57 | if msg.get("type") == "computer_call_output": 58 | out = msg.get("output") 59 | if isinstance(out, dict) and ("image_url" in out): 60 | output_indices.append(idx) 61 | 62 | # Nothing to trim 63 | if len(output_indices) <= self.only_n_most_recent_images: 64 | return messages 65 | 66 | # Determine which outputs to keep (most recent N) 67 | keep_output_indices = set(output_indices[-self.only_n_most_recent_images :]) 68 | 69 | # Build set of indices to remove in one pass 70 | to_remove: set[int] = set() 71 | 72 | for idx in output_indices: 73 | if idx in keep_output_indices: 74 | continue # keep this screenshot and its context 75 | 76 | to_remove.add(idx) # remove the computer_call_output itself 77 | 78 | # Remove the immediately preceding computer_call with matching call_id (if present) 79 | call_id = messages[idx].get("call_id") 80 | prev_idx = idx - 1 81 | if prev_idx >= 0 and messages[prev_idx].get("type") == "computer_call" and messages[prev_idx].get("call_id") == call_id: 82 | to_remove.add(prev_idx) 83 | # Check a single reasoning immediately before that computer_call 84 | r_idx = prev_idx - 1 85 | if r_idx >= 0 and messages[r_idx].get("type") == "reasoning": 86 | to_remove.add(r_idx) 87 | 88 | # Construct filtered list 89 | filtered = [m for i, m in enumerate(messages) if i not in to_remove] 90 | return filtered ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/interface/models.py: -------------------------------------------------------------------------------- ```python 1 | from enum import Enum 2 | from typing 
import Dict, List, Any, TypedDict, Union, Literal 3 | from dataclasses import dataclass 4 | 5 | @dataclass 6 | class CommandResult: 7 | stdout: str 8 | stderr: str 9 | returncode: int 10 | 11 | def __init__(self, stdout: str, stderr: str, returncode: int): 12 | self.stdout = stdout 13 | self.stderr = stderr 14 | self.returncode = returncode 15 | 16 | # Navigation key literals 17 | NavigationKey = Literal['pagedown', 'pageup', 'home', 'end', 'left', 'right', 'up', 'down'] 18 | 19 | # Special key literals 20 | SpecialKey = Literal['enter', 'esc', 'tab', 'space', 'backspace', 'del'] 21 | 22 | # Modifier key literals 23 | ModifierKey = Literal['ctrl', 'alt', 'shift', 'win', 'command', 'option'] 24 | 25 | # Function key literals 26 | FunctionKey = Literal['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'] 27 | 28 | class Key(Enum): 29 | """Keyboard keys that can be used with press_key. 30 | 31 | These key names map to PyAutoGUI's expected key names. 32 | """ 33 | # Navigation 34 | PAGE_DOWN = 'pagedown' 35 | PAGE_UP = 'pageup' 36 | HOME = 'home' 37 | END = 'end' 38 | LEFT = 'left' 39 | RIGHT = 'right' 40 | UP = 'up' 41 | DOWN = 'down' 42 | 43 | # Special keys 44 | RETURN = 'enter' 45 | ENTER = 'enter' 46 | ESCAPE = 'esc' 47 | ESC = 'esc' 48 | TAB = 'tab' 49 | SPACE = 'space' 50 | BACKSPACE = 'backspace' 51 | DELETE = 'del' 52 | 53 | # Modifier keys 54 | ALT = 'alt' 55 | CTRL = 'ctrl' 56 | SHIFT = 'shift' 57 | WIN = 'win' 58 | COMMAND = 'command' 59 | OPTION = 'option' 60 | 61 | # Function keys 62 | F1 = 'f1' 63 | F2 = 'f2' 64 | F3 = 'f3' 65 | F4 = 'f4' 66 | F5 = 'f5' 67 | F6 = 'f6' 68 | F7 = 'f7' 69 | F8 = 'f8' 70 | F9 = 'f9' 71 | F10 = 'f10' 72 | F11 = 'f11' 73 | F12 = 'f12' 74 | 75 | @classmethod 76 | def from_string(cls, key: str) -> 'Key | str': 77 | """Convert a string key name to a Key enum value. 
78 | 79 | Args: 80 | key: String key name to convert 81 | 82 | Returns: 83 | Key enum value if the string matches a known key, 84 | otherwise returns the original string for single character keys 85 | """ 86 | # Map common alternative names to enum values 87 | key_mapping = { 88 | 'page_down': cls.PAGE_DOWN, 89 | 'page down': cls.PAGE_DOWN, 90 | 'pagedown': cls.PAGE_DOWN, 91 | 'page_up': cls.PAGE_UP, 92 | 'page up': cls.PAGE_UP, 93 | 'pageup': cls.PAGE_UP, 94 | 'return': cls.RETURN, 95 | 'enter': cls.ENTER, 96 | 'escape': cls.ESCAPE, 97 | 'esc': cls.ESC, 98 | 'delete': cls.DELETE, 99 | 'del': cls.DELETE, 100 | # Modifier key mappings 101 | 'alt': cls.ALT, 102 | 'ctrl': cls.CTRL, 103 | 'control': cls.CTRL, 104 | 'shift': cls.SHIFT, 105 | 'win': cls.WIN, 106 | 'windows': cls.WIN, 107 | 'super': cls.WIN, 108 | 'command': cls.COMMAND, 109 | 'cmd': cls.COMMAND, 110 | '⌘': cls.COMMAND, 111 | 'option': cls.OPTION, 112 | '⌥': cls.OPTION, 113 | } 114 | 115 | normalized = key.lower().strip() 116 | return key_mapping.get(normalized, key) 117 | 118 | # Combined key type 119 | KeyType = Union[Key, NavigationKey, SpecialKey, ModifierKey, FunctionKey, str] 120 | 121 | # Key type for mouse actions 122 | MouseButton = Literal['left', 'right', 'middle'] 123 | 124 | class AccessibilityWindow(TypedDict): 125 | """Information about a window in the accessibility tree.""" 126 | app_name: str 127 | pid: int 128 | frontmost: bool 129 | has_windows: bool 130 | windows: List[Dict[str, Any]] 131 | 132 | class AccessibilityTree(TypedDict): 133 | """Complete accessibility tree information.""" 134 | success: bool 135 | frontmost_application: str 136 | windows: List[AccessibilityWindow] ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/migration-guide.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Migration Guide 3 | --- 4 | 5 | This guide lists **breaking 
changes** when migrating from the original `ComputerAgent` (v0.3.x) to the rewritten `ComputerAgent` (v0.4.x) and shows old vs new usage for all four agent loops. 6 | 7 | ## Breaking Changes 8 | 9 | - **Initialization:** 10 | - `ComputerAgent` (v0.4.x) uses `model` as a string (e.g. "anthropic/claude-3-5-sonnet-20241022") instead of `LLM` and `AgentLoop` objects. 11 | - `tools` is a list (can include multiple computers and decorated functions). 12 | - `callbacks` are now first-class for extensibility (image retention, budget, trajectory, logging, etc). 13 | - **No explicit `loop` parameter:** 14 | - Loop is inferred from the `model` string (e.g. `anthropic/`, `openai/`, `omniparser+`, `ui-tars`). 15 | - **No explicit `computer` parameter:** 16 | - Computers are added to `tools` list. 17 | 18 | --- 19 | 20 | ## Usage Examples: Old vs New 21 | 22 | ### 1. Anthropic Loop 23 | **Old:** 24 | ```python 25 | async with Computer() as computer: 26 | agent = ComputerAgent( 27 | computer=computer, 28 | loop=AgentLoop.ANTHROPIC, 29 | model=LLM(provider=LLMProvider.ANTHROPIC) 30 | ) 31 | async for result in agent.run("Take a screenshot"): 32 | print(result) 33 | ``` 34 | **New:** 35 | ```python 36 | async with Computer() as computer: 37 | agent = ComputerAgent( 38 | model="anthropic/claude-3-5-sonnet-20241022", 39 | tools=[computer] 40 | ) 41 | messages = [{"role": "user", "content": "Take a screenshot"}] 42 | async for result in agent.run(messages): 43 | for item in result["output"]: 44 | if item["type"] == "message": 45 | print(item["content"][0]["text"]) 46 | ``` 47 | 48 | ### 2. 
OpenAI Loop 49 | **Old:** 50 | ```python 51 | async with Computer() as computer: 52 | agent = ComputerAgent( 53 | computer=computer, 54 | loop=AgentLoop.OPENAI, 55 | model=LLM(provider=LLMProvider.OPENAI) 56 | ) 57 | async for result in agent.run("Take a screenshot"): 58 | print(result) 59 | ``` 60 | **New:** 61 | ```python 62 | async with Computer() as computer: 63 | agent = ComputerAgent( 64 | model="openai/computer-use-preview", 65 | tools=[computer] 66 | ) 67 | messages = [{"role": "user", "content": "Take a screenshot"}] 68 | async for result in agent.run(messages): 69 | for item in result["output"]: 70 | if item["type"] == "message": 71 | print(item["content"][0]["text"]) 72 | ``` 73 | 74 | ### 3. UI-TARS Loop 75 | **Old:** 76 | ```python 77 | async with Computer() as computer: 78 | agent = ComputerAgent( 79 | computer=computer, 80 | loop=AgentLoop.UITARS, 81 | model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://.../v1") 82 | ) 83 | async for result in agent.run("Take a screenshot"): 84 | print(result) 85 | ``` 86 | **New:** 87 | ```python 88 | async with Computer() as computer: 89 | agent = ComputerAgent( 90 | model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", 91 | tools=[computer] 92 | ) 93 | messages = [{"role": "user", "content": "Take a screenshot"}] 94 | async for result in agent.run(messages): 95 | for item in result["output"]: 96 | if item["type"] == "message": 97 | print(item["content"][0]["text"]) 98 | ``` 99 | 100 | ### 4. 
Omni Loop 101 | **Old:** 102 | ```python 103 | async with Computer() as computer: 104 | agent = ComputerAgent( 105 | computer=computer, 106 | loop=AgentLoop.OMNI, 107 | model=LLM(provider=LLMProvider.OLLAMA, name="gemma3") 108 | ) 109 | async for result in agent.run("Take a screenshot"): 110 | print(result) 111 | ``` 112 | **New:** 113 | ```python 114 | async with Computer() as computer: 115 | agent = ComputerAgent( 116 | model="omniparser+ollama_chat/gemma3", 117 | tools=[computer] 118 | ) 119 | messages = [{"role": "user", "content": "Take a screenshot"}] 120 | async for result in agent.run(messages): 121 | for item in result["output"]: 122 | if item["type"] == "message": 123 | print(item["content"][0]["text"]) 124 | ``` 125 | ``` -------------------------------------------------------------------------------- /docs/content/docs/libraries/lume/faq.md: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: FAQ 3 | --- 4 | 5 | ### Where are the VMs stored? 6 | 7 | VMs are stored in `~/.lume` by default. You can configure additional storage locations using the `lume config` command. 8 | 9 | ### How are images cached? 10 | 11 | Images are cached in `~/.lume/cache`. When doing `lume pull <image>`, it will check if the image is already cached. If not, it will download the image and cache it, removing any older versions. 12 | 13 | ### Where is the configuration file stored? 
14 | 15 | Lume follows the XDG Base Directory specification for the configuration file: 16 | 17 | - Configuration is stored in `$XDG_CONFIG_HOME/lume/config.yaml` (defaults to `~/.config/lume/config.yaml`) 18 | 19 | By default, other data is stored in: 20 | - VM data: `~/.lume` 21 | - Cache files: `~/.lume/cache` 22 | 23 | The config file contains settings for: 24 | - VM storage locations and the default location 25 | - Cache directory location 26 | - Whether caching is enabled 27 | 28 | You can view and modify these settings using the `lume config` commands: 29 | 30 | ```bash 31 | # View current configuration 32 | lume config get 33 | 34 | # Manage VM storage locations 35 | lume config storage list # List all VM storage locations 36 | lume config storage add <name> <path> # Add a new VM storage location 37 | lume config storage remove <name> # Remove a VM storage location 38 | lume config storage default <name> # Set the default VM storage location 39 | 40 | # Manage cache settings 41 | lume config cache get # Get current cache directory 42 | lume config cache set <path> # Set cache directory 43 | 44 | # Manage image caching settings 45 | lume config caching get # Show current caching status 46 | lume config caching set <boolean> # Enable or disable image caching 47 | ``` 48 | 49 | ### How do I use multiple VM storage locations? 50 | 51 | Lume supports storing VMs in different locations (e.g., internal drive, external SSD). 
After configuring storage locations, you can specify which location to use with the `--storage` parameter in various commands: 52 | 53 | ```bash 54 | # Create a VM in a specific storage location 55 | lume create my-vm --os macos --ipsw latest --storage ssd 56 | 57 | # Run a VM from a specific storage location 58 | lume run my-vm --storage ssd 59 | 60 | # Delete a VM from a specific storage location 61 | lume delete my-vm --storage ssd 62 | 63 | # Pull an image to a specific storage location 64 | lume pull macos-sequoia-vanilla:latest --name my-vm --storage ssd 65 | 66 | # Clone a VM between storage locations 67 | lume clone source-vm cloned-vm --source-storage default --dest-storage ssd 68 | ``` 69 | 70 | If you don't specify a storage location, Lume will use the default one or search across all configured locations. 71 | 72 | ### Are VM disks taking up all the disk space? 73 | 74 | No, macOS uses sparse files, which only allocate space as needed. For example, VM disks totaling 50 GB may only use 20 GB on disk. 75 | 76 | ### How do I get the latest macOS restore image URL? 77 | 78 | ```bash 79 | lume ipsw 80 | ``` 81 | 82 | ### How do I delete a VM? 83 | 84 | ```bash 85 | lume delete <name> 86 | ``` 87 | 88 | ### How to Install macOS from an IPSW Image 89 | 90 | #### Create a new macOS VM using the latest supported IPSW image: 91 | Run the following command to create a new macOS virtual machine using the latest available IPSW image: 92 | 93 | ```bash 94 | lume create <name> --os macos --ipsw latest 95 | ``` 96 | 97 | #### Create a new macOS VM using a specific IPSW image: 98 | To create a macOS virtual machine from an older or specific IPSW file, first download the desired IPSW (UniversalMac) from a trusted source. 99 | 100 | Then, use the downloaded IPSW path: 101 | 102 | ```bash 103 | lume create <name> --os macos --ipsw <downloaded_ipsw_path> 104 | ``` 105 | 106 | ### How do I install a custom Linux image? 
107 | 
108 | The process for creating a custom Linux image differs from that of macOS, with IPSW restore files not being used. You need to create a Linux VM first, then mount a setup image file to the VM for the first boot. 
109 | 
110 | ```bash
111 | lume create <name> --os linux
112 | 
113 | lume run <name> --mount <path-to-setup-image>
114 | 
115 | lume run <name>
116 | ```
117 | 
```

--------------------------------------------------------------------------------
/scripts/run-docker-dev.sh:
--------------------------------------------------------------------------------

```bash
1 | #!/bin/bash
2 | 
3 | # Colors for output
4 | GREEN='\033[0;32m'
5 | BLUE='\033[0;34m'
6 | RED='\033[0;31m'
7 | NC='\033[0m' # No Color
8 | 
9 | # Print with color
10 | print_info() {
11 |     echo -e "${BLUE}==> $1${NC}"
12 | }
13 | 
14 | print_success() {
15 |     echo -e "${GREEN}==> $1${NC}"
16 | }
17 | 
18 | print_error() {
19 |     echo -e "${RED}==> $1${NC}"
20 | }
21 | 
22 | # Docker image name
23 | IMAGE_NAME="cua-dev-image"
24 | CONTAINER_NAME="cua-dev-container"
25 | PLATFORM="linux/arm64"
26 | 
27 | # Detect platform based on architecture
28 | arch=$(uname -m)
29 | 
30 | if [[ $arch == x86_64* ]]; then
31 |     PLATFORM="linux/amd64"
32 |     print_info "X64 Architecture detected, using platform: ${PLATFORM}"
33 | elif [[ $arch == i*86 ]]; then
34 |     PLATFORM="linux/386"
35 |     print_info "X32 Architecture detected, using platform: ${PLATFORM}"
36 | elif [[ $arch == arm* ]] || [[ $arch == aarch64 ]]; then
37 |     PLATFORM="linux/arm64"
38 |     print_info "ARM Architecture detected, using platform: ${PLATFORM}"
39 | else
40 |     # Fallback to amd64 for unknown architectures
41 |     PLATFORM="linux/amd64"
42 |     print_info "Unknown architecture ($arch), defaulting to platform: ${PLATFORM}"
43 | fi
44 | 
45 | # Environment variables
46 | PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/pylume:/app/libs/python/computer-server:/app/libs/python/mcp-server"
47 | 
48 | 
# Check if Docker is installed 49 | if ! command -v docker &> /dev/null; then 50 | print_error "Docker is not installed. Please install Docker first." 51 | exit 1 52 | fi 53 | 54 | # Command options 55 | case "$1" in 56 | build) 57 | print_info "Building the development Docker image..." 58 | print_info "This will install all dependencies but won't include source code" 59 | docker build -f Dockerfile --platform=${PLATFORM} -t ${IMAGE_NAME} . 60 | print_success "Development Docker image built successfully!" 61 | ;; 62 | 63 | run) 64 | # Check for interactive flag 65 | if [ "$2" == "--interactive" ]; then 66 | print_info "Running the development Docker container with interactive shell..." 67 | print_info "Mounting source code from host" 68 | print_info "Connecting to host.docker.internal:7777" 69 | 70 | docker run -it --rm \ 71 | --platform=${PLATFORM} \ 72 | --name ${CONTAINER_NAME} \ 73 | -v "$(pwd):/app" \ 74 | -e PYTHONPATH=${PYTHONPATH} \ 75 | -e DISPLAY=${DISPLAY:-:0} \ 76 | -e PYLUME_HOST="host.docker.internal" \ 77 | -p 7860:7860 \ 78 | ${IMAGE_NAME} bash 79 | else 80 | # Run the specified example 81 | if [ -z "$2" ]; then 82 | print_error "Please specify an example file, e.g., ./run-docker-dev.sh run computer_examples.py" 83 | exit 1 84 | fi 85 | print_info "Running example: $2" 86 | print_info "Connecting to host.docker.internal:7777" 87 | 88 | docker run -it --rm \ 89 | --platform=${PLATFORM} \ 90 | --name ${CONTAINER_NAME} \ 91 | -v "$(pwd):/app" \ 92 | -e PYTHONPATH=${PYTHONPATH} \ 93 | -e DISPLAY=${DISPLAY:-:0} \ 94 | -e PYLUME_HOST="host.docker.internal" \ 95 | -p 7860:7860 \ 96 | ${IMAGE_NAME} python "/app/examples/$2" 97 | fi 98 | ;; 99 | 100 | stop) 101 | print_info "Stopping any running containers..." 102 | docker stop ${CONTAINER_NAME} 2>/dev/null || true 103 | print_success "Done!" 
104 | ;; 105 | 106 | *) 107 | echo "Usage: $0 {build|run [--interactive] [filename]|stop}" 108 | echo "" 109 | echo "Commands:" 110 | echo " build Build the development Docker image with dependencies" 111 | echo " run [example_filename] Run the specified example file in the container" 112 | echo " run --interactive Run the container with mounted code and get an interactive shell" 113 | echo " stop Stop the container" 114 | exit 1 115 | esac 116 | 117 | exit 0 ``` -------------------------------------------------------------------------------- /libs/lume/src/Commands/Run.swift: -------------------------------------------------------------------------------- ```swift 1 | import ArgumentParser 2 | import Foundation 3 | import Virtualization 4 | 5 | struct Run: AsyncParsableCommand { 6 | static let configuration = CommandConfiguration( 7 | abstract: "Run a virtual machine" 8 | ) 9 | 10 | @Argument( 11 | help: "Name of the virtual machine or image to pull and run (format: name or name:tag)", 12 | completion: .custom(completeVMName)) 13 | var name: String 14 | 15 | @Flag(name: [.short, .long], help: "Do not start the VNC client") 16 | var noDisplay: Bool = false 17 | 18 | @Option( 19 | name: [.customLong("shared-dir")], 20 | help: 21 | "Directory to share with the VM. Can be just a path for read-write access (e.g. ~/src) or path:tag where tag is 'ro' for read-only or 'rw' for read-write (e.g. ~/src:ro)" 22 | ) 23 | var sharedDirectories: [String] = [] 24 | 25 | @Option( 26 | help: 27 | "For Linux VMs only, a read-only disk image to attach to the VM (e.g. --mount=\"ubuntu.iso\")", 28 | completion: .file()) 29 | var mount: String? 30 | 31 | @Option( 32 | name: [.customLong("usb-storage")], 33 | help: "Disk image to attach as a USB mass storage device (e.g. --usb-storage=\"disk.img\")", 34 | completion: .file()) 35 | var usbStorageDevices: [String] = [] 36 | 37 | @Option(help: "Github Container Registry to pull the images from. 
Defaults to ghcr.io") 38 | var registry: String = "ghcr.io" 39 | 40 | @Option(help: "Organization to pull the images from. Defaults to trycua") 41 | var organization: String = "trycua" 42 | 43 | @Option( 44 | name: [.customLong("vnc-port")], 45 | help: "Port to use for the VNC server. Defaults to 0 (auto-assign)") 46 | var vncPort: Int = 0 47 | 48 | @Option(help: "For MacOS VMs only, boot into the VM in recovery mode") 49 | var recoveryMode: Bool = false 50 | 51 | @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") 52 | var storage: String? 53 | 54 | private var parsedSharedDirectories: [SharedDirectory] { 55 | get throws { 56 | try sharedDirectories.map { dirString -> SharedDirectory in 57 | let components = dirString.split(separator: ":", maxSplits: 1) 58 | let hostPath = String(components[0]) 59 | 60 | // If no tag is provided, default to read-write 61 | if components.count == 1 { 62 | return SharedDirectory( 63 | hostPath: hostPath, 64 | tag: VZVirtioFileSystemDeviceConfiguration.macOSGuestAutomountTag, 65 | readOnly: false 66 | ) 67 | } 68 | 69 | // Parse the tag if provided 70 | let tag = String(components[1]) 71 | let readOnly: Bool 72 | switch tag.lowercased() { 73 | case "ro": 74 | readOnly = true 75 | case "rw": 76 | readOnly = false 77 | default: 78 | throw ValidationError( 79 | "Invalid tag value. 
Must be either 'ro' for read-only or 'rw' for read-write" 80 | ) 81 | } 82 | 83 | return SharedDirectory( 84 | hostPath: hostPath, 85 | tag: VZVirtioFileSystemDeviceConfiguration.macOSGuestAutomountTag, 86 | readOnly: readOnly 87 | ) 88 | } 89 | } 90 | } 91 | 92 | private var parsedUSBStorageDevices: [Path] { 93 | usbStorageDevices.map { Path($0) } 94 | } 95 | 96 | init() { 97 | } 98 | 99 | @MainActor 100 | func run() async throws { 101 | try await LumeController().runVM( 102 | name: name, 103 | noDisplay: noDisplay, 104 | sharedDirectories: parsedSharedDirectories, 105 | mount: mount.map { Path($0) }, 106 | registry: registry, 107 | organization: organization, 108 | vncPort: vncPort, 109 | recoveryMode: recoveryMode, 110 | storage: storage, 111 | usbMassStoragePaths: parsedUSBStorageDevices.isEmpty ? nil : parsedUSBStorageDevices 112 | ) 113 | } 114 | } 115 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/models/opencua.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List, Dict, Any 2 | import re 3 | import base64 4 | from io import BytesIO 5 | 6 | try: 7 | import torch # type: ignore 8 | from transformers import AutoTokenizer, AutoModel, AutoImageProcessor # type: ignore 9 | from PIL import Image # type: ignore 10 | import blobfile as _ # assert blobfile is installed 11 | OPENCUA_AVAILABLE = True 12 | except Exception: 13 | OPENCUA_AVAILABLE = False 14 | 15 | 16 | class OpenCUAModel: 17 | """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor.""" 18 | 19 | def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: 20 | if not OPENCUA_AVAILABLE: 21 | raise ImportError( 22 | "OpenCUA requirements not found. 
Install with: pip install \"cua-agent[opencua-hf]\"" 23 | ) 24 | self.model_name = model_name 25 | self.device = device 26 | self.model = None 27 | self.tokenizer = None 28 | self.image_processor = None 29 | self.trust_remote_code = trust_remote_code 30 | self._load() 31 | 32 | def _load(self) -> None: 33 | self.tokenizer = AutoTokenizer.from_pretrained( 34 | self.model_name, trust_remote_code=self.trust_remote_code 35 | ) 36 | self.model = AutoModel.from_pretrained( 37 | self.model_name, 38 | torch_dtype="auto", 39 | device_map=self.device, 40 | trust_remote_code=self.trust_remote_code, 41 | attn_implementation="sdpa", 42 | ) 43 | self.image_processor = AutoImageProcessor.from_pretrained( 44 | self.model_name, trust_remote_code=self.trust_remote_code 45 | ) 46 | 47 | @staticmethod 48 | def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str: 49 | # Expect HF-format messages with content items type: "image" with data URL 50 | for msg in reversed(messages): 51 | for item in reversed(msg.get("content", [])): 52 | if isinstance(item, dict) and item.get("type") == "image": 53 | url = item.get("image", "") 54 | if isinstance(url, str) and url.startswith("data:image/"): 55 | return url.split(",", 1)[1] 56 | return "" 57 | 58 | def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str: 59 | assert self.model is not None and self.tokenizer is not None and self.image_processor is not None 60 | 61 | # Tokenize text side using chat template 62 | input_ids = self.tokenizer.apply_chat_template( 63 | messages, tokenize=True, add_generation_prompt=True 64 | ) 65 | input_ids = torch.tensor([input_ids]).to(self.model.device) 66 | 67 | # Prepare image inputs from last data URL image 68 | image_b64 = self._extract_last_image_b64(messages) 69 | pixel_values = None 70 | grid_thws = None 71 | if image_b64: 72 | image = Image.open(BytesIO(base64.b64decode(image_b64))).convert("RGB") 73 | image_info = self.image_processor.preprocess(images=[image]) 74 
| pixel_values = torch.tensor(image_info["pixel_values"]).to( 75 | dtype=torch.bfloat16, device=self.model.device 76 | ) 77 | grid_thws = torch.tensor(image_info["image_grid_thw"]) if "image_grid_thw" in image_info else None 78 | 79 | gen_kwargs: Dict[str, Any] = { 80 | "max_new_tokens": max_new_tokens, 81 | "temperature": 0, 82 | } 83 | if pixel_values is not None: 84 | gen_kwargs["pixel_values"] = pixel_values 85 | if grid_thws is not None: 86 | gen_kwargs["grid_thws"] = grid_thws 87 | 88 | with torch.no_grad(): 89 | generated_ids = self.model.generate( 90 | input_ids, 91 | **gen_kwargs, 92 | ) 93 | 94 | # Remove prompt tokens 95 | prompt_len = input_ids.shape[1] 96 | generated_ids = generated_ids[:, prompt_len:] 97 | output_text = self.tokenizer.batch_decode( 98 | generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False 99 | )[0] 100 | return output_text 101 | ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/custom-computer-handlers.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Custom Computers 3 | slug: custom-computer-handlers 4 | --- 5 | 6 | The Agent SDK supports defining custom computer handlers using a simple dictionary interface. This enables integration with custom automation backends, testing frameworks, or specialized computer control systems. 
7 | 8 | ## Example: Defining a Custom Computer Handler 9 | 10 | ```python 11 | import asyncio 12 | from PIL import Image 13 | 14 | # Define your custom computer functions 15 | async def take_screenshot(): 16 | """Your custom screenshot implementation""" 17 | # Return PIL Image, bytes, or base64 string 18 | return Image.new('RGB', (1920, 1080), color='white') 19 | 20 | # Create dict-based computer handler - only 'screenshot' is required 21 | custom_computer = { 22 | 'screenshot': take_screenshot, # required 23 | 24 | # everything below is optional 25 | 'environment': 'linux', # linux, mac, windows, browser 26 | 'dimensions': (1920, 1080), # (width, height) 27 | 'click': lambda x, y, button: print(f"Clicking at ({x}, {y}) with {button} button"), 28 | } 29 | ``` 30 | 31 | You can then use this as a tool for your agent: 32 | 33 | ```python 34 | from agent import ComputerAgent 35 | 36 | agent = ComputerAgent( 37 | model="anthropic/claude-3-5-sonnet-20241022", 38 | tools=[custom_computer], 39 | ) 40 | 41 | # Agent will automatically convert dict to agent.computers.CustomComputerHandler 42 | await agent.run("Take a screenshot and click at coordinates 100, 200") 43 | ``` 44 | 45 | ## Class-Based Implementation 46 | 47 | For more complex implementations, you can create a custom class by inheriting from `AsyncComputerHandler`: 48 | 49 | ```python 50 | from agent.computers import AsyncComputerHandler 51 | from PIL import Image 52 | from typing import Literal, List, Dict, Union, Optional 53 | 54 | class MyCustomComputer(AsyncComputerHandler): 55 | """Custom computer handler implementation.""" 56 | 57 | def __init__(self): 58 | # Initialize your custom computer interface here 59 | pass 60 | 61 | # ==== Computer-Use-Preview Action Space ==== 62 | 63 | async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: 64 | """Get the current environment type.""" 65 | ... 
66 | 67 | async def get_dimensions(self) -> tuple[int, int]: 68 | """Get screen dimensions as (width, height).""" 69 | ... 70 | 71 | async def screenshot(self) -> str: 72 | """Take a screenshot and return as base64 string.""" 73 | ... 74 | 75 | async def click(self, x: int, y: int, button: str = "left") -> None: 76 | """Click at coordinates with specified button.""" 77 | ... 78 | 79 | async def double_click(self, x: int, y: int) -> None: 80 | """Double click at coordinates.""" 81 | ... 82 | 83 | async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: 84 | """Scroll at coordinates with specified scroll amounts.""" 85 | ... 86 | 87 | async def type(self, text: str) -> None: 88 | """Type text.""" 89 | ... 90 | 91 | async def wait(self, ms: int = 1000) -> None: 92 | """Wait for specified milliseconds.""" 93 | ... 94 | 95 | async def move(self, x: int, y: int) -> None: 96 | """Move cursor to coordinates.""" 97 | ... 98 | 99 | async def keypress(self, keys: Union[List[str], str]) -> None: 100 | """Press key combination.""" 101 | ... 102 | 103 | async def drag(self, path: List[Dict[str, int]]) -> None: 104 | """Drag along specified path.""" 105 | ... 106 | 107 | async def get_current_url(self) -> str: 108 | """Get current URL (for browser environments).""" 109 | ... 110 | 111 | # ==== Anthropic Action Space ==== 112 | 113 | async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: 114 | """Left mouse down at coordinates.""" 115 | ... 116 | 117 | async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: 118 | """Left mouse up at coordinates.""" 119 | ... 
120 | 121 | # Use with agent 122 | custom_computer = MyCustomComputer() 123 | 124 | agent = ComputerAgent( 125 | model="anthropic/claude-3-5-sonnet-20241022", 126 | tools=[custom_computer], 127 | ) 128 | 129 | await agent.run("Take a screenshot and click at coordinates 100, 200") 130 | ``` ``` -------------------------------------------------------------------------------- /libs/python/som/som/models.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List, Tuple, Optional, Literal, Dict, Any, Union 2 | from pydantic import BaseModel, Field, validator 3 | 4 | 5 | class BoundingBox(BaseModel): 6 | """Normalized bounding box coordinates.""" 7 | 8 | x1: float = Field(..., description="Normalized left coordinate") 9 | y1: float = Field(..., description="Normalized top coordinate") 10 | x2: float = Field(..., description="Normalized right coordinate") 11 | y2: float = Field(..., description="Normalized bottom coordinate") 12 | 13 | @property 14 | def coordinates(self) -> List[float]: 15 | """Get coordinates as a list [x1, y1, x2, y2].""" 16 | return [self.x1, self.y1, self.x2, self.y2] 17 | 18 | 19 | class UIElement(BaseModel): 20 | """Base class for UI elements.""" 21 | 22 | id: Optional[int] = Field(None, description="Unique identifier for the element (1-indexed)") 23 | type: Literal["icon", "text"] 24 | bbox: BoundingBox 25 | interactivity: bool = Field(default=False, description="Whether the element is interactive") 26 | confidence: float = Field(default=1.0, description="Detection confidence score") 27 | 28 | 29 | class IconElement(UIElement): 30 | """An interactive icon element.""" 31 | 32 | type: Literal["icon"] = "icon" 33 | interactivity: bool = True 34 | scale: Optional[int] = Field(None, description="Detection scale used") 35 | 36 | 37 | class TextElement(UIElement): 38 | """A text element.""" 39 | 40 | type: Literal["text"] = "text" 41 | content: str = Field(..., description="The text 
content") 42 | interactivity: bool = False 43 | 44 | 45 | class ImageData(BaseModel): 46 | """Image data with dimensions.""" 47 | 48 | base64: str = Field(..., description="Base64 encoded image data") 49 | width: int = Field(..., description="Image width in pixels") 50 | height: int = Field(..., description="Image height in pixels") 51 | 52 | @validator("width", "height") 53 | def dimensions_must_be_positive(cls, v): 54 | if v <= 0: 55 | raise ValueError("Dimensions must be positive") 56 | return v 57 | 58 | 59 | class ParserMetadata(BaseModel): 60 | """Metadata about the parsing process.""" 61 | 62 | image_size: Tuple[int, int] = Field( 63 | ..., description="Original image dimensions (width, height)" 64 | ) 65 | num_icons: int = Field(..., description="Number of icons detected") 66 | num_text: int = Field(..., description="Number of text elements detected") 67 | device: str = Field(..., description="Device used for detection (cpu/cuda/mps)") 68 | ocr_enabled: bool = Field(..., description="Whether OCR was enabled") 69 | latency: float = Field(..., description="Total processing time in seconds") 70 | 71 | @property 72 | def width(self) -> int: 73 | """Get image width from image_size.""" 74 | return self.image_size[0] 75 | 76 | @property 77 | def height(self) -> int: 78 | """Get image height from image_size.""" 79 | return self.image_size[1] 80 | 81 | 82 | class ParseResult(BaseModel): 83 | """Result of parsing a UI screenshot.""" 84 | 85 | elements: List[UIElement] = Field(..., description="Detected UI elements") 86 | annotated_image_base64: str = Field(..., description="Base64 encoded annotated image") 87 | metadata: ParserMetadata = Field(..., description="Processing metadata") 88 | screen_info: Optional[List[str]] = Field( 89 | None, description="Human-readable descriptions of elements" 90 | ) 91 | parsed_content_list: Optional[List[Dict[str, Any]]] = Field( 92 | None, description="Parsed elements as dictionaries" 93 | ) 94 | 95 | @property 96 | def image(self) 
-> ImageData: 97 | """Get image data as a convenience property.""" 98 | return ImageData( 99 | base64=self.annotated_image_base64, 100 | width=self.metadata.width, 101 | height=self.metadata.height, 102 | ) 103 | 104 | @property 105 | def width(self) -> int: 106 | """Get image width from metadata.""" 107 | return self.metadata.width 108 | 109 | @property 110 | def height(self) -> int: 111 | """Get image height from metadata.""" 112 | return self.metadata.height 113 | 114 | def model_dump(self) -> Dict[str, Any]: 115 | """Convert model to dict for compatibility with older code.""" 116 | result = super().model_dump() 117 | # Add image data dict for backward compatibility 118 | result["image"] = self.image.model_dump() 119 | return result 120 | ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/customizing-computeragent.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Customizing Your ComputerAgent 3 | --- 4 | 5 | <Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.</Callout> 6 | 7 | The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems. 8 | 9 | This guide shows four proven ways to increase capabilities and success rate: 10 | 11 | - 1 — Simple: Prompt engineering 12 | - 2 — Easy: Tools 13 | - 3 — Intermediate: Callbacks 14 | - 4 — Expert: Custom `@register_agent` 15 | 16 | ## 1) Simple: Prompt engineering 17 | 18 | Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None` which acts like a system-style preface. Internally, this uses a callback that pre-pends a user message before each LLM call. 
19 | 20 | ```python 21 | from agent.agent import ComputerAgent 22 | 23 | agent = ComputerAgent( 24 | model="openai/computer-use-preview", 25 | tools=[computer], 26 | instructions=( 27 | "You are a meticulous software operator. Prefer safe, deterministic actions. " 28 | "Always confirm via on-screen text before proceeding." 29 | ), 30 | ) 31 | ``` 32 | 33 | ## 2) Easy: Tools 34 | 35 | Expose deterministic capabilities as tools (Python functions or custom computer handlers). The agent will call them when appropriate. 36 | 37 | ```python 38 | def calculate_percentage(numerator: float, denominator: float) -> str: 39 | """Calculate percentage as a string. 40 | 41 | Args: 42 | numerator: Numerator value 43 | denominator: Denominator value 44 | Returns: 45 | A formatted percentage string (e.g., '75.00%'). 46 | """ 47 | if denominator == 0: 48 | return "0.00%" 49 | return f"{(numerator/denominator)*100:.2f}%" 50 | 51 | agent = ComputerAgent( 52 | model="openai/computer-use-preview", 53 | tools=[computer, calculate_percentage], 54 | ) 55 | ``` 56 | 57 | - See `docs/agent-sdk/custom-tools` for authoring function tools. 58 | - See `docs/agent-sdk/custom-computer-handlers` for building full computer interfaces. 59 | 60 | ## 3) Intermediate: Callbacks 61 | 62 | Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, record trajectories, manage costs, and more. 63 | 64 | ```python 65 | from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback 66 | 67 | agent = ComputerAgent( 68 | model="anthropic/claude-3-5-sonnet-20241022", 69 | tools=[computer], 70 | callbacks=[ 71 | ImageRetentionCallback(only_n_most_recent_images=3), 72 | TrajectorySaverCallback("./trajectories"), 73 | BudgetManagerCallback(max_budget=10.0, raise_error=True), 74 | ], 75 | ) 76 | ``` 77 | 78 | - Browse implementations in `libs/python/agent/agent/loops/`. 
79 | 80 | ## 4) Expert: Custom `@register_agent` 81 | 82 | Build your own agent configuration class to control prompting, message shaping, and tool handling. This is the most flexible option for specialized domains. 83 | 84 | - Register your own `model=...` loop using `@register_agent` 85 | - Browse implementations in `libs/python/agent/agent/loops/`. 86 | - Implement `predict_step()` (and optionally `predict_click()`) and return the standardized output schema. 87 | 88 | ```python 89 | from agent.decorators import register_agent 90 | 91 | @register_agent(models=r".*my-special-model.*", priority=10) 92 | class MyCustomAgentConfig: 93 | async def predict_step(self, messages, model, tools, **kwargs): 94 | # 1) Format messages for your provider 95 | # 2) Call provider 96 | # 3) Convert responses to the agent output schema 97 | return {"output": [], "usage": {}} 98 | 99 | async def predict_click(self, model, image_b64, instruction): 100 | # Optional: click-only capability 101 | return None 102 | 103 | def get_capabilities(self): 104 | return ["step"] 105 | ``` 106 | 107 | ## HUD integration (optional) 108 | 109 | When using the HUD evaluation integration (`agent/integrations/hud/`), you can pass `instructions`, `tools`, and `callbacks` directly 110 | 111 | ```python 112 | from agent.integrations.hud import run_single_task 113 | 114 | await run_single_task( 115 | dataset="username/dataset-name", 116 | model="openai/computer-use-preview", 117 | instructions="Operate carefully. 
Always verify on-screen text before actions.", 118 | # tools=[your_custom_function], 119 | # callbacks=[YourCustomCallback()], 120 | ) 121 | ``` ``` -------------------------------------------------------------------------------- /libs/python/pylume/pylume/client.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import asyncio 3 | import subprocess 4 | from typing import Optional, Any, Dict 5 | import shlex 6 | 7 | from .exceptions import ( 8 | LumeError, 9 | LumeServerError, 10 | LumeConnectionError, 11 | LumeTimeoutError, 12 | LumeNotFoundError, 13 | LumeConfigError, 14 | ) 15 | 16 | class LumeClient: 17 | def __init__(self, base_url: str, timeout: float = 60.0, debug: bool = False): 18 | self.base_url = base_url 19 | self.timeout = timeout 20 | self.debug = debug 21 | 22 | def _log_debug(self, message: str, **kwargs) -> None: 23 | """Log debug information if debug mode is enabled.""" 24 | if self.debug: 25 | print(f"DEBUG: {message}") 26 | if kwargs: 27 | print(json.dumps(kwargs, indent=2)) 28 | 29 | async def _run_curl(self, method: str, path: str, data: Optional[Dict[str, Any]] = None, params: Optional[Dict[str, Any]] = None) -> Any: 30 | """Execute a curl command and return the response.""" 31 | url = f"{self.base_url}{path}" 32 | if params: 33 | param_str = "&".join(f"{k}={v}" for k, v in params.items()) 34 | url = f"{url}?{param_str}" 35 | 36 | cmd = ["curl", "-X", method, "-s", "-w", "%{http_code}", "-m", str(self.timeout)] 37 | 38 | if data is not None: 39 | cmd.extend(["-H", "Content-Type: application/json", "-d", json.dumps(data)]) 40 | 41 | cmd.append(url) 42 | 43 | self._log_debug(f"Running curl command: {' '.join(map(shlex.quote, cmd))}") 44 | 45 | try: 46 | process = await asyncio.create_subprocess_exec( 47 | *cmd, 48 | stdout=subprocess.PIPE, 49 | stderr=subprocess.PIPE 50 | ) 51 | stdout, stderr = await process.communicate() 52 | 53 | if process.returncode != 0: 54 | raise 
LumeConnectionError(f"Curl command failed: {stderr.decode()}") 55 | 56 | # The last 3 characters are the status code 57 | response = stdout.decode() 58 | status_code = int(response[-3:]) 59 | response_body = response[:-3] # Remove status code from response 60 | 61 | if status_code >= 400: 62 | if status_code == 404: 63 | raise LumeNotFoundError(f"Resource not found: {path}") 64 | elif status_code == 400: 65 | raise LumeConfigError(f"Invalid request: {response_body}") 66 | elif status_code >= 500: 67 | raise LumeServerError(f"Server error: {response_body}") 68 | else: 69 | raise LumeError(f"Request failed with status {status_code}: {response_body}") 70 | 71 | return json.loads(response_body) if response_body.strip() else None 72 | 73 | except asyncio.TimeoutError: 74 | raise LumeTimeoutError(f"Request timed out after {self.timeout} seconds") 75 | 76 | async def get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Any: 77 | """Make a GET request.""" 78 | return await self._run_curl("GET", path, params=params) 79 | 80 | async def post(self, path: str, data: Optional[Dict[str, Any]] = None, timeout: Optional[float] = None) -> Any: 81 | """Make a POST request.""" 82 | old_timeout = self.timeout 83 | if timeout is not None: 84 | self.timeout = timeout 85 | try: 86 | return await self._run_curl("POST", path, data=data) 87 | finally: 88 | self.timeout = old_timeout 89 | 90 | async def patch(self, path: str, data: Dict[str, Any]) -> None: 91 | """Make a PATCH request.""" 92 | await self._run_curl("PATCH", path, data=data) 93 | 94 | async def delete(self, path: str) -> None: 95 | """Make a DELETE request.""" 96 | await self._run_curl("DELETE", path) 97 | 98 | def print_curl(self, method: str, path: str, data: Optional[Dict[str, Any]] = None) -> None: 99 | """Print equivalent curl command for debugging.""" 100 | curl_cmd = f"""curl -X {method} \\ 101 | '{self.base_url}{path}'""" 102 | 103 | if data: 104 | curl_cmd += f" \\\n -H 'Content-Type: application/json' 
\\\n -d '{json.dumps(data)}'" 105 | 106 | print("\nEquivalent curl command:") 107 | print(curl_cmd) 108 | print() 109 | 110 | async def close(self) -> None: 111 | """Close the client resources.""" 112 | pass # No shared resources to clean up ``` -------------------------------------------------------------------------------- /docs/src/components/iou.tsx: -------------------------------------------------------------------------------- ```typescript 1 | 'use client'; 2 | import React, { useRef, useEffect, useState, useCallback } from 'react'; 3 | 4 | /** 5 | * Represents a rectangle with position, dimensions, styling, and identification 6 | */ 7 | interface Rectangle { 8 | /** The x-coordinate of the rectangle's left edge */ 9 | left: number; 10 | /** The y-coordinate of the rectangle's top edge */ 11 | top: number; 12 | /** The width of the rectangle */ 13 | width: number; 14 | /** The height of the rectangle */ 15 | height: number; 16 | /** The fill color of the rectangle */ 17 | fill: string; 18 | /** The display name of the rectangle */ 19 | name: string; 20 | } 21 | 22 | /** 23 | * Props for the IOU component 24 | */ 25 | interface IOUProps { 26 | /** The title to display above the visualization */ 27 | title: string; 28 | /** The description text to display below the IOU value */ 29 | description: string; 30 | /** The first rectangle for IOU calculation */ 31 | rect1: Rectangle; 32 | /** The second rectangle for IOU calculation */ 33 | rect2: Rectangle; 34 | } 35 | 36 | /** 37 | * A React component that visualizes and calculates the Intersection over Union (IOU) 38 | * of two rectangles on a canvas 39 | * @param props - The component props 40 | * @returns The rendered IOU visualization component 41 | */ 42 | export default function IOU({ title, description, rect1, rect2 }: IOUProps) { 43 | const canvasRef = useRef<HTMLCanvasElement>(null); 44 | const [actualIOU, setActualIOU] = useState<number>(0); 45 | 46 | /** 47 | * Converts a rectangle to a bounding box 
with left, right, top, and bottom coordinates 48 | * @param rect - The rectangle to convert 49 | * @returns An object containing the bounding box coordinates 50 | */ 51 | const getBbox = (rect: Rectangle) => ({ 52 | left: rect.left, 53 | right: rect.left + rect.width, 54 | top: rect.top, 55 | bottom: rect.top + rect.height, 56 | }); 57 | 58 | /** 59 | * Calculates the intersection area between two bounding boxes 60 | * @param bbox1 - The first bounding box 61 | * @param bbox2 - The second bounding box 62 | * @returns The area of intersection between the two bounding boxes 63 | */ 64 | const calcIntersection = (bbox1: any, bbox2: any): number => { 65 | const x1 = Math.max(bbox1.left, bbox2.left); 66 | const x2 = Math.min(bbox1.right, bbox2.right); 67 | const y1 = Math.max(bbox1.top, bbox2.top); 68 | const y2 = Math.min(bbox1.bottom, bbox2.bottom); 69 | 70 | // Check if there's actually an overlap 71 | if (x2 <= x1 || y2 <= y1) { 72 | return 0; 73 | } 74 | 75 | const intersection = (x2 - x1) * (y2 - y1); 76 | return intersection; 77 | }; 78 | 79 | /** 80 | * Calculates the area of a rectangle 81 | * @param rect - The rectangle to calculate area for 82 | * @returns The area of the rectangle 83 | */ 84 | const calcArea = (rect: Rectangle): number => { 85 | return rect.width * rect.height; 86 | }; 87 | 88 | /** 89 | * Draws the rectangles on the canvas and calculates the IOU value 90 | */ 91 | const drawCanvas = useCallback(() => { 92 | const canvas = canvasRef.current; 93 | if (!canvas) return; 94 | 95 | const ctx = canvas.getContext('2d'); 96 | if (!ctx) return; 97 | 98 | // Clear canvas 99 | ctx.clearRect(0, 0, canvas.width, canvas.height); 100 | 101 | // Calculate IOU 102 | const bbox1 = getBbox(rect1); 103 | const bbox2 = getBbox(rect2); 104 | const intersection = calcIntersection(bbox1, bbox2); 105 | const union = calcArea(rect1) + calcArea(rect2) - intersection; 106 | const iou = intersection / union; 107 | setActualIOU(iou); 108 | 109 | // Draw rectangles 110 | 
[rect1, rect2].forEach((rect) => { 111 | ctx.fillStyle = rect.fill; 112 | ctx.fillRect(rect.left, rect.top, rect.width, rect.height); 113 | 114 | ctx.strokeStyle = '#000'; 115 | ctx.lineWidth = 2; 116 | ctx.strokeRect(rect.left, rect.top, rect.width, rect.height); 117 | 118 | ctx.fillStyle = '#000'; 119 | ctx.font = '12px'; 120 | ctx.fillText(rect.name, rect.left + 5, rect.top + 15); 121 | }); 122 | }, [rect1, rect2]); 123 | 124 | useEffect(() => { 125 | drawCanvas(); 126 | }, [drawCanvas]); 127 | 128 | return ( 129 | <div className=""> 130 | <h3 className="text-sm font-semibold ">{title}</h3> 131 | <div className="flex items-start gap-6"> 132 | <div> 133 | <canvas 134 | ref={canvasRef} 135 | width={200} 136 | height={150} 137 | className="border bg-white rounded-md" 138 | /> 139 | <div className="mt-2 text-sm"> 140 | <div className="font-mono mb-2">IOU = {actualIOU.toFixed(3)}</div> 141 | <span className="">{description}</span> 142 | </div> 143 | </div> 144 | </div> 145 | </div> 146 | ); 147 | } 148 | ``` -------------------------------------------------------------------------------- /blog/cua-hackathon.md: -------------------------------------------------------------------------------- ```markdown 1 | # Computer-Use Agents SOTA Challenge: Hack the North + Global Online 2 | 3 | *Published on August 25, 2025 by Francesco Bonacci* 4 | 5 | We’re bringing something new to [Hack the North](https://hackthenorth.com), Canada’s largest hackathon, this year: a head-to-head competition for **Computer-Use Agents** - on-site at Waterloo and a **Global online challenge**. From September 12–14, 2025, teams build on the **Cua Agent Framework** and are scored in **HUD’s OSWorld-Verified** environment to push past today’s SOTA on [OS-World](https://os-world.github.io). 6 | 7 | <img src="./assets/hack-the-north.png"> 8 | 9 | ## Track A: On-site @ Hack the North 10 | 11 | There’s one global leaderboard: **Cua - Best State-of-the-Art Computer-Use Agent**. 
Use any model setup you like (cloud or local). After projects are submitted, [HUD](https://www.hud.so) runs the official benchmark; the top team earns a **guaranteed YC partner interview (W26 batch)**. We’ll also feature winners on our blog and socials and kit the team out with swag. 12 | 13 | ## Track B: Cua Global Online Hackathon 14 | 15 | **Cua** and [**Ollama**](https://ollama.com) organize a global hackathon to find the **most creative uses of local and hybrid computer-use agents**. There are no geographic restrictions on who can join — this is a worldwide competition focused on **originality, impact, and inventive applications** that showcase what's possible with local and hybrid inference. 16 | 17 | **Prizes:** 18 | - 1st **MacBook Air M4 (or equivalent value)** + features in Cua & Ollama channels 19 | - 2nd **$500 CAD + swag** 20 | - 3rd **swag + public feature** 21 | 22 | --- 23 | 24 | ## How it works 25 | 26 | Two different tracks, two different processes: 27 | 28 | ### On-site (Track A) 29 | Build during the weekend and submit a repo with a one-line start command. **HUD** executes your command in a clean environment and runs **OSWorld-Verified**. Scores come from official benchmark results; ties break by median, then wall-clock time, then earliest submission. Any model setup is allowed (cloud or local). 30 | 31 | **HUD** runs official evaluations immediately after submission. Winners are announced at the **closing ceremony**. 32 | 33 | ### Rules 34 | - Fork and star the [Cua repo](https://github.com/trycua/cua). 35 | - Add your agent and instructions in `samples/community/hack-the-north/<YOUR_TEAM_NAME>`. 36 | - Include a README with details on the approach and any required notes. 37 | - Submit a PR. 38 | 39 | **Deadline: Sept 15, 8:00 AM EDT** 40 | 41 | ### Global Online (Track B) 42 | Open to anyone, anywhere. Build on your own timeline and submit through the **Cua Discord form** by the deadline. 
43 | 44 | **Project Requirements:** 45 | - Your agent must integrate **Cua and Ollama** in some way 46 | - Your agent must be **easily runnable by judges** 47 | 48 | Judged by **Cua** and **Ollama** teams on: 49 | - **Creativity (30%)** – originality, usefulness, surprise factor 50 | - **Technical Depth (30%)** – quality of engineering and agent design 51 | - **Use of Ollama (30%)** – effective integration of local/hybrid inference 52 | - **Polish (10%)** – presentation, clarity, demo readiness 53 | 54 | ### Submission Process 55 | Submissions will be collected via a **form link provided in the Cua Discord**. Your submission must contain: 56 | 57 | - **GitHub repo** containing the agent source code and a clear README with instructions on how to use the agent 58 | - **Explanation** of the models and tools used, and what's local or hybrid about your design 59 | - **Short demo video** (up to two minutes) 60 | 61 | A **commit freeze** will be used to ensure that no changes are made after the deadline. Winners will be announced after judging is complete. 62 | 63 | **Deadline: Sept 28, 11:59 PM UTC (extended due to popular demand!)** 64 | 65 | --- 66 | 67 | ## Join us 68 | 69 | Bring a team, pick a model stack, and push what agents can do on real computers. We can’t wait to see what you build at **Hack the North 2025**. 70 | 71 | **Discord channels** 72 | - Join the Discord first: https://discord.gg/cua-ai 73 | - **#hack-the-north (on-site):** https://discord.com/channels/1328377437301641247/1409508526774157342 74 | - **#global-online (Ollama × Cua):** https://discord.com/channels/1328377437301641247/1409518100491145226 75 | 76 | **Contact** 77 | Questions on Hack the North? Email **[email protected]**. 78 | 79 | *P.S. 
If you’re planning ahead, start with the Cua Agent Framework and OSWorld-Verified docs at docs.trycua.com; we’ll share office-hour times in both Discord channels.*
```
--------------------------------------------------------------------------------
/libs/python/computer/computer/providers/base.py:
--------------------------------------------------------------------------------
```python
1 | """Base provider interface for VM backends."""
2 | 
3 | import abc
4 | from enum import StrEnum  # StrEnum requires Python 3.11+
5 | from typing import Dict, Optional, Any, AsyncContextManager
6 | 
7 | from .types import ListVMsResponse
8 | 
9 | 
10 | class VMProviderType(StrEnum):  # str-valued enum: members compare equal to their plain strings ("lume", "cloud", ...)
11 |     """Enum of supported VM provider types."""
12 |     LUME = "lume"
13 |     LUMIER = "lumier"
14 |     CLOUD = "cloud"
15 |     WINSANDBOX = "winsandbox"
16 |     DOCKER = "docker"
17 |     UNKNOWN = "unknown"
18 | 
19 | 
20 | class BaseVMProvider(AsyncContextManager):  # usable with "async with"; concrete providers supply the enter/exit lifecycle
21 |     """Base interface for VM providers.
22 | 
23 |     All VM provider implementations must implement this interface.
24 |     """
25 | 
26 |     @property
27 |     @abc.abstractmethod
28 |     def provider_type(self) -> VMProviderType:
29 |         """Get the provider type."""
30 |         pass
31 | 
32 |     @abc.abstractmethod
33 |     async def get_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]:
34 |         """Get VM information by name.
35 | 
36 |         Args:
37 |             name: Name of the VM to get information for
38 |             storage: Optional storage path override. If provided, this will be used
39 |                 instead of the provider's default storage path.
40 | 
41 |         Returns:
42 |             Dictionary with VM information including status, IP address, etc.
43 |         """
44 |         pass
45 | 
46 |     @abc.abstractmethod
47 |     async def list_vms(self) -> ListVMsResponse:
48 |         """List all available VMs.
49 | 
50 |         Returns:
51 |             ListVMsResponse: A list of minimal VM objects as defined in
52 |                 `computer.providers.types.MinimalVM`.
53 |         """
54 |         pass
55 | 
56 |     @abc.abstractmethod
57 |     async def run_vm(self, image: str, name: str, run_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]:
58 |         """Run a VM by name with the given options.
59 | 
60 |         Args:
61 |             image: Name/tag of the image to use
62 |             name: Name of the VM to run
63 |             run_opts: Dictionary of run options (memory, cpu, etc.)
64 |             storage: Optional storage path override. If provided, this will be used
65 |                 instead of the provider's default storage path.
66 | 
67 |         Returns:
68 |             Dictionary with VM run status and information
69 |         """
70 |         pass
71 | 
72 |     @abc.abstractmethod
73 |     async def stop_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]:
74 |         """Stop a VM by name.
75 | 
76 |         Args:
77 |             name: Name of the VM to stop
78 |             storage: Optional storage path override. If provided, this will be used
79 |                 instead of the provider's default storage path.
80 | 
81 |         Returns:
82 |             Dictionary with VM stop status and information
83 |         """
84 |         pass
85 | 
86 |     @abc.abstractmethod
87 |     async def restart_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]:
88 |         """Restart a VM by name.
89 | 
90 |         Args:
91 |             name: Name of the VM to restart
92 |             storage: Optional storage path override. If provided, this will be used
93 |                 instead of the provider's default storage path.
94 | 
95 |         Returns:
96 |             Dictionary with VM restart status and information
97 |         """
98 |         pass
99 | 
100 |     @abc.abstractmethod
101 |     async def update_vm(self, name: str, update_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]:
102 |         """Update VM configuration.
103 | 
104 |         Args:
105 |             name: Name of the VM to update
106 |             update_opts: Dictionary of update options (memory, cpu, etc.)
107 |             storage: Optional storage path override. If provided, this will be used
108 |                 instead of the provider's default storage path.
109 | 
110 |         Returns:
111 |             Dictionary with VM update status and information
112 |         """
113 |         pass
114 | 
115 |     @abc.abstractmethod
116 |     async def get_ip(self, name: str, storage: Optional[str] = None, retry_delay: int = 2) -> str:
117 |         """Get the IP address of a VM, waiting indefinitely until it's available.
118 | 
119 |         Args:
120 |             name: Name of the VM to get the IP for
121 |             storage: Optional storage path override. If provided, this will be used
122 |                 instead of the provider's default storage path.
123 |             retry_delay: Delay between retries in seconds (default: 2)
124 | 
125 |         Returns:
126 |             IP address of the VM when it becomes available
127 |         """
128 |         pass
129 | 
```
--------------------------------------------------------------------------------
/libs/lume/src/Virtualization/DHCPLeaseParser.swift:
--------------------------------------------------------------------------------
```swift
1 | import Foundation
2 | 
3 | /// Represents a DHCP lease entry from the system's DHCP lease file
4 | private struct DHCPLease {
5 |     let macAddress: String
6 |     let ipAddress: String
7 |     let expirationDate: Date
8 | 
9 |     /// Creates a lease entry from raw DHCP lease file key-value pairs
10 |     /// - Parameter dict: Dictionary containing the raw lease data
11 |     /// - Returns: A DHCPLease instance if the data is valid, nil otherwise
12 |     static func from(_ dict: [String: String]) -> DHCPLease?
{
13 |         guard let hwAddress = dict["hw_address"],
14 |               let ipAddress = dict["ip_address"],
15 |               let lease = dict["lease"] else {
16 |             return nil
17 |         }
18 | 
19 |         // Parse MAC address from hw_address field (format can be "1,xx:xx:xx:xx:xx:xx" or "ff,...")
20 |         let hwParts = hwAddress.split(separator: ",")
21 |         guard hwParts.count >= 2 else { return nil }
22 | 
23 |         // Get the MAC part after the prefix and normalize it
24 |         let rawMacAddress = String(hwParts[1]).trimmingCharacters(in: .whitespaces)
25 | 
26 |         // Normalize the MAC address by ensuring each component is two digits
27 |         let normalizedMacAddress = rawMacAddress.split(separator: ":")
28 |             .map { component in
29 |                 let hex = String(component)
30 |                 return hex.count == 1 ? "0\(hex)" : hex
31 |             }
32 |             .joined(separator: ":")
33 | 
34 |         // Convert hex timestamp to Date
35 |         let timestampHex = lease.trimmingCharacters(in: CharacterSet(charactersIn: "0x"))  // NOTE(review): this strips any leading/trailing '0' or 'x' characters, not just a "0x" prefix — a timestamp ending in 0 would lose digits; confirm the lease field format
36 |         guard let timestamp = UInt64(timestampHex, radix: 16) else { return nil }
37 |         let expirationDate = Date(timeIntervalSince1970: TimeInterval(timestamp))
38 | 
39 |         return DHCPLease(
40 |             macAddress: normalizedMacAddress,
41 |             ipAddress: ipAddress,
42 |             expirationDate: expirationDate
43 |         )
44 |     }
45 | 
46 |     /// Checks if the lease is currently valid
47 |     var isValid: Bool {
48 |         expirationDate > Date()
49 |     }
50 | }
51 | 
52 | /// Parses DHCP lease files to retrieve IP addresses for VMs based on their MAC addresses
53 | enum DHCPLeaseParser {
54 |     private static let leasePath = "/var/db/dhcpd_leases"
55 | 
56 |     /// Retrieves the IP address for a given MAC address from the DHCP lease file
57 |     /// - Parameter macAddress: The MAC address to look up
58 |     /// - Returns: The IP address if found, nil otherwise
59 |     static func getIPAddress(forMAC macAddress: String) -> String? {
60 |         guard let leaseContents = try? String(contentsOfFile: leasePath, encoding: .utf8) else {
61 |             return nil
62 |         }
63 | 
64 |         // Normalize the input MAC address to ensure consistent format
65 |         let normalizedMacAddress = macAddress.split(separator: ":").map { component in
66 |             let hex = String(component)
67 |             return hex.count == 1 ? "0\(hex)" : hex
68 |         }.joined(separator: ":")
69 | 
70 |         let leases = try? parseDHCPLeases(leaseContents)  // defensive try?; parseDHCPLeases currently never actually throws
71 |         return leases?.first { lease in
72 |             lease.macAddress == normalizedMacAddress
73 |         }?.ipAddress
74 |     }
75 | 
76 |     /// Parses the contents of a DHCP lease file into lease entries
77 |     /// - Parameter contents: The raw contents of the lease file
78 |     /// - Returns: Array of parsed lease entries
79 |     private static func parseDHCPLeases(_ contents: String) throws -> [DHCPLease] {
80 |         var leases: [DHCPLease] = []
81 |         var currentLease: [String: String] = [:]
82 |         var inLeaseBlock = false
83 | 
84 |         let lines = contents.components(separatedBy: .newlines)
85 | 
86 |         for line in lines {
87 |             let trimmedLine = line.trimmingCharacters(in: .whitespaces)
88 | 
89 |             if trimmedLine == "{" {
90 |                 inLeaseBlock = true
91 |                 currentLease = [:]
92 |             } else if trimmedLine == "}" {
93 |                 if let lease = DHCPLease.from(currentLease) {
94 |                     leases.append(lease)
95 |                 }
96 |                 inLeaseBlock = false
97 |             } else if inLeaseBlock {
98 |                 let parts = trimmedLine.split(separator: "=", maxSplits: 1)
99 |                 if parts.count == 2 {
100 |                     let key = String(parts[0]).trimmingCharacters(in: .whitespaces)
101 |                     let value = String(parts[1]).trimmingCharacters(in: .whitespaces)
102 |                     currentLease[key] = value
103 |                 }
104 |             }
105 |         }
106 | 
107 |         return leases
108 |     }
109 | }
```
--------------------------------------------------------------------------------
/examples/computer_examples.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import asyncio
3 | from pathlib import Path
4 | import sys
5 | import traceback
6 | 
7 | # Load environment variables from
.env file
8 | project_root = Path(__file__).parent.parent
9 | env_file = project_root / ".env"
10 | print(f"Loading environment from: {env_file}")
11 | from dotenv import load_dotenv
12 | 
13 | load_dotenv(env_file)
14 | 
15 | # Add paths to sys.path if needed
16 | pythonpath = os.environ.get("PYTHONPATH", "")
17 | for path in pythonpath.split(":"):
18 |     if path and path not in sys.path:
19 |         sys.path.insert(0, path)  # Insert at beginning to prioritize
20 |         print(f"Added to sys.path: {path}")
21 | 
22 | from computer.computer import Computer
23 | from computer.providers.base import VMProviderType
24 | from computer.logger import LogLevel
25 | 
26 | async def main():
27 |     try:
28 |         print("\n=== Using direct initialization ===")
29 | 
30 |         # Create a local macOS computer
31 |         computer = Computer(
32 |             display="1024x768",
33 |             memory="8GB",
34 |             cpu="4",
35 |             os_type="macos",
36 |             name="macos",
37 |             verbosity=LogLevel.VERBOSE,
38 |             provider_type=VMProviderType.LUME,
39 |             storage="/Users/<USER>/repos/trycua/computer/examples/storage",  # placeholder path: replace <USER> before running
40 |             shared_directories=[
41 |                 "/Users/<USER>/repos/trycua/computer/examples/shared"
42 |             ],
43 |             ephemeral=False,
44 |         )
45 | 
46 |         # Create a remote Linux computer with Cua
47 |         # computer = Computer(
48 |         #     os_type="linux",
49 |         #     api_key=os.getenv("CUA_API_KEY"),
50 |         #     name=os.getenv("CONTAINER_NAME"),
51 |         #     provider_type=VMProviderType.CLOUD,
52 |         # )
53 | 
54 |         try:
55 |             # Run the computer with default parameters
56 |             await computer.run()
57 | 
58 |             screenshot = await computer.interface.screenshot()
59 | 
60 |             # Create output directory if it doesn't exist
61 |             output_dir = Path("./output")
62 |             output_dir.mkdir(exist_ok=True)
63 | 
64 |             screenshot_path = output_dir / "screenshot.png"
65 |             with open(screenshot_path, "wb") as f:
66 |                 f.write(screenshot)
67 |             print(f"Screenshot saved to: {screenshot_path.absolute()}")
68 | 
69 |             # await computer.interface.hotkey("command", "space")
70 | 
71 |             # res = await computer.interface.run_command("touch ./Downloads/empty_file")
72 |             # print(f"Run command result: {res}")
73 | 
74 |             accessibility_tree = await computer.interface.get_accessibility_tree()
75 |             print(f"Accessibility tree: {accessibility_tree}")
76 | 
77 |             # Screen Actions Examples
78 |             # print("\n=== Screen Actions ===")
79 |             # screenshot = await computer.interface.screenshot()
80 |             # with open("screenshot_direct.png", "wb") as f:
81 |             #     f.write(screenshot)
82 | 
83 |             screen_size = await computer.interface.get_screen_size()
84 |             print(f"Screen size: {screen_size}")
85 | 
86 |             # Demonstrate coordinate conversion
87 |             center_x, center_y = 733, 736
88 |             print(f"Center in screen coordinates: ({center_x}, {center_y})")
89 | 
90 |             screenshot_center = await computer.to_screenshot_coordinates(center_x, center_y)
91 |             print(f"Center in screenshot coordinates: {screenshot_center}")
92 | 
93 |             screen_center = await computer.to_screen_coordinates(*screenshot_center)
94 |             print(f"Back to screen coordinates: {screen_center}")
95 | 
96 |             # Mouse Actions Examples
97 |             print("\n=== Mouse Actions ===")
98 |             await computer.interface.move_cursor(100, 100)
99 |             await computer.interface.left_click()
100 |             await computer.interface.right_click(300, 300)
101 |             await computer.interface.double_click(400, 400)
102 | 
103 |             # Keyboard Actions Examples
104 |             print("\n=== Keyboard Actions ===")
105 |             await computer.interface.type_text("Hello, World!")
106 |             await computer.interface.press_key("enter")
107 | 
108 |             # Clipboard Actions Examples
109 |             print("\n=== Clipboard Actions ===")
110 |             await computer.interface.set_clipboard("Test clipboard")
111 |             content = await computer.interface.copy_to_clipboard()
112 |             print(f"Clipboard content: {content}")
113 | 
114 |         finally:
115 |             # Important to clean up resources
116 |             await computer.stop()
117 |     except Exception as e:
118 |         print(f"Error in main: {e}")
119 |         traceback.print_exc()
120 | 
121 | 
122 | if __name__ == "__main__":
123 |     asyncio.run(main())
124 | 
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/opencua.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | OpenCUA agent loop implementation for click prediction using litellm.acompletion
3 | Based on OpenCUA model for GUI grounding tasks.
4 | """
5 | 
6 | import asyncio
7 | import json
8 | import re
9 | import base64
10 | from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
11 | from io import BytesIO
12 | import uuid
13 | from PIL import Image
14 | import litellm
15 | import math  # NOTE(review): several of the imports above are unused in this module
16 | 
17 | from .composed_grounded import ComposedGroundedConfig
18 | from ..decorators import register_agent
19 | from ..types import Messages, AgentResponse, Tools, AgentCapability
20 | from ..loops.base import AsyncAgentConfig
21 | 
22 | def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
23 |     """Extract coordinates from pyautogui.click(x=..., y=...) format."""
24 |     try:
25 |         # Look for pyautogui.click(x=1443, y=343) pattern
26 |         pattern = r"pyautogui\.click\(x=(\d+),\s*y=(\d+)\)"  # integer coordinates only; floats or extra spacing around "x=" will not match
27 |         match = re.search(pattern, text)
28 |         if match:
29 |             x, y = int(match.group(1)), int(match.group(2))
30 |             return (x, y)
31 |         return None
32 |     except Exception:
33 |         # Broad catch: also absorbs TypeError when text is None
34 |         return None
35 | 
36 | @register_agent(models=r"(?i).*OpenCUA.*")
37 | class OpenCUAConfig(ComposedGroundedConfig):
38 |     """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""
39 | 
40 |     def __init__(self):
41 |         super().__init__()
42 |         self.current_model = None
43 |         self.last_screenshot_b64 = None
44 | 
45 |     async def predict_step(
46 |         self,
47 |         messages: List[Dict[str, Any]],
48 |         model: str,
49 |         tools: Optional[List[Dict[str, Any]]] = None,
50 |         max_retries: Optional[int] = None,
51 |         stream: bool = False,
52 |         computer_handler=None,
53 |         _on_api_start=None,
54 |         _on_api_end=None,
55 |         _on_usage=None,
56 |         _on_screenshot=None,
57 |         **kwargs
58 |     ) -> Dict[str, Any]:
59 |         """Fallback to a self-composed model"""
60 |         return await super().predict_step(
61 |             messages=messages,
62 |             model=f"{model}+{model}",  # compose the model with itself per ComposedGroundedConfig's "modelA+modelB" convention — TODO confirm role order
63 |             tools=tools,
64 |             max_retries=max_retries,
65 |             stream=stream,
66 |             computer_handler=computer_handler,
67 |             _on_api_start=_on_api_start,
68 |             _on_api_end=_on_api_end,
69 |             _on_usage=_on_usage,
70 |             _on_screenshot=_on_screenshot,
71 |             **kwargs
72 |         )
73 | 
74 |     async def predict_click(
75 |         self,
76 |         model: str,
77 |         image_b64: str,
78 |         instruction: str,
79 |         **kwargs
80 |     ) -> Optional[Tuple[int, int]]:
81 |         """
82 |         Predict click coordinates using OpenCUA model via litellm.acompletion.
83 | 
84 |         Args:
85 |             model: The OpenCUA model name
86 |             image_b64: Base64 encoded image
87 |             instruction: Instruction for where to click
88 | 
89 |         Returns:
90 |             Tuple of (x, y) coordinates or None if prediction fails
91 |         """
92 |         # Prepare system message
93 |         system_prompt = (
94 |             "You are a GUI agent. You are given a task and a screenshot of the screen. "
95 |             "You need to perform a series of pyautogui actions to complete the task."
96 |         )
97 | 
98 |         system_message = {
99 |             "role": "system",
100 |             "content": system_prompt
101 |         }
102 | 
103 |         # Prepare user message with image and instruction
104 |         user_message = {
105 |             "role": "user",
106 |             "content": [
107 |                 {
108 |                     "type": "image_url",
109 |                     "image_url": {
110 |                         "url": f"data:image/png;base64,{image_b64}"
111 |                     }
112 |                 },
113 |                 {
114 |                     "type": "text",
115 |                     "text": f"Click on {instruction}"
116 |                 }
117 |             ]
118 |         }
119 | 
120 |         # Prepare API call kwargs
121 |         api_kwargs = {
122 |             "model": model,
123 |             "messages": [system_message, user_message],
124 |             "max_new_tokens": 2056,  # NOTE(review): HF-style name; OpenAI-compatible endpoints expect "max_tokens" — confirm the serving backend accepts this key
125 |             "temperature": 0,
126 |             **kwargs
127 |         }
128 | 
129 |         # Use liteLLM acompletion
130 |         response = await litellm.acompletion(**api_kwargs)
131 | 
132 |         # Extract response text
133 |         output_text = response.choices[0].message.content  # may be None; the extractor's broad except then yields None
134 |         # print(output_text)
135 | 
136 |         # Extract coordinates from pyautogui format
137 |         coordinates = extract_coordinates_from_pyautogui(output_text)
138 | 
139 |         return coordinates
140 | 
141 |     def get_capabilities(self) -> List[AgentCapability]:
142 |         """Return the capabilities supported by this agent."""
143 |         return ["click"]
144 | 
```