This is page 7 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── 
windows-sandbox.md ├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx 
│ │ │ │ ├── installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── 
som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ │ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift 
│ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py 
│ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ ├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ 
│ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ 
├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py │ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── 
pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /docs/content/docs/telemetry.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Telemetry 3 | description: This document explains how telemetry works in CUA libraries and how you can control it. 4 | icon: RadioTower 5 | --- 6 | 7 | # Telemetry in CUA 8 | 9 | CUA tracks anonymized usage and error report statistics; we ascribe to Posthog's approach as detailed [here](https://posthog.com/blog/open-source-telemetry-ethical). 
If you would like to opt out of sending anonymized info, you can set `telemetry_enabled` to false. 10 | 11 | ## What telemetry data we collect 12 | 13 | CUA libraries collect usage data to help improve our software. We have two categories of telemetry: 14 | 15 | ### Opt-Out Telemetry (Enabled by Default) 16 | 17 | Basic performance metrics and system information that help us understand usage patterns: 18 | 19 | - **System Information**: Operating system, OS version, Python version 20 | - **Module Initialization**: When modules are imported and their versions 21 | - **Performance Metrics**: Agent run durations, step counts, token usage, and API costs 22 | - **Session Tracking**: Anonymous session IDs and run IDs for performance analysis 23 | 24 | ### Opt-In Telemetry (Disabled by Default) 25 | 26 | **Conversation Trajectory Logging**: Full conversation history including: 27 | - User messages and agent responses 28 | - Computer actions and their outputs 29 | - Reasoning traces from the agent 30 | 31 | **Important**: Trajectory logging is **opt-in only** and must be explicitly enabled. 32 | 33 | ### We do NOT collect: 34 | 35 | - Personal information or user identifiers 36 | - API keys or credentials 37 | - File contents or application data 38 | - Information about files being accessed 39 | - Actual screenshots or screen contents (unless trajectory logging is enabled) 40 | - Specific text being typed, including user inputs, model outputs, computer outputs, or tool call outputs (unless trajectory logging is enabled) 41 | 42 | ## Controlling Telemetry 43 | 44 | We are committed to transparency and user control over telemetry. There are two ways to control telemetry: 45 | 46 | ### 1. Environment Variable (Global Control) 47 | 48 | Telemetry is enabled by default. 
To disable telemetry, set the `CUA_TELEMETRY_ENABLED` environment variable to a falsy value (`0`, `false`, `no`, or `off`): 49 | 50 | ```bash 51 | # Disable telemetry before running your script 52 | export CUA_TELEMETRY_ENABLED=false 53 | 54 | # Or as part of the command 55 | CUA_TELEMETRY_ENABLED=false python your_script.py 56 | 57 | ``` 58 | 59 | Or from Python: 60 | 61 | ```python 62 | import os 63 | os.environ["CUA_TELEMETRY_ENABLED"] = "false" 64 | ``` 65 | 66 | ### 2. Instance-Level Control 67 | 68 | #### Computer SDK 69 | 70 | ```python 71 | from computer import Computer 72 | 73 | # Enable telemetry (default) 74 | computer = Computer(telemetry_enabled=True) 75 | 76 | # Disable telemetry 77 | computer = Computer(telemetry_enabled=False) 78 | ``` 79 | 80 | #### Agent SDK 81 | 82 | ```python 83 | from agent import ComputerAgent 84 | import os 85 | 86 | # Basic telemetry - performance metrics only (opt-out, enabled by default) 87 | agent = ComputerAgent( 88 | model="claude-3-5-sonnet-20241022", 89 | telemetry_enabled=True # Default is True 90 | ) 91 | 92 | # Enable telemetry with full conversation trajectory logging (opt-in) 93 | agent = ComputerAgent( 94 | model="claude-3-5-sonnet-20241022", 95 | telemetry_enabled={ 96 | "log_trajectory": True # Logs full conversation items 97 | } 98 | ) 99 | 100 | # Disable telemetry completely 101 | agent = ComputerAgent( 102 | model="claude-3-5-sonnet-20241022", 103 | telemetry_enabled=False 104 | ) 105 | 106 | # Disable telemetry completely using environment variables 107 | os.environ["CUA_TELEMETRY_ENABLED"] = "false" 108 | agent = ComputerAgent( 109 | model="claude-3-5-sonnet-20241022" 110 | ) 111 | ``` 112 | 113 | You can check if telemetry is enabled for an instance: 114 | 115 | ```python 116 | print(computer.telemetry_enabled) # Will print True or False 117 | print(agent.telemetry_enabled) # Will print True, False, or dict 118 | ``` 119 | 120 | Note that telemetry settings must be configured during initialization and 
cannot be changed after the object is created. 121 | 122 | ## Detailed Telemetry Events 123 | 124 | ### Computer SDK Events 125 | 126 | | Event Name | Data Collected | Trigger Notes | 127 | |------------|----------------|---------------| 128 | | **computer_initialized** | • `os`: Operating system (e.g., 'windows', 'darwin', 'linux')<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when a Computer instance is created | 129 | | **module_init** | • `module`: "computer"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the computer package is imported for the first time | 130 | 131 | ### Agent SDK Events 132 | 133 | | Event Name | Data Collected | Trigger Notes | 134 | |------------|----------------|---------------| 135 | | **module_init** | • `module`: "agent"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time | 136 | | **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-3-5-sonnet")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) | 137 | | **agent_run_start** | • `session_id`: Agent session UUID<br />• `run_id`: Unique UUID for this run<br />• `start_time`: Unix timestamp<br />• `input_context_size`: Character count of input messages<br />• `num_existing_messages`: Count of existing messages<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call | 138 | | **agent_run_end** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `end_time`: Unix timestamp<br />• `duration_seconds`: Total run duration<br />• `num_steps`: Total steps taken in this run<br />• 
`total_usage`: Accumulated token usage and costs<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call | 139 | | **agent_step** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Step number (incremental)<br />• `timestamp`: Unix timestamp<br />• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run | 140 | | **agent_usage** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Current step number<br />• `prompt_tokens`: Tokens in prompt<br />• `completion_tokens`: Tokens in response<br />• `total_tokens`: Total tokens used<br />• `response_cost`: Cost of this API call | Triggered whenever usage information is received from LLM API | 141 | 142 | ## Transparency 143 | 144 | We believe in being transparent about the data we collect. If you have any questions about our telemetry practices, please open an issue on our GitHub repository. 145 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/callbacks/operator_validator.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | OperatorValidatorCallback 3 | 4 | Ensures agent output actions conform to expected schemas by fixing common issues: 5 | - click: add default button='left' if missing 6 | - keypress: wrap keys string into a list 7 | - etc. 8 | 9 | This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts). 10 | The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible. 
11 | """ 12 | from __future__ import annotations 13 | 14 | from typing import Any, Dict, List 15 | 16 | from .base import AsyncCallbackHandler 17 | 18 | 19 | class OperatorNormalizerCallback(AsyncCallbackHandler): 20 | """Normalizes common computer call hallucinations / errors in computer call syntax.""" 21 | 22 | async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 23 | # Mutate in-place as requested, but still return the list for chaining 24 | for item in output or []: 25 | if item.get("type") != "computer_call": 26 | continue 27 | action = item.get("action") 28 | if not isinstance(action, dict): 29 | continue 30 | 31 | # rename mouse click actions to "click" 32 | for mouse_btn in ["left", "right", "wheel", "back", "forward"]: 33 | if action.get("type", "") == f"{mouse_btn}_click": 34 | action["type"] = "click" 35 | action["button"] = mouse_btn 36 | # rename hotkey actions to "keypress" 37 | for alias in ["hotkey", "key", "press", "key_press"]: 38 | if action.get("type", "") == alias: 39 | action["type"] = "keypress" 40 | # assume click actions 41 | if "button" in action and "type" not in action: 42 | action["type"] = "click" 43 | if "click" in action and "type" not in action: 44 | action["type"] = "click" 45 | if ("scroll_x" in action or "scroll_y" in action) and "type" not in action: 46 | action["type"] = "scroll" 47 | if "text" in action and "type" not in action: 48 | action["type"] = "type" 49 | 50 | action_type = action.get("type") 51 | def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]): 52 | """Keep only the provided keys on action; delete everything else. 53 | Always ensures required 'type' is present if listed in keys_to_keep. 
54 | """ 55 | for key in list(action.keys()): 56 | if key not in keys_to_keep: 57 | del action[key] 58 | # rename "coordinate" to "x", "y" 59 | if "coordinate" in action: 60 | action["x"] = action["coordinate"][0] 61 | action["y"] = action["coordinate"][1] 62 | del action["coordinate"] 63 | if action_type == "click": 64 | # convert "click" to "button" 65 | if "button" not in action and "click" in action: 66 | action["button"] = action["click"] 67 | del action["click"] 68 | # default button to "left" 69 | action["button"] = action.get("button", "left") 70 | # add default scroll x, y if missing 71 | if action_type == "scroll": 72 | action["scroll_x"] = action.get("scroll_x", 0) 73 | action["scroll_y"] = action.get("scroll_y", 0) 74 | # ensure keys arg is a list (normalize aliases first) 75 | if action_type == "keypress": 76 | keys = action.get("keys") 77 | for keys_alias in ["keypress", "key", "press", "key_press", "text"]: 78 | if keys_alias in action: 79 | action["keys"] = action[keys_alias] 80 | del action[keys_alias] 81 | keys = action.get("keys") 82 | if isinstance(keys, str): 83 | action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys] 84 | required_keys_by_type = { 85 | # OpenAI actions 86 | "click": ["type", "button", "x", "y"], 87 | "double_click": ["type", "x", "y"], 88 | "drag": ["type", "path"], 89 | "keypress": ["type", "keys"], 90 | "move": ["type", "x", "y"], 91 | "screenshot": ["type"], 92 | "scroll": ["type", "scroll_x", "scroll_y", "x", "y"], 93 | "type": ["type", "text"], 94 | "wait": ["type"], 95 | # Anthropic actions 96 | "left_mouse_down": ["type", "x", "y"], 97 | "left_mouse_up": ["type", "x", "y"], 98 | "triple_click": ["type", "button", "x", "y"], 99 | } 100 | keep = required_keys_by_type.get(action_type or "") 101 | if keep: 102 | _keep_keys(action, keep) 103 | 104 | 105 | # # Second pass: if an assistant message is immediately followed by a computer_call, 106 | # # replace the assistant message itself with a 
name: Reusable Docker Publish Workflow

on:
  workflow_call:
    inputs:
      image_name:
        description: "Name of the Docker image (e.g. cua-ubuntu, cua-xfce)"
        required: true
        type: string
      context_dir:
        description: "Directory containing the Dockerfile relative to workspace root (e.g. libs/kasm, libs/xfce)"
        required: true
        type: string
      dockerfile_path:
        description: "Path to Dockerfile relative to context_dir (e.g. Dockerfile)"
        required: false
        type: string
        default: "Dockerfile"
      tag_prefix:
        description: "Prefix for semantic version tags (e.g. docker-kasm-v, docker-xfce-v)"
        required: true
        type: string
      docker_hub_org:
        description: "Docker Hub organization name"
        required: false
        type: string
        default: "trycua"
    secrets:
      DOCKER_HUB_TOKEN:
        required: true

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        platform:
          - linux/amd64
          - linux/arm64
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Prepare platform tag
        id: platform
        run: |
          # Convert platform (e.g., linux/amd64) to a valid tag suffix (e.g., linux-amd64)
          PLATFORM_TAG=$(echo "${{ matrix.platform }}" | sed 's/\//-/g')
          echo "tag=${PLATFORM_TAG}" >> $GITHUB_OUTPUT

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ inputs.docker_hub_org }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}

      # NOTE: docker/metadata-action only computes tags/labels; it does NOT
      # expose a `digest` output. The image digest is produced by the
      # docker/build-push-action steps below, which therefore carry ids so the
      # reporting steps at the bottom can read `steps.build-*.outputs.digest`.
      - name: Extract metadata (PR)
        if: github.event_name == 'pull_request'
        id: meta-pr
        uses: docker/metadata-action@v5
        with:
          images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
          tags: |
            type=raw,value=${{ github.sha }}

      - name: Extract metadata (main branch)
        if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
        id: meta-main
        uses: docker/metadata-action@v5
        with:
          images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
          tags: |
            type=raw,value=latest

      - name: Extract metadata (semantic version tag)
        if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
        id: meta-semver
        uses: docker/metadata-action@v5
        with:
          images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
          tags: |
            type=semver,pattern={{version}},prefix=${{ inputs.tag_prefix }}
            type=semver,pattern={{major}}.{{minor}},prefix=${{ inputs.tag_prefix }}
            type=semver,pattern={{major}},prefix=${{ inputs.tag_prefix }}
            type=raw,value=latest

      - name: Build and push Docker image (PR)
        if: github.event_name == 'pull_request'
        id: build-pr
        uses: docker/build-push-action@v5
        with:
          context: ./${{ inputs.context_dir }}
          file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
          push: true
          tags: ${{ steps.meta-pr.outputs.tags }}
          labels: ${{ steps.meta-pr.outputs.labels }}
          platforms: ${{ matrix.platform }}
          cache-from: |
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:latest
          cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max

      - name: Build and push Docker image (main branch)
        if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
        id: build-main
        uses: docker/build-push-action@v5
        with:
          context: ./${{ inputs.context_dir }}
          file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
          push: true
          tags: ${{ steps.meta-main.outputs.tags }}
          labels: ${{ steps.meta-main.outputs.labels }}
          platforms: ${{ matrix.platform }}
          cache-from: |
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:latest
          cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max

      - name: Build and push Docker image (semantic version tag)
        if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
        id: build-semver
        uses: docker/build-push-action@v5
        with:
          context: ./${{ inputs.context_dir }}
          file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
          push: true
          tags: ${{ steps.meta-semver.outputs.tags }}
          labels: ${{ steps.meta-semver.outputs.labels }}
          platforms: ${{ matrix.platform }}
          cache-from: |
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:latest
          cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max

      - name: Image digest
        if: github.event_name == 'pull_request' || github.ref == 'refs/heads/main' || startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
        run: |
          # FIX: read the digest from the build-push steps; the metadata steps
          # have no `digest` output, so the previous references printed empty.
          if [ "${{ github.event_name }}" == "pull_request" ]; then
            echo "Image pushed with digest ${{ steps.build-pr.outputs.digest }}"
          elif [[ "${{ github.ref }}" == refs/tags/${{ inputs.tag_prefix }}* ]]; then
            echo "Image pushed with digest ${{ steps.build-semver.outputs.digest }}"
          else
            echo "Image pushed with digest ${{ steps.build-main.outputs.digest }}"
          fi

      - name: print image tags
        # Same gate as the digest step: skip instead of printing empty output
        # on refs where no metadata/build step ran.
        if: github.event_name == 'pull_request' || github.ref == 'refs/heads/main' || startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
        run: |
          if [ "${{ github.event_name }}" == "pull_request" ]; then
            echo "Image tags: ${{ steps.meta-pr.outputs.tags }}"
          elif [[ "${{ github.ref }}" == refs/tags/${{ inputs.tag_prefix }}* ]]; then
            echo "Image tags: ${{ steps.meta-semver.outputs.tags }}"
          else
            echo "Image tags: ${{ steps.meta-main.outputs.tags }}"
          fi
implementation for click prediction using litellm.acompletion. 3 | 4 | Implements the ScreenSpot InternVL grounding baseline behavior: 5 | - Uses the exact grounding prompt format with <image> and <ref> tags 6 | - Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]] 7 | - Converts to pixel coordinates relative to the original screenshot size 8 | 9 | Note: We do NOT manually load the InternVL model; acompletions (via HuggingFaceLocalAdapter) 10 | will handle loading based on the provided model name. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import base64 16 | import math 17 | import re 18 | from io import BytesIO 19 | from typing import Any, Dict, List, Optional, Tuple 20 | 21 | from PIL import Image 22 | import litellm 23 | 24 | from ..decorators import register_agent 25 | from .composed_grounded import ComposedGroundedConfig 26 | from ..types import AgentCapability 27 | 28 | 29 | # Regex patterns for extracting coordinates 30 | # Accept optional whitespace and optional decimal fractions 31 | _NUM = r"(\d+(?:\.\d+)?)" 32 | _POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]") 33 | _BBOX_PATTERN = re.compile( 34 | r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]" 35 | ) 36 | 37 | 38 | def _extract_first_point(text: str) -> Optional[Tuple[float, float]]: 39 | """Extract the first [[x,y]] as normalized (0-1000) floats.""" 40 | m = _POINT_PATTERN.search(text) 41 | if not m: 42 | return None 43 | try: 44 | x = float(m.group(1)) 45 | y = float(m.group(2)) 46 | return x, y 47 | except Exception: 48 | return None 49 | 50 | 51 | def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]: 52 | """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats.""" 53 | matches = list(_BBOX_PATTERN.finditer(text)) 54 | if not matches: 55 | return None 56 | m = matches[-1] 57 | try: 58 | x1 = float(m.group(1)) 59 | y1 = float(m.group(2)) 
60 | x2 = float(m.group(3)) 61 | y2 = float(m.group(4)) 62 | return x1, y1, x2, y2 63 | except Exception: 64 | return None 65 | 66 | 67 | def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]: 68 | """Scale 0-1000 normalized coordinates to pixel coordinates for given image size.""" 69 | x_px = int(math.floor((x_norm / 1000.0) * width)) 70 | y_px = int(math.floor((y_norm / 1000.0) * height)) 71 | # Clamp to image bounds just in case 72 | x_px = max(0, min(width - 1, x_px)) 73 | y_px = max(0, min(height - 1, y_px)) 74 | return x_px, y_px 75 | 76 | 77 | @register_agent(models=r"(?i).*InternVL.*") 78 | class InternVLConfig(ComposedGroundedConfig): 79 | """InternVL agent configuration reusing ComposedGroundedConfig for steps and 80 | overriding predict_click to implement ScreenSpot InternVL grounding baseline.""" 81 | 82 | async def predict_step( 83 | self, 84 | messages: List[Dict[str, Any]], 85 | model: str, 86 | tools: Optional[List[Dict[str, Any]]] = None, 87 | max_retries: Optional[int] = None, 88 | stream: bool = False, 89 | computer_handler=None, 90 | _on_api_start=None, 91 | _on_api_end=None, 92 | _on_usage=None, 93 | _on_screenshot=None, 94 | **kwargs 95 | ) -> Dict[str, Any]: 96 | """Fallback to a self-composed model""" 97 | return await super().predict_step( 98 | messages=messages, 99 | model=f"{model}+{model}", 100 | tools=tools, 101 | max_retries=max_retries, 102 | stream=stream, 103 | computer_handler=computer_handler, 104 | _on_api_start=_on_api_start, 105 | _on_api_end=_on_api_end, 106 | _on_usage=_on_usage, 107 | _on_screenshot=_on_screenshot, 108 | **kwargs 109 | ) 110 | 111 | async def predict_click( 112 | self, 113 | model: str, 114 | image_b64: str, 115 | instruction: str, 116 | **kwargs 117 | ) -> Optional[Tuple[int, int]]: 118 | """ 119 | Predict click coordinates using InternVL via litellm.acompletion. 
120 | 121 | Behavior mirrors the ScreenSpot InternVL baseline: 122 | - Prompt: "<image>\nPlease provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. Answer in the format of [[x1, y1, x2, y2]]" 123 | - Parse either [[x,y]] point or [[x1,y1,x2,y2]] bbox, using bbox center if point missing 124 | - Coordinates are 0-1000 normalized; convert to pixel coordinates for the original screenshot 125 | """ 126 | try: 127 | # Decode image dimensions to scale the normalized outputs 128 | img_bytes = base64.b64decode(image_b64) 129 | image = Image.open(BytesIO(img_bytes)) 130 | width, height = image.size 131 | except Exception: 132 | # If decoding fails, proceed with a safe default size to avoid crash 133 | width, height = 1920, 1080 134 | 135 | # Build grounding prompt exactly like the baseline 136 | grounding_prompt = ( 137 | f"Please provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. 
" 138 | f"Answer in the format of [[x1, y1, x2, y2]]" 139 | ) 140 | 141 | # Prepare messages for LiteLLM 142 | messages = [ 143 | { 144 | "role": "user", 145 | "content": [ 146 | { 147 | "type": "image_url", 148 | "image_url": {"url": f"data:image/png;base64,{image_b64}"}, 149 | }, 150 | {"type": "text", "text": grounding_prompt}, 151 | ], 152 | } 153 | ] 154 | 155 | # Call acompletion; HuggingFaceLocalAdapter/model handler will handle InternVL loading 156 | api_kwargs = { 157 | "model": model, 158 | "messages": messages, 159 | # Conservative generation params akin to baseline (deterministic) 160 | "max_tokens": kwargs.get("max_tokens", 256), 161 | "temperature": kwargs.get("temperature", 0.0), 162 | } 163 | 164 | response = await litellm.acompletion(**api_kwargs) 165 | output_text = (response.choices[0].message.content or "").strip() # type: ignore 166 | 167 | # print(f"InternVL output: {output_text}") 168 | 169 | # Try to parse a point first; if absent, parse bbox and take center 170 | point = _extract_first_point(output_text) 171 | if point is None: 172 | bbox = _extract_last_bbox(output_text) 173 | if bbox is None: 174 | return None 175 | x1, y1, x2, y2 = bbox 176 | cx = (x1 + x2) / 2.0 177 | cy = (y1 + y2) / 2.0 178 | point = (cx, cy) 179 | 180 | x_norm, y_norm = point 181 | x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height) 182 | return (x_px, y_px) 183 | 184 | def get_capabilities(self) -> List[AgentCapability]: 185 | return ["click", "step"] 186 | ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/factory.py: -------------------------------------------------------------------------------- ```python 1 | """Factory for creating VM providers.""" 2 | 3 | import logging 4 | from typing import Dict, Optional, Any, Type, Union 5 | 6 | from .base import BaseVMProvider, VMProviderType 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class VMProviderFactory: 12 | 
"""Factory for creating VM providers based on provider type.""" 13 | 14 | @staticmethod 15 | def create_provider( 16 | provider_type: Union[str, VMProviderType], 17 | port: int = 7777, 18 | host: str = "localhost", 19 | bin_path: Optional[str] = None, 20 | storage: Optional[str] = None, 21 | shared_path: Optional[str] = None, 22 | image: Optional[str] = None, 23 | verbose: bool = False, 24 | ephemeral: bool = False, 25 | noVNC_port: Optional[int] = None, 26 | **kwargs, 27 | ) -> BaseVMProvider: 28 | """Create a VM provider of the specified type. 29 | 30 | Args: 31 | provider_type: Type of VM provider to create 32 | port: Port for the API server 33 | host: Hostname for the API server 34 | bin_path: Path to provider binary if needed 35 | storage: Path for persistent VM storage 36 | shared_path: Path for shared folder between host and VM 37 | image: VM image to use (for Lumier provider) 38 | verbose: Enable verbose logging 39 | ephemeral: Use ephemeral (temporary) storage 40 | noVNC_port: Specific port for noVNC interface (for Lumier provider) 41 | 42 | Returns: 43 | An instance of the requested VM provider 44 | 45 | Raises: 46 | ImportError: If the required dependencies for the provider are not installed 47 | ValueError: If the provider type is not supported 48 | """ 49 | # Convert string to enum if needed 50 | if isinstance(provider_type, str): 51 | try: 52 | provider_type = VMProviderType(provider_type.lower()) 53 | except ValueError: 54 | provider_type = VMProviderType.UNKNOWN 55 | 56 | if provider_type == VMProviderType.LUME: 57 | try: 58 | from .lume import LumeProvider, HAS_LUME 59 | if not HAS_LUME: 60 | raise ImportError( 61 | "The pylume package is required for LumeProvider. 
" 62 | "Please install it with 'pip install cua-computer[lume]'" 63 | ) 64 | return LumeProvider( 65 | port=port, 66 | host=host, 67 | storage=storage, 68 | verbose=verbose, 69 | ephemeral=ephemeral 70 | ) 71 | except ImportError as e: 72 | logger.error(f"Failed to import LumeProvider: {e}") 73 | raise ImportError( 74 | "The pylume package is required for LumeProvider. " 75 | "Please install it with 'pip install cua-computer[lume]'" 76 | ) from e 77 | elif provider_type == VMProviderType.LUMIER: 78 | try: 79 | from .lumier import LumierProvider, HAS_LUMIER 80 | if not HAS_LUMIER: 81 | raise ImportError( 82 | "Docker is required for LumierProvider. " 83 | "Please install Docker for Apple Silicon and Lume CLI before using this provider." 84 | ) 85 | return LumierProvider( 86 | port=port, 87 | host=host, 88 | storage=storage, 89 | shared_path=shared_path, 90 | image=image or "macos-sequoia-cua:latest", 91 | verbose=verbose, 92 | ephemeral=ephemeral, 93 | noVNC_port=noVNC_port 94 | ) 95 | except ImportError as e: 96 | logger.error(f"Failed to import LumierProvider: {e}") 97 | raise ImportError( 98 | "Docker and Lume CLI are required for LumierProvider. " 99 | "Please install Docker for Apple Silicon and run the Lume installer script." 100 | ) from e 101 | 102 | elif provider_type == VMProviderType.CLOUD: 103 | try: 104 | from .cloud import CloudProvider 105 | return CloudProvider( 106 | verbose=verbose, 107 | **kwargs, 108 | ) 109 | except ImportError as e: 110 | logger.error(f"Failed to import CloudProvider: {e}") 111 | raise ImportError( 112 | "The CloudProvider is not fully implemented yet. " 113 | "Please use LUME or LUMIER provider instead." 114 | ) from e 115 | elif provider_type == VMProviderType.WINSANDBOX: 116 | try: 117 | from .winsandbox import WinSandboxProvider, HAS_WINSANDBOX 118 | if not HAS_WINSANDBOX: 119 | raise ImportError( 120 | "pywinsandbox is required for WinSandboxProvider. 
" 121 | "Please install it with 'pip install -U git+https://github.com/karkason/pywinsandbox.git'" 122 | ) 123 | return WinSandboxProvider( 124 | port=port, 125 | host=host, 126 | storage=storage, 127 | verbose=verbose, 128 | ephemeral=ephemeral, 129 | **kwargs 130 | ) 131 | except ImportError as e: 132 | logger.error(f"Failed to import WinSandboxProvider: {e}") 133 | raise ImportError( 134 | "pywinsandbox is required for WinSandboxProvider. " 135 | "Please install it with 'pip install -U git+https://github.com/karkason/pywinsandbox.git'" 136 | ) from e 137 | elif provider_type == VMProviderType.DOCKER: 138 | try: 139 | from .docker import DockerProvider, HAS_DOCKER 140 | if not HAS_DOCKER: 141 | raise ImportError( 142 | "Docker is required for DockerProvider. " 143 | "Please install Docker and ensure it is running." 144 | ) 145 | return DockerProvider( 146 | port=port, 147 | host=host, 148 | storage=storage, 149 | shared_path=shared_path, 150 | image=image or "trycua/cua-ubuntu:latest", 151 | verbose=verbose, 152 | ephemeral=ephemeral, 153 | vnc_port=noVNC_port 154 | ) 155 | except ImportError as e: 156 | logger.error(f"Failed to import DockerProvider: {e}") 157 | raise ImportError( 158 | "Docker is required for DockerProvider. " 159 | "Please install Docker and ensure it is running." 160 | ) from e 161 | else: 162 | raise ValueError(f"Unsupported provider type: {provider_type}") 163 | ``` -------------------------------------------------------------------------------- /libs/python/agent/benchmarks/interactive.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 2 | """ 3 | Interactive Click Prediction Tool 4 | 5 | Takes screenshots and allows testing multiple models interactively. 6 | Models are loaded/unloaded one at a time to avoid memory issues. 
7 | """ 8 | 9 | import asyncio 10 | import os 11 | from datetime import datetime 12 | from typing import List, Dict, Any 13 | 14 | from utils import ( 15 | ModelWrapper, 16 | take_screenshot, 17 | save_prediction_visualization, 18 | get_available_models 19 | ) 20 | 21 | 22 | async def predict_with_all_models(image, instruction: str, models) -> List[Dict[str, Any]]: 23 | """ 24 | Predict click coordinates with all models sequentially. 25 | 26 | Args: 27 | image: PIL Image to analyze 28 | instruction: Instruction text 29 | models: List of model instances 30 | 31 | Returns: 32 | List of prediction results 33 | """ 34 | predictions = [] 35 | 36 | for model in models: 37 | model_wrapper = ModelWrapper(model) 38 | print(f"\n🔄 Loading {model_wrapper.model_name}...") 39 | 40 | try: 41 | # Load model 42 | await model_wrapper.load_model() 43 | 44 | # Predict 45 | coords = await model_wrapper.predict_click(image, instruction) 46 | 47 | predictions.append({ 48 | 'model_name': model_wrapper.model_name, 49 | 'coords': coords, 50 | 'error': None 51 | }) 52 | 53 | if coords: 54 | print(f"✅ {model_wrapper.model_name}: ({coords[0]}, {coords[1]})") 55 | else: 56 | print(f"❌ {model_wrapper.model_name}: No prediction") 57 | 58 | except Exception as e: 59 | print(f"❌ {model_wrapper.model_name}: ERROR - {str(e)}") 60 | predictions.append({ 61 | 'model_name': model_wrapper.model_name, 62 | 'coords': None, 63 | 'error': str(e) 64 | }) 65 | 66 | finally: 67 | # Always unload model to free memory 68 | try: 69 | await model_wrapper.unload_model() 70 | print(f"🗑️ Unloaded {model_wrapper.model_name}") 71 | except Exception as e: 72 | print(f"⚠️ Error unloading {model_wrapper.model_name}: {e}") 73 | 74 | return predictions 75 | 76 | 77 | def print_header(): 78 | """Print the interactive tool header.""" 79 | print("=" * 60) 80 | print("🖱️ Interactive Click Prediction Tool") 81 | print("=" * 60) 82 | print("Commands:") 83 | print(" • Type an instruction to test models on last screenshot") 84 | 
print(" • 'screenshot' - Take a new screenshot") 85 | print(" • 'models' - List available models") 86 | print(" • 'quit' or 'exit' - Exit the tool") 87 | print("=" * 60) 88 | print("💡 Tip: Take a screenshot first, then send instructions to test models!") 89 | 90 | 91 | def print_models(models): 92 | """Print available models.""" 93 | print("\n📋 Available Models:") 94 | for i, model in enumerate(models, 1): 95 | if isinstance(model, str): 96 | print(f" {i}. {model}") 97 | else: 98 | print(f" {i}. models.{model.__class__.__name__}") 99 | 100 | 101 | async def main(): 102 | """ 103 | Main interactive loop. 104 | """ 105 | print_header() 106 | 107 | # Get available models 108 | models = get_available_models() 109 | print_models(models) 110 | 111 | # Create output directory for visualizations 112 | output_dir = "interactive_output" 113 | os.makedirs(output_dir, exist_ok=True) 114 | 115 | session_count = 0 116 | last_screenshot = None 117 | screenshot_timestamp = None 118 | 119 | while True: 120 | try: 121 | # Get user input 122 | print(f"\n{'='*40}") 123 | user_input = input("🎯 Enter instruction (or command): ").strip() 124 | 125 | if not user_input: 126 | continue 127 | 128 | # Handle commands 129 | if user_input.lower() in ['quit', 'exit', 'q']: 130 | print("👋 Goodbye!") 131 | break 132 | 133 | elif user_input.lower() == 'models': 134 | print_models(models) 135 | continue 136 | 137 | elif user_input.lower() == 'screenshot': 138 | print("📸 Taking screenshot...") 139 | try: 140 | last_screenshot = take_screenshot() 141 | screenshot_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 142 | screenshot_path = os.path.join(output_dir, f"screenshot_{screenshot_timestamp}.png") 143 | last_screenshot.save(screenshot_path) 144 | print(f"✅ Screenshot captured and saved to: {screenshot_path}") 145 | print(f"📝 Ready for instructions! 
Screenshot size: {last_screenshot.size}") 146 | except Exception as e: 147 | print(f"❌ Error taking screenshot: {e}") 148 | continue 149 | 150 | # Handle instruction input 151 | if last_screenshot is None: 152 | print("⚠️ No screenshot available! Please take a screenshot first using 'screenshot' command.") 153 | continue 154 | 155 | session_count += 1 156 | print(f"\n🎯 Session {session_count}: '{user_input}'") 157 | print(f"📷 Using screenshot from: {screenshot_timestamp}") 158 | 159 | # Predict with all models using last screenshot 160 | print(f"\n🤖 Testing {len(models)} models on screenshot...") 161 | predictions = await predict_with_all_models(last_screenshot, user_input, models) 162 | 163 | # Display results summary 164 | print(f"\n📊 Results Summary:") 165 | print("-" * 50) 166 | for pred in predictions: 167 | if pred['coords']: 168 | print(f"✅ {pred['model_name']}: ({pred['coords'][0]}, {pred['coords'][1]})") 169 | elif pred['error']: 170 | print(f"❌ {pred['model_name']}: ERROR - {pred['error']}") 171 | else: 172 | print(f"❌ {pred['model_name']}: No prediction") 173 | 174 | # Save visualization 175 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 176 | vis_filename = f"session_{session_count:03d}_{timestamp}.png" 177 | vis_path = os.path.join(output_dir, vis_filename) 178 | 179 | try: 180 | save_prediction_visualization(last_screenshot, user_input, predictions, vis_path) 181 | print(f"\n💾 Visualization saved to: {vis_path}") 182 | except Exception as e: 183 | print(f"⚠️ Error saving visualization: {e}") 184 | 185 | print(f"\n✨ Session {session_count} completed!") 186 | 187 | except KeyboardInterrupt: 188 | print("\n\n👋 Interrupted by user. 
Goodbye!") 189 | break 190 | except Exception as e: 191 | print(f"\n❌ Unexpected error: {e}") 192 | print("Continuing...") 193 | 194 | 195 | if __name__ == "__main__": 196 | try: 197 | asyncio.run(main()) 198 | except KeyboardInterrupt: 199 | print("\n👋 Goodbye!") 200 | except Exception as e: 201 | print(f"❌ Fatal error: {e}") 202 | ``` -------------------------------------------------------------------------------- /tests/test_venv.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Virtual Environment Testing Module 3 | This module tests the ability to execute python code in a virtual environment within Cua Containers. 4 | 5 | Required environment variables: 6 | - CUA_API_KEY: API key for Cua cloud provider 7 | - CUA_CONTAINER_NAME: Name of the container to use 8 | """ 9 | 10 | import os 11 | import asyncio 12 | import pytest 13 | from pathlib import Path 14 | import sys 15 | import traceback 16 | 17 | # Load environment variables from .env file 18 | project_root = Path(__file__).parent.parent 19 | env_file = project_root / ".env" 20 | print(f"Loading environment from: {env_file}") 21 | from dotenv import load_dotenv 22 | 23 | load_dotenv(env_file) 24 | 25 | # Add paths to sys.path if needed 26 | pythonpath = os.environ.get("PYTHONPATH", "") 27 | for path in pythonpath.split(":"): 28 | if path and path not in sys.path: 29 | sys.path.insert(0, path) # Insert at beginning to prioritize 30 | print(f"Added to sys.path: {path}") 31 | 32 | from computer import Computer, VMProviderType 33 | from computer.helpers import sandboxed, set_default_computer 34 | 35 | 36 | @pytest.fixture(scope="session") 37 | async def computer(): 38 | """Shared Computer instance for all test cases.""" 39 | # Create a remote Linux computer with Cua 40 | computer = Computer( 41 | os_type="linux", 42 | api_key=os.getenv("CUA_API_KEY"), 43 | name=str(os.getenv("CUA_CONTAINER_NAME")), 44 | provider_type=VMProviderType.CLOUD, 45 | ) 46 | 47 | # 
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_install(computer):
    """Test virtual environment creation and package installation."""
    # Install `requests` into the shared test venv (created on demand).
    stdout, _ = await computer.venv_install("test_env", ["requests"])

    # Both a fresh install and an already-satisfied requirement count as success.
    success_markers = ("Successfully installed", "Requirement already satisfied")
    assert any(marker in stdout for marker in success_markers)

@pytest.mark.asyncio(loop_scope="session")
async def test_venv_cmd(computer):
    """Test executing shell commands in virtual environment."""
    # `python --version` prints e.g. "Python 3.11.9" on stdout.
    version_output, _ = await computer.venv_cmd("test_env", "python --version")

    assert "Python" in version_output
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_with_package(computer):
    """Test executing Python functions that use installed packages."""
    def report_requests_version():
        import requests
        return f"requests version: {requests.__version__}"

    result = await computer.venv_exec("test_env", report_requests_version)

    assert "requests version:" in result

@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_error_handling(computer):
    """Test error handling in venv_exec."""
    def raise_test_error():
        raise ValueError("This is a test error")

    # Remote exceptions must be re-raised locally with their message intact.
    with pytest.raises(ValueError, match="This is a test error"):
        await computer.venv_exec("test_env", raise_test_error)

@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_with_args_kwargs(computer):
    """Test executing Python functions with args and kwargs that return an object."""
    def build_profile(name, age, *hobbies, **metadata):
        return {
            "name": name,
            "age": age,
            "hobbies": list(hobbies),
            "metadata": metadata,
            "status": "active",
        }

    result = await computer.venv_exec(
        "test_env",
        build_profile,
        "Alice",
        25,
        "reading",
        "coding",
        location="New York",
        department="Engineering",
    )

    assert result["name"] == "Alice"
    assert result["age"] == 25
    assert result["hobbies"] == ["reading", "coding"]
    assert result["metadata"]["location"] == "New York"
    assert result["status"] == "active"
@pytest.mark.asyncio(loop_scope="session")
async def test_remote_decorator(computer):
    """Test the remote decorator from computer.helpers module."""
    # Register the shared computer so @sandboxed can resolve it implicitly.
    set_default_computer(computer)

    @sandboxed("test_env")
    def collect_runtime_info():
        import sys
        import platform
        return {
            "python_version": sys.version,
            "platform": platform.platform(),
            "success": True,
        }

    # Calling the decorated function executes it inside the sandboxed venv.
    info = await collect_runtime_info()

    assert "python_version" in info
    assert "platform" in info
    assert info["success"] == True
should have a different working directory 199 | # than the current test process 200 | assert result["working_directory"] != os.getcwd() 201 | 202 | if __name__ == "__main__": 203 | # Run tests directly 204 | pytest.main([__file__, "-v"]) 205 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/huggingfacelocal_adapter.py: -------------------------------------------------------------------------------- ```python 1 | import asyncio 2 | import functools 3 | import warnings 4 | from concurrent.futures import ThreadPoolExecutor 5 | from typing import Iterator, AsyncIterator, Dict, List, Any, Optional 6 | from litellm.types.utils import GenericStreamingChunk, ModelResponse 7 | from litellm.llms.custom_llm import CustomLLM 8 | from litellm import completion, acompletion 9 | 10 | # Try to import HuggingFace dependencies 11 | try: 12 | import torch 13 | from transformers import AutoModelForImageTextToText, AutoProcessor 14 | HF_AVAILABLE = True 15 | except ImportError: 16 | HF_AVAILABLE = False 17 | 18 | from .models import load_model as load_model_handler 19 | 20 | class HuggingFaceLocalAdapter(CustomLLM): 21 | """HuggingFace Local Adapter for running vision-language models locally.""" 22 | 23 | def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs): 24 | """Initialize the adapter. 25 | 26 | Args: 27 | device: Device to load model on ("auto", "cuda", "cpu", etc.) 
28 | trust_remote_code: Whether to trust remote code 29 | **kwargs: Additional arguments 30 | """ 31 | super().__init__() 32 | self.device = device 33 | self.trust_remote_code = trust_remote_code 34 | # Cache for model handlers keyed by model_name 35 | self._handlers: Dict[str, Any] = {} 36 | self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool 37 | 38 | def _get_handler(self, model_name: str): 39 | """Get or create a model handler for the given model name.""" 40 | if model_name not in self._handlers: 41 | self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code) 42 | return self._handlers[model_name] 43 | 44 | def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 45 | """Convert OpenAI format messages to HuggingFace format. 46 | 47 | Args: 48 | messages: Messages in OpenAI format 49 | 50 | Returns: 51 | Messages in HuggingFace format 52 | """ 53 | converted_messages = [] 54 | 55 | for message in messages: 56 | converted_message = { 57 | "role": message["role"], 58 | "content": [] 59 | } 60 | 61 | content = message.get("content", []) 62 | if isinstance(content, str): 63 | # Simple text content 64 | converted_message["content"].append({ 65 | "type": "text", 66 | "text": content 67 | }) 68 | elif isinstance(content, list): 69 | # Multi-modal content 70 | for item in content: 71 | if item.get("type") == "text": 72 | converted_message["content"].append({ 73 | "type": "text", 74 | "text": item.get("text", "") 75 | }) 76 | elif item.get("type") == "image_url": 77 | # Convert image_url format to image format 78 | image_url = item.get("image_url", {}).get("url", "") 79 | converted_message["content"].append({ 80 | "type": "image", 81 | "image": image_url 82 | }) 83 | 84 | converted_messages.append(converted_message) 85 | 86 | return converted_messages 87 | 88 | def _generate(self, **kwargs) -> str: 89 | """Generate response using the local HuggingFace 
model. 90 | 91 | Args: 92 | **kwargs: Keyword arguments containing messages and model info 93 | 94 | Returns: 95 | Generated text response 96 | """ 97 | if not HF_AVAILABLE: 98 | raise ImportError( 99 | "HuggingFace transformers dependencies not found. " 100 | "Please install with: pip install \"cua-agent[uitars-hf]\"" 101 | ) 102 | 103 | # Extract messages and model from kwargs 104 | messages = kwargs.get('messages', []) 105 | model_name = kwargs.get('model', 'ByteDance-Seed/UI-TARS-1.5-7B') 106 | max_new_tokens = kwargs.get('max_tokens', 128) 107 | 108 | # Warn about ignored kwargs 109 | ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'} 110 | if ignored_kwargs: 111 | warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}") 112 | 113 | # Convert messages to HuggingFace format 114 | hf_messages = self._convert_messages(messages) 115 | 116 | # Delegate to model handler 117 | handler = self._get_handler(model_name) 118 | generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens) 119 | return generated_text 120 | 121 | def completion(self, *args, **kwargs) -> ModelResponse: 122 | """Synchronous completion method. 123 | 124 | Returns: 125 | ModelResponse with generated text 126 | """ 127 | generated_text = self._generate(**kwargs) 128 | 129 | return completion( 130 | model=f"huggingface-local/{kwargs['model']}", 131 | mock_response=generated_text, 132 | ) 133 | 134 | async def acompletion(self, *args, **kwargs) -> ModelResponse: 135 | """Asynchronous completion method. 
136 | 137 | Returns: 138 | ModelResponse with generated text 139 | """ 140 | # Run _generate in thread pool to avoid blocking 141 | loop = asyncio.get_event_loop() 142 | generated_text = await loop.run_in_executor( 143 | self._executor, 144 | functools.partial(self._generate, **kwargs) 145 | ) 146 | 147 | return await acompletion( 148 | model=f"huggingface-local/{kwargs['model']}", 149 | mock_response=generated_text, 150 | ) 151 | 152 | def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: 153 | """Synchronous streaming method. 154 | 155 | Returns: 156 | Iterator of GenericStreamingChunk 157 | """ 158 | generated_text = self._generate(**kwargs) 159 | 160 | generic_streaming_chunk: GenericStreamingChunk = { 161 | "finish_reason": "stop", 162 | "index": 0, 163 | "is_finished": True, 164 | "text": generated_text, 165 | "tool_use": None, 166 | "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0}, 167 | } 168 | 169 | yield generic_streaming_chunk 170 | 171 | async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: 172 | """Asynchronous streaming method. 
173 | 174 | Returns: 175 | AsyncIterator of GenericStreamingChunk 176 | """ 177 | # Run _generate in thread pool to avoid blocking 178 | loop = asyncio.get_event_loop() 179 | generated_text = await loop.run_in_executor( 180 | self._executor, 181 | functools.partial(self._generate, **kwargs) 182 | ) 183 | 184 | generic_streaming_chunk: GenericStreamingChunk = { 185 | "finish_reason": "stop", 186 | "index": 0, 187 | "is_finished": True, 188 | "text": generated_text, 189 | "tool_use": None, 190 | "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0}, 191 | } 192 | 193 | yield generic_streaming_chunk ``` -------------------------------------------------------------------------------- /libs/python/som/som/util/utils.py: -------------------------------------------------------------------------------- ```python 1 | import easyocr 2 | import cv2 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from PIL import Image 6 | from typing import Union, List, Tuple, Any, Optional, cast, Sequence 7 | import time 8 | import signal 9 | from contextlib import contextmanager 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class TimeoutException(Exception): 16 | pass 17 | 18 | 19 | @contextmanager 20 | def timeout(seconds): 21 | def timeout_handler(signum, frame): 22 | logger.warning(f"OCR process timed out after {seconds} seconds") 23 | raise TimeoutException("OCR processing timed out") 24 | 25 | # Register the signal handler 26 | original_handler = signal.signal(signal.SIGALRM, timeout_handler) 27 | signal.alarm(seconds) 28 | 29 | try: 30 | yield 31 | finally: 32 | signal.alarm(0) 33 | signal.signal(signal.SIGALRM, original_handler) 34 | 35 | 36 | # Initialize EasyOCR with optimized settings 37 | logger.info("Initializing EasyOCR with optimized settings...") 38 | reader = easyocr.Reader( 39 | ["en"], 40 | gpu=True, # Use GPU if available 41 | model_storage_directory=None, # Use default directory 42 | 
download_enabled=True, 43 | detector=True, # Enable text detection 44 | recognizer=True, # Enable text recognition 45 | verbose=False, # Disable verbose output 46 | quantize=True, # Enable quantization for faster inference 47 | cudnn_benchmark=True, # Enable cuDNN benchmarking 48 | ) 49 | logger.info("EasyOCR initialization complete") 50 | 51 | 52 | def check_ocr_box( 53 | image_source: Union[str, Image.Image], 54 | display_img=True, 55 | output_bb_format="xywh", 56 | goal_filtering=None, 57 | easyocr_args=None, 58 | use_paddleocr=False, 59 | ) -> Tuple[Tuple[List[str], List[Tuple[float, float, float, float]]], Optional[Any]]: 60 | """Check OCR box using EasyOCR with optimized settings. 61 | 62 | Args: 63 | image_source: Either a file path or PIL Image 64 | display_img: Whether to display the annotated image 65 | output_bb_format: Format for bounding boxes ('xywh' or 'xyxy') 66 | goal_filtering: Optional filtering of results 67 | easyocr_args: Arguments for EasyOCR 68 | use_paddleocr: Ignored (kept for backward compatibility) 69 | 70 | Returns: 71 | Tuple containing: 72 | - Tuple of (text_list, bounding_boxes) 73 | - goal_filtering value 74 | """ 75 | logger.info("Starting OCR processing...") 76 | start_time = time.time() 77 | 78 | if isinstance(image_source, str): 79 | logger.info(f"Loading image from path: {image_source}") 80 | image_source = Image.open(image_source) 81 | if image_source.mode == "RGBA": 82 | logger.info("Converting RGBA image to RGB") 83 | image_source = image_source.convert("RGB") 84 | image_np = np.array(image_source) 85 | w, h = image_source.size 86 | logger.info(f"Image size: {w}x{h}") 87 | 88 | # Default EasyOCR arguments optimized for speed 89 | default_args = { 90 | "paragraph": False, # Disable paragraph detection 91 | "text_threshold": 0.5, # Confidence threshold 92 | "link_threshold": 0.4, # Text link threshold 93 | "canvas_size": 2560, # Max image size 94 | "mag_ratio": 1.0, # Magnification ratio 95 | "slope_ths": 0.1, # Slope 
threshold 96 | "ycenter_ths": 0.5, # Y-center threshold 97 | "height_ths": 0.5, # Height threshold 98 | "width_ths": 0.5, # Width threshold 99 | "add_margin": 0.1, # Margin around text 100 | "min_size": 20, # Minimum text size 101 | } 102 | 103 | # Update with user-provided arguments 104 | if easyocr_args: 105 | logger.info(f"Using custom EasyOCR arguments: {easyocr_args}") 106 | default_args.update(easyocr_args) 107 | 108 | try: 109 | # Use EasyOCR with timeout 110 | logger.info("Starting EasyOCR detection with 5 second timeout...") 111 | with timeout(5): # 5 second timeout 112 | # EasyOCR's readtext returns a list of tuples, where each tuple is (bbox, text, confidence) 113 | raw_result = reader.readtext(image_np, **default_args) 114 | result = cast(Sequence[Tuple[List[Tuple[float, float]], str, float]], raw_result) 115 | coord = [item[0] for item in result] # item[0] is the bbox coordinates 116 | text = [item[1] for item in result] # item[1] is the text content 117 | logger.info(f"OCR completed successfully. 
Found {len(text)} text regions") 118 | logger.info(f"Detected text: {text}") 119 | 120 | except TimeoutException: 121 | logger.error("OCR processing timed out after 5 seconds") 122 | coord = [] 123 | text = [] 124 | except Exception as e: 125 | logger.error(f"OCR processing failed with error: {str(e)}") 126 | coord = [] 127 | text = [] 128 | 129 | processing_time = time.time() - start_time 130 | logger.info(f"Total OCR processing time: {processing_time:.2f} seconds") 131 | 132 | if display_img: 133 | logger.info("Creating visualization of OCR results...") 134 | opencv_img = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR) 135 | bb = [] 136 | for item in coord: 137 | x, y, a, b = get_xywh(item) 138 | bb.append((x, y, a, b)) 139 | # Convert float coordinates to integers for cv2.rectangle 140 | x_val = cast(float, x) 141 | y_val = cast(float, y) 142 | a_val = cast(float, a) 143 | b_val = cast(float, b) 144 | x_int, y_int = int(x_val), int(y_val) 145 | a_int, b_int = int(a_val), int(b_val) 146 | cv2.rectangle( 147 | opencv_img, (x_int, y_int), (x_int + a_int, y_int + b_int), (0, 255, 0), 2 148 | ) 149 | plt.imshow(cv2.cvtColor(opencv_img, cv2.COLOR_BGR2RGB)) 150 | else: 151 | if output_bb_format == "xywh": 152 | bb = [get_xywh(item) for item in coord] 153 | elif output_bb_format == "xyxy": 154 | bb = [get_xyxy(item) for item in coord] 155 | 156 | # Cast the bounding boxes to the expected type 157 | bb = cast(List[Tuple[float, float, float, float]], bb) 158 | 159 | logger.info("OCR processing complete") 160 | return (text, bb), goal_filtering 161 | 162 | 163 | def get_xywh(box): 164 | """ 165 | Convert a bounding box to xywh format (x, y, width, height). 
166 | 167 | Args: 168 | box: Bounding box coordinates (various formats supported) 169 | 170 | Returns: 171 | Tuple of (x, y, width, height) 172 | """ 173 | # Handle different input formats 174 | if len(box) == 4: 175 | # If already in xywh format or xyxy format 176 | if isinstance(box[0], (int, float)) and isinstance(box[2], (int, float)): 177 | if box[2] < box[0] or box[3] < box[1]: 178 | # Already xyxy format, convert to xywh 179 | x1, y1, x2, y2 = box 180 | return x1, y1, x2 - x1, y2 - y1 181 | else: 182 | # Already in xywh format 183 | return box 184 | elif len(box) == 2: 185 | # Format like [[x1,y1],[x2,y2]] from some OCR engines 186 | (x1, y1), (x2, y2) = box 187 | return x1, y1, x2 - x1, y2 - y1 188 | 189 | # Default case - try to convert assuming it's a list of points 190 | x_coords = [p[0] for p in box] 191 | y_coords = [p[1] for p in box] 192 | x1, y1 = min(x_coords), min(y_coords) 193 | width, height = max(x_coords) - x1, max(y_coords) - y1 194 | return x1, y1, width, height 195 | 196 | 197 | def get_xyxy(box): 198 | """ 199 | Convert a bounding box to xyxy format (x1, y1, x2, y2). 200 | 201 | Args: 202 | box: Bounding box coordinates (various formats supported) 203 | 204 | Returns: 205 | Tuple of (x1, y1, x2, y2) 206 | """ 207 | # Get xywh first, then convert to xyxy 208 | x, y, w, h = get_xywh(box) 209 | return x, y, x + w, y + h 210 | ``` -------------------------------------------------------------------------------- /libs/python/agent/benchmarks/ss-v2.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 2 | """ 3 | ScreenSpot-v2 Benchmark Script 4 | 5 | Evaluates models on the ScreenSpot-v2 dataset for click prediction accuracy. 6 | Supports both ComputerAgent model strings and custom model classes. 
7 | """ 8 | 9 | import argparse 10 | import asyncio 11 | import random 12 | import statistics 13 | import time 14 | from typing import Optional 15 | 16 | from datasets import load_dataset 17 | from tqdm import tqdm 18 | 19 | from utils import ( 20 | ModelWrapper, 21 | is_click_in_bbox, 22 | save_results_to_markdown, 23 | save_visualizations, 24 | get_available_models, 25 | get_gpu_memory 26 | ) 27 | 28 | 29 | async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Optional[int] = None) -> dict: 30 | """ 31 | Evaluate a model on any iterable of samples. 32 | 33 | Args: 34 | model_wrapper: ModelWrapper instance 35 | samples: Iterable of dicts with keys: image, bbox, instruction 36 | max_samples: Maximum number of samples to evaluate (None for all) 37 | 38 | Returns: 39 | Dictionary with evaluation results 40 | """ 41 | print(f"\nEvaluating model: {model_wrapper.model_name}") 42 | 43 | # Load model 44 | await model_wrapper.load_model() 45 | 46 | # Convert to list if needed and limit samples 47 | if hasattr(samples, '__len__'): 48 | total_samples = len(samples) 49 | if max_samples is not None: 50 | total_samples = min(max_samples, total_samples) 51 | sample_list = list(samples)[:total_samples] 52 | else: 53 | # For iterators, take max_samples or all 54 | sample_list = list(samples) 55 | if max_samples is not None: 56 | sample_list = sample_list[:max_samples] 57 | total_samples = len(sample_list) 58 | 59 | correct_predictions = 0 60 | error_predictions = 0 61 | results = [] 62 | 63 | for i, sample in enumerate(tqdm(sample_list, desc=f"Evaluating {model_wrapper.model_name}")): 64 | # Extract required data (only these 3 keys matter) 65 | image = sample['image'] 66 | instruction = sample['instruction'] 67 | bbox = sample['bbox'] # [x1, y1, x2, y2] 68 | 69 | # Predict click coordinates with timing 70 | start_time = time.time() 71 | click_coords = await model_wrapper.predict_click(image, instruction) 72 | prediction_time = time.time() - start_time 73 | 74 
| # Check if prediction is correct 75 | is_correct = is_click_in_bbox(click_coords, bbox) 76 | 77 | if is_correct: 78 | correct_predictions += 1 79 | 80 | results.append({ 81 | 'sample_idx': i, 82 | 'instruction': instruction, 83 | 'bbox': bbox, 84 | 'predicted_coords': click_coords, 85 | 'is_correct': is_correct, 86 | 'failed': False, 87 | 'prediction_time': prediction_time 88 | }) 89 | 90 | # Unload model 91 | await model_wrapper.unload_model() 92 | 93 | # Calculate metrics 94 | accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0 95 | error_rate = error_predictions / total_samples if total_samples > 0 else 0.0 96 | 97 | # Calculate timing statistics 98 | successful_times = [r['prediction_time'] for r in results if not r['failed']] 99 | avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0 100 | median_prediction_time = statistics.median(successful_times) if successful_times else 0.0 101 | min_prediction_time = min(successful_times) if successful_times else 0.0 102 | max_prediction_time = max(successful_times) if successful_times else 0.0 103 | 104 | # Get VRAM statistics 105 | vram_stats = model_wrapper.get_vram_stats() 106 | 107 | return { 108 | 'model_name': model_wrapper.model_name, 109 | 'total_samples': total_samples, 110 | 'correct_predictions': correct_predictions, 111 | 'failed_predictions': error_predictions, 112 | 'accuracy': accuracy, 113 | 'failure_rate': error_rate, 114 | 'avg_prediction_time': avg_prediction_time, 115 | 'median_prediction_time': median_prediction_time, 116 | 'min_prediction_time': min_prediction_time, 117 | 'max_prediction_time': max_prediction_time, 118 | 'vram_max_mb': vram_stats['max_mb'], 119 | 'vram_avg_mb': vram_stats['avg_mb'], 120 | 'results': results 121 | } 122 | 123 | 124 | async def main(): 125 | """ 126 | Main function to run the benchmark. 
127 | """ 128 | # Parse command line arguments 129 | parser = argparse.ArgumentParser(description='ScreenSpot-v2 Benchmark Script') 130 | parser.add_argument('--samples', type=int, default=500, 131 | help='Number of samples to evaluate (default: 500)') 132 | parser.add_argument('--seed', type=int, default=42, 133 | help='Random seed for shuffling (default: 42)') 134 | args = parser.parse_args() 135 | 136 | # Set random seed 137 | random.seed(args.seed) 138 | 139 | # Load dataset 140 | print("Loading ScreenSpot-v2 dataset...") 141 | ds = load_dataset("lmms-lab/ScreenSpot-v2") 142 | dataset = ds['train'] # type: ignore 143 | # Convert to simple list of dicts with only required keys 144 | samples = [] 145 | for item in dataset: 146 | # Convert dataset item to dict if needed 147 | item_dict = dict(item) if hasattr(item, 'keys') else item 148 | 149 | # Convert ScreenSpot-v2 bbox format [x, y, w, h] to [x1, y1, x2, y2] 150 | bbox_xywh = item_dict['bbox'] # type: ignore 151 | x, y, w, h = bbox_xywh 152 | bbox_xyxy = [x, y, x + w, y + h] 153 | 154 | samples.append({ 155 | 'image': item_dict['image'], # type: ignore 156 | 'instruction': item_dict['instruction'], # type: ignore 157 | 'bbox': bbox_xyxy 158 | }) 159 | print(f"Dataset loaded: {len(samples)} samples") 160 | 161 | # Shuffle samples with seed 162 | random.shuffle(samples) 163 | print(f"Samples shuffled with seed {args.seed}") 164 | 165 | # Get available models 166 | models = get_available_models() 167 | 168 | # Evaluation settings 169 | max_samples = args.samples # Use command line argument 170 | 171 | # Run evaluations 172 | all_results = [] 173 | 174 | for model in models: 175 | model_wrapper = ModelWrapper(model) 176 | result = await evaluate_model(model_wrapper, samples, max_samples) 177 | all_results.append(result) 178 | 179 | # Print summary 180 | print(f"\n{result['model_name']} Results:") 181 | print(f" Accuracy: {result['accuracy']*100:.2f}%") 182 | print(f" Correct: 
{result['correct_predictions']}/{result['total_samples']}") 183 | print(f" Errors: {result['failed_predictions']}") 184 | print(f" Error Rate: {result['failure_rate']*100:.2f}%") 185 | print(f" Avg Time: {result['avg_prediction_time']:.2f}s") 186 | print(f" Median Time: {result['median_prediction_time']:.2f}s") 187 | print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s") 188 | print(f" VRAM Max: {result['vram_max_mb']:.1f}MB") 189 | print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB") 190 | 191 | # Print GPU memory info 192 | gpu_memory = get_gpu_memory() 193 | if gpu_memory and gpu_memory[0] > 0: 194 | print(f" GPU Free Memory: {gpu_memory[0]:.1f}MB") 195 | 196 | # Save results 197 | if all_results: 198 | save_results_to_markdown(all_results, "screenspot_v2_results.md", title="ScreenSpot-v2 Benchmark Results") 199 | save_visualizations(all_results, samples) 200 | print("\nBenchmark completed successfully!") 201 | else: 202 | print("\nNo successful evaluations completed.") 203 | 204 | 205 | if __name__ == "__main__": 206 | asyncio.run(main()) ``` -------------------------------------------------------------------------------- /blog/ubuntu-docker-support.md: -------------------------------------------------------------------------------- ```markdown 1 | # Ubuntu Docker Support in Cua with Kasm 2 | 3 | *Published Aug 26, 2025 by Francesco Bonacci* 4 | 5 | Today we’re shipping **Ubuntu Docker support** in Cua. You get a full Linux desktop inside a Docker container, viewable right in your browser—no VM spin-up, no extra clients. It behaves the same on macOS, Windows, and Linux. 
6 | 7 | <img src="./assets/docker-ubuntu-support.png" alt="Cua + KasmVNC Ubuntu container desktop"> 8 | 9 | ## Why we did this 10 | 11 | If you build automation or RL workflows with Cua, you’ve probably run into the usual platform walls: macOS VMs (via Lume) are Apple-Silicon only; Windows Sandbox needs Pro/Enterprise; giving agents your host desktop is… exciting, but risky; and little OS quirks make “build once, run anywhere” harder than it should be. 12 | 13 | We wanted something lightweight, isolated, and identical across machines. So we put a desktop in a container. 14 | 15 | ## Why we didn’t use QEMU/KVM 16 | 17 | Short answer: **portability, startup time, and ops friction.** 18 | 19 | * **Runs everywhere, no hypervisor drama.** KVM needs Linux; Hyper-V/Virtualization.Framework setups vary by host and policy. Docker is ubiquitous across macOS/Windows/Linux and allowed in most CI runners—so your GUI env actually runs where your team works. 20 | * **Faster boot & smaller footprints.** Containers cold-start in seconds and images are GB-scale; VMs tend to be minutes and tens of GB. That matters for parallel agents, CI, and local iteration. 21 | * **Lower ops overhead.** No nested virt, kernel modules, or privileged host tweaks that many orgs (and cloud runners) block. Pull → run → browser. 22 | * **Same image, everywhere.** One Docker image gives you an identical desktop on every dev laptop and in CI. 23 | * **Web-first access out of the box.** KasmVNC serves the desktop over HTTP—no extra VNC/RDP clients or SPICE config. 24 | 25 | **When we *do* reach for QEMU/KVM:** 26 | 27 | * You need **true OS isolation** or to run **non-Linux** guests. 28 | * You want **kernel-level features** or **device/GPU passthrough** (VFIO). 29 | * You’re optimizing for **hardware realism** over startup speed and density. 30 | 31 | For this release, the goal was a **cross-platform Linux desktop that feels instant and identical** across local dev and CI. 
Containers + KasmVNC hit that sweet spot. 32 | 33 | ## What we built 34 | 35 | Under the hood it’s **KasmVNC + Ubuntu 22.04 (Xfce) in Docker**, pre-configured for computer-use automation. You get a proper GUI desktop served over HTTP (no VNC/RDP client), accessible from any modern browser. Cua’s Computer server boots automatically so your agents can connect immediately. 36 | 37 | ### How it works (at a glance) 38 | 39 | ``` 40 | Your System 41 | └─ Docker Container 42 | └─ Xfce Desktop + KasmVNC → open in your browser 43 | ``` 44 | 45 | --- 46 | 47 | ## Quick start 48 | 49 | 1. **Install Docker** — Docker Desktop (macOS/Windows) or Docker Engine (Linux). 50 | 51 | 2. **Pull or build the image** 52 | 53 | ```bash 54 | # Pull (recommended) 55 | docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest 56 | 57 | # Or build locally 58 | cd libs/kasm 59 | docker build -t cua-ubuntu:latest . 60 | ``` 61 | 62 | 3. **Run with Cua’s Computer SDK** 63 | 64 | ```python 65 | from computer import Computer 66 | 67 | computer = Computer( 68 | os_type="linux", 69 | provider_type="docker", 70 | image="trycua/cua-ubuntu:latest", 71 | name="my-automation-container" 72 | ) 73 | 74 | await computer.run() 75 | ``` 76 | 77 | ### Make an agent that drives this desktop 78 | 79 | ```python 80 | from agent import ComputerAgent 81 | 82 | # assumes `computer` is the instance created above 83 | agent = ComputerAgent("openrouter/z-ai/glm-4.5v", tools=[computer]) 84 | 85 | async for _ in agent.run("Click on the search bar and type 'hello world'"): 86 | pass 87 | ``` 88 | 89 | > Use any VLM with tool use; just make sure your OpenRouter creds are set. 90 | 91 | By default you land on **Ubuntu 22.04 + Xfce** with a browser and desktop basics, the **Computer server** is running, the **web viewer** is available at `http://localhost:8006`, and common automation tools are preinstalled. 
92 | 93 | --- 94 | 95 | ## What’s inside (in plain English) 96 | 97 | A tidy Linux desktop with web access through **KasmVNC**, Python 3.11 and dev tools, plus utilities you’ll actually use for automation—`wmctrl` for windows, `xclip` for clipboard, `ffmpeg` for media, screenshot helpers, and so on. It starts as a **non-root `kasm-user`**, lives in an **isolated filesystem** (unless you mount volumes), and ships with **SSL off for local dev** so you terminate TLS upstream when you deploy. 98 | 99 | --- 100 | 101 | ## How it compares 102 | 103 | | Feature | KasmVNC Docker | Lume (macOS VM) | Windows Sandbox | 104 | | ---------------- | --------------------- | --------------------- | ---------------------- | 105 | | Platform support | macOS, Windows, Linux | macOS (Apple Silicon) | Windows Pro/Enterprise | 106 | | Resource usage | Low (container) | Medium (full VM) | Medium (full VM) | 107 | | Setup time | \~30s | 2–5 min | 1–2 min | 108 | | GUI desktop | Linux | macOS | Windows | 109 | | Web access | Browser (no client) | Typically VNC client | Typically RDP client | 110 | | Consistency | Same everywhere | Hardware-dependent | OS-dependent | 111 | 112 | **Use KasmVNC Docker when…** you want the **same GUI env across devs/CI/platforms**, you’re doing **RL or end-to-end GUI tests**, or you need **many isolated desktops on one machine**. 113 | **Use alternatives when…** you need native **macOS** (→ Lume) or native **Windows** (→ Windows Sandbox). 
114 | 115 | --- 116 | 117 | ## Using the Agent Framework (parallel example) 118 | 119 | A compact pattern for running multiple desktops and agents side-by-side: 120 | 121 | ```python 122 | import asyncio 123 | from computer import Computer 124 | from agent import ComputerAgent 125 | 126 | # Create multiple computer instances (each gets its own desktop) 127 | computers = [] 128 | for i in range(3): 129 | c = Computer( 130 | os_type="linux", 131 | provider_type="docker", 132 | image="trycua/cua-ubuntu:latest", 133 | name=f"parallel-desktop-{i}" 134 | ) 135 | computers.append(c) 136 | await c.run() 137 | 138 | # Pair each desktop with a task 139 | tasks = [ 140 | "open github and search for 'trycua/cua'", 141 | "open a text editor and write 'hello world'", 142 | "open the browser and go to google.com", 143 | ] 144 | 145 | agents = [ 146 | ComputerAgent(model="openrouter/z-ai/glm-4.5v", tools=[c]) 147 | for c in computers 148 | ] 149 | 150 | async def run_agent(agent, task): 151 | async for _ in agent.run(task): 152 | pass 153 | 154 | await asyncio.gather(*[run_agent(a, t) for a, t in zip(agents, tasks)]) 155 | ``` 156 | 157 | --- 158 | 159 | ## What’s next 160 | 161 | We’re polishing a **CLI to push/scale these containers on Cua Cloud**, exploring **GPU acceleration** for in-container inference, and publishing **prebuilt images** for Playwright, Selenium, and friends. 
162 | 163 | --- 164 | 165 | ## Try it 166 | 167 | ```python 168 | from computer import Computer 169 | computer = Computer(os_type="linux", provider_type="docker", image="trycua/cua-ubuntu:latest") 170 | await computer.run() 171 | ``` 172 | 173 | --- 174 | 175 | ## Links 176 | 177 | * **Docker Provider Docs:** [https://docs.trycua.com/computers/docker](https://docs.trycua.com/computers/docker) 178 | * **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC) 179 | * **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm) 180 | * **Computer SDK:** [https://docs.trycua.com/docs/computer-sdk/computers](https://docs.trycua.com/docs/computer-sdk/computers) 181 | * **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai) 182 | 183 | Questions or weird edge cases? Ping us on Discord—we’re curious to see what you build. ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/callbacks/telemetry.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Telemetry callback handler for Computer-Use Agent (cua-agent) 3 | """ 4 | 5 | import time 6 | import uuid 7 | from typing import List, Dict, Any, Optional, Union 8 | 9 | from .base import AsyncCallbackHandler 10 | from core.telemetry import ( 11 | record_event, 12 | is_telemetry_enabled, 13 | ) 14 | 15 | import platform 16 | 17 | SYSTEM_INFO = { 18 | "os": platform.system().lower(), 19 | "os_version": platform.release(), 20 | "python_version": platform.python_version(), 21 | } 22 | 23 | class TelemetryCallback(AsyncCallbackHandler): 24 | """ 25 | Telemetry callback handler for Computer-Use Agent (cua-agent) 26 | 27 | Tracks agent usage, performance metrics, and optionally trajectory data. 
28 | """ 29 | 30 | def __init__( 31 | self, 32 | agent, 33 | log_trajectory: bool = False 34 | ): 35 | """ 36 | Initialize telemetry callback. 37 | 38 | Args: 39 | agent: The ComputerAgent instance 40 | log_trajectory: Whether to log full trajectory items (opt-in) 41 | """ 42 | self.agent = agent 43 | self.log_trajectory = log_trajectory 44 | 45 | # Generate session/run IDs 46 | self.session_id = str(uuid.uuid4()) 47 | self.run_id = None 48 | 49 | # Track timing and metrics 50 | self.run_start_time = None 51 | self.step_count = 0 52 | self.step_start_time = None 53 | self.total_usage = { 54 | "prompt_tokens": 0, 55 | "completion_tokens": 0, 56 | "total_tokens": 0, 57 | "response_cost": 0.0 58 | } 59 | 60 | # Record agent initialization 61 | if is_telemetry_enabled(): 62 | self._record_agent_initialization() 63 | 64 | def _record_agent_initialization(self) -> None: 65 | """Record agent type/model and session initialization.""" 66 | agent_info = { 67 | "session_id": self.session_id, 68 | "agent_type": self.agent.agent_loop.__name__ if hasattr(self.agent, 'agent_loop') else 'unknown', 69 | "model": getattr(self.agent, 'model', 'unknown'), 70 | **SYSTEM_INFO 71 | } 72 | 73 | record_event("agent_session_start", agent_info) 74 | 75 | async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None: 76 | """Called at the start of an agent run loop.""" 77 | if not is_telemetry_enabled(): 78 | return 79 | 80 | self.run_id = str(uuid.uuid4()) 81 | self.run_start_time = time.time() 82 | self.step_count = 0 83 | 84 | # Calculate input context size 85 | input_context_size = self._calculate_context_size(old_items) 86 | 87 | run_data = { 88 | "session_id": self.session_id, 89 | "run_id": self.run_id, 90 | "start_time": self.run_start_time, 91 | "input_context_size": input_context_size, 92 | "num_existing_messages": len(old_items) 93 | } 94 | 95 | # Log trajectory if opted in 96 | if self.log_trajectory: 97 | trajectory = 
self._extract_trajectory(old_items) 98 | if trajectory: 99 | run_data["uploaded_trajectory"] = trajectory 100 | 101 | record_event("agent_run_start", run_data) 102 | 103 | async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None: 104 | """Called at the end of an agent run loop.""" 105 | if not is_telemetry_enabled() or not self.run_start_time: 106 | return 107 | 108 | run_duration = time.time() - self.run_start_time 109 | 110 | run_data = { 111 | "session_id": self.session_id, 112 | "run_id": self.run_id, 113 | "end_time": time.time(), 114 | "duration_seconds": run_duration, 115 | "num_steps": self.step_count, 116 | "total_usage": self.total_usage.copy() 117 | } 118 | 119 | # Log trajectory if opted in 120 | if self.log_trajectory: 121 | trajectory = self._extract_trajectory(new_items) 122 | if trajectory: 123 | run_data["uploaded_trajectory"] = trajectory 124 | 125 | record_event("agent_run_end", run_data) 126 | 127 | async def on_usage(self, usage: Dict[str, Any]) -> None: 128 | """Called when usage information is received.""" 129 | if not is_telemetry_enabled(): 130 | return 131 | 132 | # Accumulate usage stats 133 | self.total_usage["prompt_tokens"] += usage.get("prompt_tokens", 0) 134 | self.total_usage["completion_tokens"] += usage.get("completion_tokens", 0) 135 | self.total_usage["total_tokens"] += usage.get("total_tokens", 0) 136 | self.total_usage["response_cost"] += usage.get("response_cost", 0.0) 137 | 138 | # Record individual usage event 139 | usage_data = { 140 | "session_id": self.session_id, 141 | "run_id": self.run_id, 142 | "step": self.step_count, 143 | **usage 144 | } 145 | 146 | record_event("agent_usage", usage_data) 147 | 148 | async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None: 149 | """Called when responses are received.""" 150 | if not is_telemetry_enabled(): 151 | return 152 | 153 | self.step_count += 1 154 | step_duration = None 155 
| 156 | if self.step_start_time: 157 | step_duration = time.time() - self.step_start_time 158 | 159 | self.step_start_time = time.time() 160 | 161 | step_data = { 162 | "session_id": self.session_id, 163 | "run_id": self.run_id, 164 | "step": self.step_count, 165 | "timestamp": self.step_start_time 166 | } 167 | 168 | if step_duration is not None: 169 | step_data["duration_seconds"] = step_duration 170 | 171 | record_event("agent_step", step_data) 172 | 173 | def _calculate_context_size(self, items: List[Dict[str, Any]]) -> int: 174 | """Calculate approximate context size in tokens/characters.""" 175 | total_size = 0 176 | 177 | for item in items: 178 | if item.get("type") == "message" and "content" in item: 179 | content = item["content"] 180 | if isinstance(content, str): 181 | total_size += len(content) 182 | elif isinstance(content, list): 183 | for part in content: 184 | if isinstance(part, dict) and "text" in part: 185 | total_size += len(part["text"]) 186 | elif "content" in item and isinstance(item["content"], str): 187 | total_size += len(item["content"]) 188 | 189 | return total_size 190 | 191 | def _extract_trajectory(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 192 | """Extract trajectory items that should be logged.""" 193 | trajectory = [] 194 | 195 | for item in items: 196 | # Include user messages, assistant messages, reasoning, computer calls, and computer outputs 197 | if ( 198 | item.get("role") == "user" or # User inputs 199 | (item.get("type") == "message" and item.get("role") == "assistant") or # Model outputs 200 | item.get("type") == "reasoning" or # Reasoning traces 201 | item.get("type") == "computer_call" or # Computer actions 202 | item.get("type") == "computer_call_output" # Computer outputs 203 | ): 204 | # Create a copy of the item with timestamp 205 | trajectory_item = item.copy() 206 | trajectory_item["logged_at"] = time.time() 207 | trajectory.append(trajectory_item) 208 | 209 | return trajectory ``` 
-------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/handlers/base.py: -------------------------------------------------------------------------------- ```python 1 | from abc import ABC, abstractmethod 2 | from typing import Optional, Dict, Any, List, Tuple 3 | 4 | class BaseAccessibilityHandler(ABC): 5 | """Abstract base class for OS-specific accessibility handlers.""" 6 | 7 | @abstractmethod 8 | async def get_accessibility_tree(self) -> Dict[str, Any]: 9 | """Get the accessibility tree of the current window.""" 10 | pass 11 | 12 | @abstractmethod 13 | async def find_element(self, role: Optional[str] = None, 14 | title: Optional[str] = None, 15 | value: Optional[str] = None) -> Dict[str, Any]: 16 | """Find an element in the accessibility tree by criteria.""" 17 | pass 18 | 19 | class BaseFileHandler(ABC): 20 | """Abstract base class for OS-specific file handlers.""" 21 | 22 | @abstractmethod 23 | async def file_exists(self, path: str) -> Dict[str, Any]: 24 | """Check if a file exists at the specified path.""" 25 | pass 26 | 27 | @abstractmethod 28 | async def directory_exists(self, path: str) -> Dict[str, Any]: 29 | """Check if a directory exists at the specified path.""" 30 | pass 31 | 32 | @abstractmethod 33 | async def list_dir(self, path: str) -> Dict[str, Any]: 34 | """List the contents of a directory.""" 35 | pass 36 | 37 | @abstractmethod 38 | async def read_text(self, path: str) -> Dict[str, Any]: 39 | """Read the text contents of a file.""" 40 | pass 41 | 42 | @abstractmethod 43 | async def write_text(self, path: str, content: str) -> Dict[str, Any]: 44 | """Write text content to a file.""" 45 | pass 46 | 47 | @abstractmethod 48 | async def write_bytes(self, path: str, content_b64: str) -> Dict[str, Any]: 49 | """Write binary content to a file. 
Sent over the websocket as a base64 string.""" 50 | pass 51 | 52 | @abstractmethod 53 | async def delete_file(self, path: str) -> Dict[str, Any]: 54 | """Delete a file.""" 55 | pass 56 | 57 | @abstractmethod 58 | async def create_dir(self, path: str) -> Dict[str, Any]: 59 | """Create a directory.""" 60 | pass 61 | 62 | @abstractmethod 63 | async def delete_dir(self, path: str) -> Dict[str, Any]: 64 | """Delete a directory.""" 65 | pass 66 | 67 | @abstractmethod 68 | async def read_bytes(self, path: str, offset: int = 0, length: Optional[int] = None) -> Dict[str, Any]: 69 | """Read the binary contents of a file. Sent over the websocket as a base64 string. 70 | 71 | Args: 72 | path: Path to the file 73 | offset: Byte offset to start reading from (default: 0) 74 | length: Number of bytes to read (default: None for entire file) 75 | """ 76 | pass 77 | 78 | @abstractmethod 79 | async def get_file_size(self, path: str) -> Dict[str, Any]: 80 | """Get the size of a file in bytes.""" 81 | pass 82 | 83 | class BaseAutomationHandler(ABC): 84 | """Abstract base class for OS-specific automation handlers. 
85 | 86 | Categories: 87 | - Mouse Actions: Methods for mouse control 88 | - Keyboard Actions: Methods for keyboard input 89 | - Scrolling Actions: Methods for scrolling 90 | - Screen Actions: Methods for screen interaction 91 | - Clipboard Actions: Methods for clipboard operations 92 | """ 93 | 94 | # Mouse Actions 95 | @abstractmethod 96 | async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]: 97 | """Perform a mouse down at the current or specified position.""" 98 | pass 99 | 100 | @abstractmethod 101 | async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]: 102 | """Perform a mouse up at the current or specified position.""" 103 | pass 104 | 105 | @abstractmethod 106 | async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: 107 | """Perform a left click at the current or specified position.""" 108 | pass 109 | 110 | @abstractmethod 111 | async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: 112 | """Perform a right click at the current or specified position.""" 113 | pass 114 | 115 | @abstractmethod 116 | async def double_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: 117 | """Perform a double click at the current or specified position.""" 118 | pass 119 | 120 | @abstractmethod 121 | async def move_cursor(self, x: int, y: int) -> Dict[str, Any]: 122 | """Move the cursor to the specified position.""" 123 | pass 124 | 125 | @abstractmethod 126 | async def drag_to(self, x: int, y: int, button: str = "left", duration: float = 0.5) -> Dict[str, Any]: 127 | """Drag the cursor from current position to specified coordinates. 
128 | 129 | Args: 130 | x: The x coordinate to drag to 131 | y: The y coordinate to drag to 132 | button: The mouse button to use ('left', 'middle', 'right') 133 | duration: How long the drag should take in seconds 134 | """ 135 | pass 136 | 137 | @abstractmethod 138 | async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> Dict[str, Any]: 139 | """Drag the cursor from current position to specified coordinates. 140 | 141 | Args: 142 | path: A list of tuples of x and y coordinates to drag to 143 | button: The mouse button to use ('left', 'middle', 'right') 144 | duration: How long the drag should take in seconds 145 | """ 146 | pass 147 | 148 | # Keyboard Actions 149 | @abstractmethod 150 | async def key_down(self, key: str) -> Dict[str, Any]: 151 | """Press and hold the specified key.""" 152 | pass 153 | 154 | @abstractmethod 155 | async def key_up(self, key: str) -> Dict[str, Any]: 156 | """Release the specified key.""" 157 | pass 158 | 159 | @abstractmethod 160 | async def type_text(self, text: str) -> Dict[str, Any]: 161 | """Type the specified text.""" 162 | pass 163 | 164 | @abstractmethod 165 | async def press_key(self, key: str) -> Dict[str, Any]: 166 | """Press the specified key.""" 167 | pass 168 | 169 | @abstractmethod 170 | async def hotkey(self, keys: List[str]) -> Dict[str, Any]: 171 | """Press a combination of keys together.""" 172 | pass 173 | 174 | # Scrolling Actions 175 | @abstractmethod 176 | async def scroll(self, x: int, y: int) -> Dict[str, Any]: 177 | """Scroll the specified amount.""" 178 | pass 179 | 180 | @abstractmethod 181 | async def scroll_down(self, clicks: int = 1) -> Dict[str, Any]: 182 | """Scroll down by the specified number of clicks.""" 183 | pass 184 | 185 | @abstractmethod 186 | async def scroll_up(self, clicks: int = 1) -> Dict[str, Any]: 187 | """Scroll up by the specified number of clicks.""" 188 | pass 189 | 190 | # Screen Actions 191 | @abstractmethod 192 | async def 
screenshot(self) -> Dict[str, Any]: 193 | """Take a screenshot and return base64 encoded image data.""" 194 | pass 195 | 196 | @abstractmethod 197 | async def get_screen_size(self) -> Dict[str, Any]: 198 | """Get the screen size of the VM.""" 199 | pass 200 | 201 | @abstractmethod 202 | async def get_cursor_position(self) -> Dict[str, Any]: 203 | """Get the current cursor position.""" 204 | pass 205 | 206 | # Clipboard Actions 207 | @abstractmethod 208 | async def copy_to_clipboard(self) -> Dict[str, Any]: 209 | """Get the current clipboard content.""" 210 | pass 211 | 212 | @abstractmethod 213 | async def set_clipboard(self, text: str) -> Dict[str, Any]: 214 | """Set the clipboard content.""" 215 | pass 216 | 217 | @abstractmethod 218 | async def run_command(self, command: str) -> Dict[str, Any]: 219 | """Run a command and return the output.""" 220 | pass ``` -------------------------------------------------------------------------------- /Development.md: -------------------------------------------------------------------------------- ```markdown 1 | # Getting Started 2 | 3 | ## Project Structure 4 | 5 | The project is organized as a monorepo with these main packages: 6 | 7 | - `libs/core/` - Base package with telemetry support 8 | - `libs/computer/` - Computer-use interface (CUI) library 9 | - `libs/agent/` - AI agent library with multi-provider support 10 | - `libs/som/` - Set-of-Mark parser 11 | - `libs/computer-server/` - Server component for VM 12 | - `libs/lume/` - Lume CLI 13 | - `libs/pylume/` - Python bindings for Lume 14 | 15 | Each package has its own virtual environment and dependencies, managed through PDM. 16 | 17 | ## Local Development Setup 18 | 19 | 1. Install Lume CLI: 20 | 21 | ```bash 22 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" 23 | ``` 24 | 25 | 2. Clone the repository: 26 | 27 | ```bash 28 | git clone https://github.com/trycua/cua.git 29 | cd cua 30 | ``` 31 | 32 | 3. 
Create a `.env.local` file in the root directory with your API keys: 33 | 34 | ```bash 35 | # Required for Anthropic provider 36 | ANTHROPIC_API_KEY=your_anthropic_key_here 37 | 38 | # Required for OpenAI provider 39 | OPENAI_API_KEY=your_openai_key_here 40 | ``` 41 | 42 | 4. Open the workspace in VSCode or Cursor: 43 | 44 | ```bash 45 | # For Cua Python development 46 | code .vscode/py.code-workspace 47 | 48 | # For Lume (Swift) development 49 | code .vscode/lume.code-workspace 50 | ``` 51 | 52 | Using the workspace file is strongly recommended as it: 53 | 54 | - Sets up correct Python environments for each package 55 | - Configures proper import paths 56 | - Enables debugging configurations 57 | - Maintains consistent settings across packages 58 | 59 | ## Lume Development 60 | 61 | Refer to the [Lume README](./libs/lume/Development.md) for instructions on how to develop the Lume CLI. 62 | 63 | ## Python Development 64 | 65 | There are two ways to set up the Python development environment: 66 | 67 | ### Run the build script 68 | 69 | Run the build script to set up all packages: 70 | 71 | ```bash 72 | ./scripts/build.sh 73 | ``` 74 | 75 | The build script creates a shared virtual environment for all packages. The workspace configuration automatically handles import paths with the correct Python path settings. 76 | 77 | This will: 78 | 79 | - Create a virtual environment for the project 80 | - Install all packages in development mode 81 | - Set up the correct Python path 82 | - Install development tools 83 | 84 | ### Install with PDM 85 | 86 | If PDM is not already installed, you can follow the installation instructions [here](https://pdm-project.org/en/latest/#installation). 87 | 88 | To install with PDM, simply run: 89 | 90 | ```console 91 | pdm install -G:all 92 | ``` 93 | 94 | This installs all the dependencies for development, testing, and building the docs.
If you'd only like development dependencies, you can run: 95 | 96 | ```console 97 | pdm install -d 98 | ``` 99 | 100 | ## Running Examples 101 | 102 | The Python workspace includes launch configurations for all packages: 103 | 104 | - "Run Computer Examples" - Runs computer examples 105 | - "Run Agent Examples" - Runs agent examples 106 | - "SOM" configurations - Various settings for running SOM 107 | 108 | To run examples from VSCode / Cursor: 109 | 110 | 1. Press F5 or use the Run/Debug view 111 | 2. Select the desired configuration 112 | 113 | The workspace also includes compound launch configurations: 114 | 115 | - "Run Computer Examples + Server" - Runs both the Computer Examples and Server simultaneously 116 | 117 | ## Docker Development Environment 118 | 119 | As an alternative to installing directly on your host machine, you can use Docker for development. This approach has several advantages: 120 | 121 | ### Prerequisites 122 | 123 | - Docker installed on your machine 124 | - Lume server running on your host (port 7777): `lume serve` 125 | 126 | ### Setup and Usage 127 | 128 | 1. Build the development Docker image: 129 | 130 | ```bash 131 | ./scripts/run-docker-dev.sh build 132 | ``` 133 | 134 | 2. Run an example in the container: 135 | 136 | ```bash 137 | ./scripts/run-docker-dev.sh run computer_examples.py 138 | ``` 139 | 140 | 3. Get an interactive shell in the container: 141 | 142 | ```bash 143 | ./scripts/run-docker-dev.sh run --interactive 144 | ``` 145 | 146 | 4. 
Stop any running containers: 147 | 148 | ```bash 149 | ./scripts/run-docker-dev.sh stop 150 | ``` 151 | 152 | ### How it Works 153 | 154 | The Docker development environment: 155 | 156 | - Installs all required Python dependencies in the container 157 | - Mounts your source code from the host at runtime 158 | - Automatically configures the connection to use host.docker.internal:7777 for accessing the Lume server on your host machine 159 | - Preserves your code changes without requiring rebuilds (source code is mounted as a volume) 160 | 161 | > **Note**: The Docker container doesn't include the macOS-specific Lume executable. Instead, it connects to the Lume server running on your host machine via host.docker.internal:7777. Make sure to start the Lume server on your host before running examples in the container. 162 | 163 | ## Cleanup and Reset 164 | 165 | If you need to clean up the environment (non-docker) and start fresh: 166 | 167 | ```bash 168 | ./scripts/cleanup.sh 169 | ``` 170 | 171 | This will: 172 | 173 | - Remove all virtual environments 174 | - Clean Python cache files and directories 175 | - Remove build artifacts 176 | - Clean PDM-related files 177 | - Reset environment configurations 178 | 179 | ## Code Formatting Standards 180 | 181 | The cua project follows strict code formatting standards to ensure consistency across all packages. 182 | 183 | ### Python Code Formatting 184 | 185 | #### Tools 186 | 187 | The project uses the following tools for code formatting and linting: 188 | 189 | - **[Black](https://black.readthedocs.io/)**: Code formatter 190 | - **[Ruff](https://beta.ruff.rs/docs/)**: Fast linter and formatter 191 | - **[MyPy](https://mypy.readthedocs.io/)**: Static type checker 192 | 193 | These tools are automatically installed when you set up the development environment using the `./scripts/build.sh` script. 
194 | 195 | #### Configuration 196 | 197 | The formatting configuration is defined in the root `pyproject.toml` file: 198 | 199 | ```toml 200 | [tool.black] 201 | line-length = 100 202 | target-version = ["py311"] 203 | 204 | [tool.ruff] 205 | line-length = 100 206 | target-version = "py311" 207 | select = ["E", "F", "B", "I"] 208 | fix = true 209 | 210 | [tool.ruff.format] 211 | docstring-code-format = true 212 | 213 | [tool.mypy] 214 | strict = true 215 | python_version = "3.11" 216 | ignore_missing_imports = true 217 | disallow_untyped_defs = true 218 | check_untyped_defs = true 219 | warn_return_any = true 220 | show_error_codes = true 221 | warn_unused_ignores = false 222 | ``` 223 | 224 | #### Key Formatting Rules 225 | 226 | - **Line Length**: Maximum of 100 characters 227 | - **Python Version**: Code should be compatible with Python 3.11+ 228 | - **Imports**: Automatically sorted (using Ruff's "I" rule) 229 | - **Type Hints**: Required for all function definitions (strict mypy mode) 230 | 231 | #### IDE Integration 232 | 233 | The repository includes VSCode workspace configurations that enable automatic formatting. When you open the workspace files (as recommended in the setup instructions), the correct formatting settings are automatically applied. 234 | 235 | Python-specific settings in the workspace files: 236 | 237 | ```json 238 | "[python]": { 239 | "editor.formatOnSave": true, 240 | "editor.defaultFormatter": "ms-python.black-formatter", 241 | "editor.codeActionsOnSave": { 242 | "source.organizeImports": "explicit" 243 | } 244 | } 245 | ``` 246 | 247 | Recommended VS Code extensions: 248 | 249 | - Black Formatter (ms-python.black-formatter) 250 | - Ruff (charliermarsh.ruff) 251 | - Pylance (ms-python.vscode-pylance) 252 | 253 | #### Manual Formatting 254 | 255 | To manually format code: 256 | 257 | ```bash 258 | # Format all Python files using Black 259 | pdm run black . 260 | 261 | # Run Ruff linter with auto-fix 262 | pdm run ruff check --fix . 
263 | 264 | # Run type checking with MyPy 265 | pdm run mypy . 266 | ``` 267 | 268 | #### Pre-commit Validation 269 | 270 | Before submitting a pull request, ensure your code passes all formatting checks: 271 | 272 | ```bash 273 | # Run all checks 274 | pdm run black --check . 275 | pdm run ruff check . 276 | pdm run mypy . 277 | ``` 278 | 279 | ### Swift Code (Lume) 280 | 281 | For Swift code in the `libs/lume` directory: 282 | 283 | - Follow the [Swift API Design Guidelines](https://www.swift.org/documentation/api-design-guidelines/) 284 | - Use SwiftFormat for consistent formatting 285 | - Code will be automatically formatted on save when using the lume workspace 286 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/loops/holo.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Holo 1.5 agent loop implementation for click prediction using litellm.acompletion. 3 | 4 | Implements the Holo1.5 grounding behavior: 5 | - Prompt asks for absolute pixel coordinates in JSON: {"action":"click_absolute","x":int,"y":int} 6 | - Optionally resizes the image using Qwen2-VL smart_resize parameters (via transformers AutoProcessor) 7 | - If resized, maps predicted coordinates back to the original screenshot resolution 8 | 9 | Note: We do NOT manually load the model; acompletions (via HuggingFaceLocalAdapter) 10 | will handle loading based on the provided model name. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import base64 16 | import json 17 | from io import BytesIO 18 | from typing import Any, Dict, List, Optional, Tuple 19 | 20 | import litellm 21 | from PIL import Image 22 | 23 | from ..decorators import register_agent 24 | from .base import AsyncAgentConfig 25 | from ..types import AgentCapability 26 | 27 | 28 | def _strip_hf_prefix(model: str) -> str: 29 | """Strip provider prefixes like 'huggingface-local/' from model names for HF processor load.""" 30 | if "/" in model and model.lower().startswith("huggingface-local/"): 31 | return model.split("/", 1)[1] 32 | return model 33 | 34 | 35 | def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]: 36 | """ 37 | Try to compute Qwen2-VL smart_resize output size using transformers AutoProcessor. 38 | 39 | Returns (processed_image, (orig_w, orig_h)). If transformers or processor unavailable, 40 | returns the original image and size without resizing. 
41 | """ 42 | orig_w, orig_h = image.size 43 | try: 44 | # Import lazily to avoid hard dependency if not installed 45 | from transformers import AutoProcessor # type: ignore 46 | from transformers.models.qwen2_vl.image_processing_qwen2_vl import ( # type: ignore 47 | smart_resize, 48 | ) 49 | 50 | processor_name = _strip_hf_prefix(model) 51 | processor = AutoProcessor.from_pretrained(processor_name) 52 | image_processor = getattr(processor, "image_processor", None) 53 | if image_processor is None: 54 | return image, (orig_w, orig_h) 55 | 56 | factor = getattr(image_processor, "patch_size", 14) * getattr(image_processor, "merge_size", 1) 57 | min_pixels = getattr(image_processor, "min_pixels", 256 * 256) 58 | max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536) 59 | 60 | resized_h, resized_w = smart_resize( 61 | orig_h, 62 | orig_w, 63 | factor=factor, 64 | min_pixels=min_pixels, 65 | max_pixels=max_pixels, 66 | ) 67 | 68 | if (resized_w, resized_h) == (orig_w, orig_h): 69 | return image, (orig_w, orig_h) 70 | 71 | processed = image.resize((resized_w, resized_h), resample=Image.Resampling.LANCZOS) 72 | return processed, (orig_w, orig_h) 73 | except Exception: 74 | # If any failure (no transformers, processor load error), fall back to original 75 | return image, (orig_w, orig_h) 76 | 77 | 78 | def _build_holo_prompt(instruction: str) -> str: 79 | """Construct the Holo1.5 grounding prompt.""" 80 | # Keep it close to the cookbook while avoiding heavy schema generation 81 | schema_hint = '{"action": "click_absolute", "x": <int>, "y": <int>}' 82 | return ( 83 | "Localize an element on the GUI image according to the provided target and output a click position. " 84 | f"You must output a valid JSON following the format: {schema_hint} " 85 | f"Your target is: {instruction}" 86 | ) 87 | 88 | 89 | def _parse_click_json(output_text: str) -> Optional[Tuple[int, int]]: 90 | """ 91 | Parse JSON from model output and extract x, y ints. 
92 | Tries to find the first JSON object substring if extra text is present. 93 | """ 94 | try: 95 | # Fast path: direct JSON 96 | data = json.loads(output_text) 97 | except Exception: 98 | # Try to locate a JSON object within the text 99 | start = output_text.find("{") 100 | end = output_text.rfind("}") 101 | if start == -1 or end == -1 or end <= start: 102 | return None 103 | try: 104 | data = json.loads(output_text[start : end + 1]) 105 | except Exception: 106 | return None 107 | 108 | try: 109 | x = int(data.get("x")) 110 | y = int(data.get("y")) 111 | return x, y 112 | except Exception: 113 | return None 114 | 115 | 116 | @register_agent(models=r"(?i).*(Holo1\.5|Hcompany/Holo1\.5).*") 117 | class HoloConfig(AsyncAgentConfig): 118 | """Holo is a family of UI grounding models from H Company""" 119 | 120 | async def predict_step( 121 | self, 122 | messages: List[Dict[str, Any]], 123 | model: str, 124 | tools: Optional[List[Dict[str, Any]]] = None, 125 | max_retries: Optional[int] = None, 126 | stream: bool = False, 127 | computer_handler=None, 128 | _on_api_start=None, 129 | _on_api_end=None, 130 | _on_usage=None, 131 | _on_screenshot=None, 132 | **kwargs, 133 | ) -> Dict[str, Any]: 134 | # Holo models are only trained on UI localization tasks, not all-in-one agent 135 | raise NotImplementedError() 136 | 137 | async def predict_click( 138 | self, 139 | model: str, 140 | image_b64: str, 141 | instruction: str, 142 | **kwargs, 143 | ) -> Optional[Tuple[int, int]]: 144 | """ 145 | Predict click coordinates using Holo1.5 via litellm.acompletion. 
146 | 147 | - Optionally smart-resizes the image using Qwen2-VL rules if transformers are available 148 | - Prompts for JSON with absolute pixel coordinates 149 | - Parses x,y and maps back to original screenshot size if resized 150 | """ 151 | try: 152 | img_bytes = base64.b64decode(image_b64) 153 | original_img = Image.open(BytesIO(img_bytes)) 154 | except Exception: 155 | return None 156 | 157 | # Optional preprocessing 158 | processed_img, (orig_w, orig_h) = _maybe_smart_resize(original_img, model) 159 | 160 | # If we resized, send the resized image; otherwise send original 161 | img_to_send = processed_img 162 | buf = BytesIO() 163 | img_to_send.save(buf, format="PNG") 164 | processed_b64 = base64.b64encode(buf.getvalue()).decode("utf-8") 165 | 166 | prompt = _build_holo_prompt(instruction) 167 | 168 | messages = [ 169 | { 170 | "role": "user", 171 | "content": [ 172 | { 173 | "type": "image_url", 174 | "image_url": {"url": f"data:image/png;base64,{processed_b64}"}, 175 | }, 176 | {"type": "text", "text": prompt}, 177 | ], 178 | } 179 | ] 180 | 181 | api_kwargs = { 182 | "model": model, 183 | "messages": messages, 184 | # Deterministic, small output 185 | "max_tokens": kwargs.get("max_tokens", 256), 186 | "temperature": kwargs.get("temperature", 0.0), 187 | } 188 | 189 | response = await litellm.acompletion(**api_kwargs) 190 | output_text = (response.choices[0].message.content or "").strip() # type: ignore 191 | 192 | coords = _parse_click_json(output_text) 193 | if coords is None: 194 | return None 195 | 196 | x, y = coords 197 | 198 | # Map back to original size if we resized 199 | proc_w, proc_h = img_to_send.size 200 | if (proc_w, proc_h) != (orig_w, orig_h): 201 | try: 202 | sx = orig_w / float(proc_w) 203 | sy = orig_h / float(proc_h) 204 | x = int(round(x * sx)) 205 | y = int(round(y * sy)) 206 | except Exception: 207 | # Fallback: clamp within original bounds 208 | pass 209 | 210 | # Clamp to original image bounds 211 | x = max(0, min(orig_w - 1, x)) 
212 | y = max(0, min(orig_h - 1, y)) 213 | return x, y 214 | 215 | def get_capabilities(self) -> List[AgentCapability]: 216 | return ["click"] 217 | ``` -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- ```json 1 | { 2 | "configurations": [ 3 | { 4 | "name": "Agent UI", 5 | "type": "debugpy", 6 | "request": "launch", 7 | "program": "examples/agent_ui_examples.py", 8 | "console": "integratedTerminal", 9 | "justMyCode": false, 10 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 11 | "cwd": "${workspaceFolder:cua-root}", 12 | "env": { 13 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 14 | } 15 | }, 16 | { 17 | "name": "Computer UI", 18 | "type": "debugpy", 19 | "request": "launch", 20 | "program": "examples/computer_ui_examples.py", 21 | "console": "integratedTerminal", 22 | "justMyCode": false, 23 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 24 | "cwd": "${workspaceFolder:cua-root}", 25 | "env": { 26 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 27 | } 28 | }, 29 | { 30 | "name": "Run Computer Examples", 31 | "type": "debugpy", 32 | "request": "launch", 33 | "program": "examples/computer_examples.py", 34 | "console": "integratedTerminal", 35 | "justMyCode": true, 36 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 37 | "cwd": "${workspaceFolder:cua-root}", 38 | "env": { 39 | "PYTHONPATH": 
"${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 40 | } 41 | }, 42 | { 43 | "name": "Run Agent Examples", 44 | "type": "debugpy", 45 | "request": "launch", 46 | "program": "examples/agent_examples.py", 47 | "console": "integratedTerminal", 48 | "justMyCode": false, 49 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 50 | "cwd": "${workspaceFolder:cua-root}", 51 | "env": { 52 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 53 | } 54 | }, 55 | { 56 | "name": "Run PyLume Examples", 57 | "type": "debugpy", 58 | "request": "launch", 59 | "program": "examples/pylume_examples.py", 60 | "console": "integratedTerminal", 61 | "justMyCode": true, 62 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 63 | "cwd": "${workspaceFolder:cua-root}", 64 | "env": { 65 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 66 | } 67 | }, 68 | { 69 | "name": "SOM: Run Experiments (No OCR)", 70 | "type": "debugpy", 71 | "request": "launch", 72 | "program": "examples/som_examples.py", 73 | "args": [ 74 | "examples/test_data", 75 | "--output-dir", 76 | "examples/output", 77 | "--ocr", 78 | "none", 79 | "--mode", 80 | "experiment" 81 | ], 82 | "console": "integratedTerminal", 83 | "justMyCode": false, 84 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 85 | "cwd": "${workspaceFolder:cua-root}", 86 | "env": { 87 | "PYTHONPATH": 
"${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 88 | } 89 | }, 90 | { 91 | "name": "SOM: Run Experiments (EasyOCR)", 92 | "type": "debugpy", 93 | "request": "launch", 94 | "program": "examples/som_examples.py", 95 | "args": [ 96 | "examples/test_data", 97 | "--output-dir", 98 | "examples/output", 99 | "--ocr", 100 | "easyocr", 101 | "--mode", 102 | "experiment" 103 | ], 104 | "console": "integratedTerminal", 105 | "justMyCode": false, 106 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 107 | "cwd": "${workspaceFolder:cua-root}", 108 | "env": { 109 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 110 | } 111 | }, 112 | { 113 | "name": "Run Computer Server", 114 | "type": "debugpy", 115 | "request": "launch", 116 | "program": "${workspaceFolder}/libs/python/computer-server/run_server.py", 117 | "console": "integratedTerminal", 118 | "justMyCode": true, 119 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 120 | "cwd": "${workspaceFolder:cua-root}", 121 | "env": { 122 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 123 | } 124 | }, 125 | { 126 | "name": "Run Computer Server with Args", 127 | "type": "debugpy", 128 | "request": "launch", 129 | "program": "${workspaceFolder}/libs/python/computer-server/run_server.py", 130 | "args": [ 131 | "--host", 132 | "0.0.0.0", 133 | "--port", 134 | "8000", 135 | "--log-level", 136 | "debug" 137 | ], 138 | "console": 
"integratedTerminal", 139 | "justMyCode": false, 140 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 141 | "cwd": "${workspaceFolder:cua-root}", 142 | "env": { 143 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer-server" 144 | } 145 | }, 146 | { 147 | "type": "lldb", 148 | "request": "launch", 149 | "args": [], 150 | "cwd": "${workspaceFolder:cua-root}/libs/lume", 151 | "name": "Debug lume (libs/lume)", 152 | "program": "${workspaceFolder:cua-root}/libs/lume/.build/debug/lume", 153 | "preLaunchTask": "swift: Build Debug lume (libs/lume)" 154 | }, 155 | { 156 | "type": "lldb", 157 | "request": "launch", 158 | "args": [], 159 | "cwd": "${workspaceFolder:cua-root}/libs/lume", 160 | "name": "Release lume (libs/lume)", 161 | "program": "${workspaceFolder:cua-root}/libs/lume/.build/release/lume", 162 | "preLaunchTask": "swift: Build Release lume (libs/lume)" 163 | } 164 | ] 165 | } ``` -------------------------------------------------------------------------------- /libs/lume/src/Commands/Config.swift: -------------------------------------------------------------------------------- ```swift 1 | import ArgumentParser 2 | import Foundation 3 | 4 | struct Config: ParsableCommand { 5 | static let configuration = CommandConfiguration( 6 | commandName: "config", 7 | abstract: "Get or set lume configuration", 8 | subcommands: [Get.self, Storage.self, Cache.self, Caching.self], 9 | defaultSubcommand: Get.self 10 | ) 11 | 12 | // MARK: - Basic Configuration Subcommands 13 | 14 | struct Get: ParsableCommand { 15 | static let configuration = CommandConfiguration( 16 | commandName: "get", 17 | abstract: "Get current configuration" 18 | ) 19 | 20 | func run() throws { 21 | let controller = LumeController() 22 | let settings = controller.getSettings() 23 | 24 | // Display default location 25 | print( 26 | "Default VM storage: \(settings.defaultLocationName) (\(settings.defaultLocation?.path ?? 
"not set"))" 27 | ) 28 | 29 | // Display cache directory 30 | print("Cache directory: \(settings.cacheDirectory)") 31 | 32 | // Display caching enabled status 33 | print("Caching enabled: \(settings.cachingEnabled)") 34 | 35 | // Display all locations 36 | if !settings.vmLocations.isEmpty { 37 | print("\nConfigured VM storage locations:") 38 | for location in settings.sortedLocations { 39 | let isDefault = location.name == settings.defaultLocationName 40 | let defaultMark = isDefault ? " (default)" : "" 41 | print(" - \(location.name): \(location.path)\(defaultMark)") 42 | } 43 | } 44 | } 45 | } 46 | 47 | // MARK: - Debug Command 48 | 49 | struct Debug: ParsableCommand { 50 | static let configuration = CommandConfiguration( 51 | commandName: "debug", 52 | abstract: "Output detailed debug information about current configuration", 53 | shouldDisplay: false 54 | ) 55 | 56 | func run() throws { 57 | let debugInfo = SettingsManager.shared.debugSettings() 58 | print(debugInfo) 59 | } 60 | } 61 | 62 | // MARK: - Caching Management Subcommands 63 | 64 | struct Caching: ParsableCommand { 65 | static let configuration = CommandConfiguration( 66 | commandName: "caching", 67 | abstract: "Manage image caching settings", 68 | subcommands: [GetCaching.self, SetCaching.self] 69 | ) 70 | 71 | struct GetCaching: ParsableCommand { 72 | static let configuration = CommandConfiguration( 73 | commandName: "get", 74 | abstract: "Show current caching status" 75 | ) 76 | 77 | func run() throws { 78 | let controller = LumeController() 79 | let cachingEnabled = controller.isCachingEnabled() 80 | print("Caching enabled: \(cachingEnabled)") 81 | } 82 | } 83 | 84 | struct SetCaching: ParsableCommand { 85 | static let configuration = CommandConfiguration( 86 | commandName: "set", 87 | abstract: "Enable or disable image caching" 88 | ) 89 | 90 | @Argument(help: "Enable or disable caching (true/false)") 91 | var enabled: Bool 92 | 93 | func run() throws { 94 | let controller = LumeController() 95 | 
try controller.setCachingEnabled(enabled) 96 | print("Caching \(enabled ? "enabled" : "disabled")") 97 | } 98 | } 99 | } 100 | 101 | // MARK: - Cache Management Subcommands 102 | 103 | struct Cache: ParsableCommand { 104 | static let configuration = CommandConfiguration( 105 | commandName: "cache", 106 | abstract: "Manage cache settings", 107 | subcommands: [GetCache.self, SetCache.self] 108 | ) 109 | 110 | struct GetCache: ParsableCommand { 111 | static let configuration = CommandConfiguration( 112 | commandName: "get", 113 | abstract: "Get current cache directory" 114 | ) 115 | 116 | func run() throws { 117 | let controller = LumeController() 118 | let cacheDir = controller.getCacheDirectory() 119 | print("Cache directory: \(cacheDir)") 120 | } 121 | } 122 | 123 | struct SetCache: ParsableCommand { 124 | static let configuration = CommandConfiguration( 125 | commandName: "set", 126 | abstract: "Set cache directory" 127 | ) 128 | 129 | @Argument(help: "Path to cache directory") 130 | var path: String 131 | 132 | func run() throws { 133 | let controller = LumeController() 134 | try controller.setCacheDirectory(path: path) 135 | print("Cache directory set to: \(path)") 136 | } 137 | } 138 | } 139 | 140 | // MARK: - Storage Management Subcommands 141 | 142 | struct Storage: ParsableCommand { 143 | static let configuration = CommandConfiguration( 144 | commandName: "storage", 145 | abstract: "Manage VM storage locations", 146 | subcommands: [Add.self, Remove.self, List.self, Default.self] 147 | ) 148 | 149 | struct Add: ParsableCommand { 150 | static let configuration = CommandConfiguration( 151 | commandName: "add", 152 | abstract: "Add a new VM storage location" 153 | ) 154 | 155 | @Argument(help: "Storage name (alphanumeric with dashes/underscores)") 156 | var name: String 157 | 158 | @Argument(help: "Path to VM storage directory") 159 | var path: String 160 | 161 | func run() throws { 162 | let controller = LumeController() 163 | try controller.addLocation(name: 
name, path: path) 164 | print("Added VM storage location: \(name) at \(path)") 165 | } 166 | } 167 | 168 | struct Remove: ParsableCommand { 169 | static let configuration = CommandConfiguration( 170 | commandName: "remove", 171 | abstract: "Remove a VM storage location" 172 | ) 173 | 174 | @Argument(help: "Storage name to remove") 175 | var name: String 176 | 177 | func run() throws { 178 | let controller = LumeController() 179 | try controller.removeLocation(name: name) 180 | print("Removed VM storage location: \(name)") 181 | } 182 | } 183 | 184 | struct List: ParsableCommand { 185 | static let configuration = CommandConfiguration( 186 | commandName: "list", 187 | abstract: "List all VM storage locations" 188 | ) 189 | 190 | func run() throws { 191 | let controller = LumeController() 192 | let settings = controller.getSettings() 193 | 194 | if settings.vmLocations.isEmpty { 195 | print("No VM storage locations configured") 196 | return 197 | } 198 | 199 | print("VM Storage Locations:") 200 | for location in settings.sortedLocations { 201 | let isDefault = location.name == settings.defaultLocationName 202 | let defaultMark = isDefault ? 
" (default)" : "" 203 | print(" - \(location.name): \(location.path)\(defaultMark)") 204 | } 205 | } 206 | } 207 | 208 | struct Default: ParsableCommand { 209 | static let configuration = CommandConfiguration( 210 | commandName: "default", 211 | abstract: "Set the default VM storage location" 212 | ) 213 | 214 | @Argument(help: "Storage name to set as default") 215 | var name: String 216 | 217 | func run() throws { 218 | let controller = LumeController() 219 | try controller.setDefaultLocation(name: name) 220 | print("Set default VM storage location to: \(name)") 221 | } 222 | } 223 | } 224 | } 225 | ``` -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/handlers/generic.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Generic handlers for all OSes. 3 | 4 | Includes: 5 | - FileHandler 6 | 7 | """ 8 | 9 | from pathlib import Path 10 | from typing import Dict, Any, Optional 11 | from .base import BaseFileHandler 12 | import base64 13 | 14 | def resolve_path(path: str) -> Path: 15 | """Resolve a path to its absolute path. Expand ~ to the user's home directory. 16 | 17 | Args: 18 | path: The file or directory path to resolve 19 | 20 | Returns: 21 | Path: The resolved absolute path 22 | """ 23 | return Path(path).expanduser().resolve() 24 | 25 | class GenericFileHandler(BaseFileHandler): 26 | """ 27 | Generic file handler that provides file system operations for all operating systems. 28 | 29 | This class implements the BaseFileHandler interface and provides methods for 30 | file and directory operations including reading, writing, creating, and deleting 31 | files and directories. 32 | """ 33 | 34 | async def file_exists(self, path: str) -> Dict[str, Any]: 35 | """ 36 | Check if a file exists at the specified path. 
37 | 38 | Args: 39 | path: The file path to check 40 | 41 | Returns: 42 | Dict containing 'success' boolean and either 'exists' boolean or 'error' string 43 | """ 44 | try: 45 | return {"success": True, "exists": resolve_path(path).is_file()} 46 | except Exception as e: 47 | return {"success": False, "error": str(e)} 48 | 49 | async def directory_exists(self, path: str) -> Dict[str, Any]: 50 | """ 51 | Check if a directory exists at the specified path. 52 | 53 | Args: 54 | path: The directory path to check 55 | 56 | Returns: 57 | Dict containing 'success' boolean and either 'exists' boolean or 'error' string 58 | """ 59 | try: 60 | return {"success": True, "exists": resolve_path(path).is_dir()} 61 | except Exception as e: 62 | return {"success": False, "error": str(e)} 63 | 64 | async def list_dir(self, path: str) -> Dict[str, Any]: 65 | """ 66 | List all files and directories in the specified directory. 67 | 68 | Args: 69 | path: The directory path to list 70 | 71 | Returns: 72 | Dict containing 'success' boolean and either 'files' list of names or 'error' string 73 | """ 74 | try: 75 | return {"success": True, "files": [p.name for p in resolve_path(path).iterdir() if p.is_file() or p.is_dir()]} 76 | except Exception as e: 77 | return {"success": False, "error": str(e)} 78 | 79 | async def read_text(self, path: str) -> Dict[str, Any]: 80 | """ 81 | Read the contents of a text file. 82 | 83 | Args: 84 | path: The file path to read from 85 | 86 | Returns: 87 | Dict containing 'success' boolean and either 'content' string or 'error' string 88 | """ 89 | try: 90 | return {"success": True, "content": resolve_path(path).read_text()} 91 | except Exception as e: 92 | return {"success": False, "error": str(e)} 93 | 94 | async def write_text(self, path: str, content: str) -> Dict[str, Any]: 95 | """ 96 | Write text content to a file. 
97 | 98 | Args: 99 | path: The file path to write to 100 | content: The text content to write 101 | 102 | Returns: 103 | Dict containing 'success' boolean and optionally 'error' string 104 | """ 105 | try: 106 | resolve_path(path).write_text(content) 107 | return {"success": True} 108 | except Exception as e: 109 | return {"success": False, "error": str(e)} 110 | 111 | async def write_bytes(self, path: str, content_b64: str, append: bool = False) -> Dict[str, Any]: 112 | """ 113 | Write binary content to a file from base64 encoded string. 114 | 115 | Args: 116 | path: The file path to write to 117 | content_b64: Base64 encoded binary content 118 | append: If True, append to existing file; if False, overwrite 119 | 120 | Returns: 121 | Dict containing 'success' boolean and optionally 'error' string 122 | """ 123 | try: 124 | mode = 'ab' if append else 'wb' 125 | with open(resolve_path(path), mode) as f: 126 | f.write(base64.b64decode(content_b64)) 127 | return {"success": True} 128 | except Exception as e: 129 | return {"success": False, "error": str(e)} 130 | 131 | async def read_bytes(self, path: str, offset: int = 0, length: Optional[int] = None) -> Dict[str, Any]: 132 | """ 133 | Read binary content from a file and return as base64 encoded string. 
134 | 135 | Args: 136 | path: The file path to read from 137 | offset: Byte offset to start reading from 138 | length: Number of bytes to read; if None, read entire file from offset 139 | 140 | Returns: 141 | Dict containing 'success' boolean and either 'content_b64' string or 'error' string 142 | """ 143 | try: 144 | file_path = resolve_path(path) 145 | with open(file_path, 'rb') as f: 146 | if offset > 0: 147 | f.seek(offset) 148 | 149 | if length is not None: 150 | content = f.read(length) 151 | else: 152 | content = f.read() 153 | 154 | return {"success": True, "content_b64": base64.b64encode(content).decode('utf-8')} 155 | except Exception as e: 156 | return {"success": False, "error": str(e)} 157 | 158 | async def get_file_size(self, path: str) -> Dict[str, Any]: 159 | """ 160 | Get the size of a file in bytes. 161 | 162 | Args: 163 | path: The file path to get size for 164 | 165 | Returns: 166 | Dict containing 'success' boolean and either 'size' integer or 'error' string 167 | """ 168 | try: 169 | file_path = resolve_path(path) 170 | size = file_path.stat().st_size 171 | return {"success": True, "size": size} 172 | except Exception as e: 173 | return {"success": False, "error": str(e)} 174 | 175 | async def delete_file(self, path: str) -> Dict[str, Any]: 176 | """ 177 | Delete a file at the specified path. 178 | 179 | Args: 180 | path: The file path to delete 181 | 182 | Returns: 183 | Dict containing 'success' boolean and optionally 'error' string 184 | """ 185 | try: 186 | resolve_path(path).unlink() 187 | return {"success": True} 188 | except Exception as e: 189 | return {"success": False, "error": str(e)} 190 | 191 | async def create_dir(self, path: str) -> Dict[str, Any]: 192 | """ 193 | Create a directory at the specified path. 194 | 195 | Creates parent directories if they don't exist and doesn't raise an error 196 | if the directory already exists. 
197 | 198 | Args: 199 | path: The directory path to create 200 | 201 | Returns: 202 | Dict containing 'success' boolean and optionally 'error' string 203 | """ 204 | try: 205 | resolve_path(path).mkdir(parents=True, exist_ok=True) 206 | return {"success": True} 207 | except Exception as e: 208 | return {"success": False, "error": str(e)} 209 | 210 | async def delete_dir(self, path: str) -> Dict[str, Any]: 211 | """ 212 | Delete an empty directory at the specified path. 213 | 214 | Args: 215 | path: The directory path to delete 216 | 217 | Returns: 218 | Dict containing 'success' boolean and optionally 'error' string 219 | """ 220 | try: 221 | resolve_path(path).rmdir() 222 | return {"success": True} 223 | except Exception as e: 224 | return {"success": False, "error": str(e)} 225 | ``` -------------------------------------------------------------------------------- /libs/python/pylume/pylume/models.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional, List, Literal, Dict, Any 2 | import re 3 | from pydantic import BaseModel, Field, computed_field, validator, ConfigDict, RootModel 4 | 5 | class DiskInfo(BaseModel): 6 | """Information about disk storage allocation. 7 | 8 | Attributes: 9 | total: Total disk space in bytes 10 | allocated: Currently allocated disk space in bytes 11 | """ 12 | total: int 13 | allocated: int 14 | 15 | class VMConfig(BaseModel): 16 | """Configuration for creating a new VM. 
17 | 18 | Note: Memory and disk sizes should be specified with units (e.g., "4GB", "64GB") 19 | 20 | Attributes: 21 | name: Name of the virtual machine 22 | os: Operating system type, either "macOS" or "linux" 23 | cpu: Number of CPU cores to allocate 24 | memory: Amount of memory to allocate with units 25 | disk_size: Size of the disk to create with units 26 | display: Display resolution in format "widthxheight" 27 | ipsw: IPSW path or 'latest' for macOS VMs, None for other OS types 28 | """ 29 | name: str 30 | os: Literal["macOS", "linux"] = "macOS" 31 | cpu: int = Field(default=2, ge=1) 32 | memory: str = "4GB" 33 | disk_size: str = Field(default="64GB", alias="diskSize") 34 | display: str = "1024x768" 35 | ipsw: Optional[str] = Field(default=None, description="IPSW path or 'latest', for macOS VMs") 36 | 37 | class Config: 38 | populate_by_alias = True 39 | 40 | class SharedDirectory(BaseModel): 41 | """Configuration for a shared directory. 42 | 43 | Attributes: 44 | host_path: Path to the directory on the host system 45 | read_only: Whether the directory should be mounted as read-only 46 | """ 47 | host_path: str = Field(..., alias="hostPath") # Allow host_path but serialize as hostPath 48 | read_only: bool = False 49 | 50 | class Config: 51 | populate_by_name = True # Allow both alias and original name 52 | alias_generator = lambda s: ''.join(word.capitalize() if i else word for i, word in enumerate(s.split('_'))) 53 | 54 | class VMRunOpts(BaseModel): 55 | """Configuration for running a VM. 
56 | 57 | Attributes: 58 | no_display: If True, do not display the VNC client 59 | shared_directories: List of directories to share with the VM 60 | """ 61 | no_display: bool = Field(default=False, alias="noDisplay") 62 | shared_directories: Optional[list[SharedDirectory]] = Field( 63 | default=None, 64 | alias="sharedDirectories" 65 | ) 66 | 67 | model_config = ConfigDict( 68 | populate_by_name=True, 69 | alias_generator=lambda s: ''.join(word.capitalize() if i else word for i, word in enumerate(s.split('_'))) 70 | ) 71 | 72 | def model_dump(self, **kwargs): 73 | """Export model data with proper field name conversion. 74 | 75 | Converts shared directory fields to match API expectations when using aliases. 76 | 77 | Args: 78 | **kwargs: Keyword arguments passed to parent model_dump method 79 | 80 | Returns: 81 | dict: Model data with properly formatted field names 82 | """ 83 | data = super().model_dump(**kwargs) 84 | # Convert shared directory fields to match API expectations 85 | if self.shared_directories and "by_alias" in kwargs and kwargs["by_alias"]: 86 | data["sharedDirectories"] = [ 87 | { 88 | "hostPath": d.host_path, 89 | "readOnly": d.read_only 90 | } 91 | for d in self.shared_directories 92 | ] 93 | # Remove the snake_case version if it exists 94 | data.pop("shared_directories", None) 95 | return data 96 | 97 | class VMStatus(BaseModel): 98 | """Status information for a virtual machine.
99 | 100 | Attributes: 101 | name: Name of the virtual machine 102 | status: Current status of the VM 103 | os: Operating system type 104 | cpu_count: Number of CPU cores allocated 105 | memory_size: Amount of memory allocated in bytes 106 | disk_size: Disk storage information 107 | vnc_url: URL for VNC connection if available 108 | ip_address: IP address of the VM if available 109 | """ 110 | name: str 111 | status: str 112 | os: Literal["macOS", "linux"] 113 | cpu_count: int = Field(alias="cpuCount") 114 | memory_size: int = Field(alias="memorySize") # API returns memory size in bytes 115 | disk_size: DiskInfo = Field(alias="diskSize") 116 | vnc_url: Optional[str] = Field(default=None, alias="vncUrl") 117 | ip_address: Optional[str] = Field(default=None, alias="ipAddress") 118 | 119 | class Config: 120 | populate_by_alias = True 121 | 122 | @computed_field 123 | @property 124 | def state(self) -> str: 125 | """Get the current state of the VM. 126 | 127 | Returns: 128 | str: Current VM status 129 | """ 130 | return self.status 131 | 132 | @computed_field 133 | @property 134 | def cpu(self) -> int: 135 | """Get the number of CPU cores. 136 | 137 | Returns: 138 | int: Number of CPU cores allocated to the VM 139 | """ 140 | return self.cpu_count 141 | 142 | @computed_field 143 | @property 144 | def memory(self) -> str: 145 | """Get memory allocation in human-readable format. 146 | 147 | Returns: 148 | str: Memory size formatted as "{size}GB" 149 | """ 150 | # Convert bytes to GB 151 | gb = self.memory_size / (1024 * 1024 * 1024) 152 | return f"{int(gb)}GB" 153 | 154 | class VMUpdateOpts(BaseModel): 155 | """Options for updating VM configuration. 
156 | 157 | Attributes: 158 | cpu: Number of CPU cores to update to 159 | memory: Amount of memory to update to with units 160 | disk_size: Size of disk to update to with units 161 | """ 162 | cpu: Optional[int] = None 163 | memory: Optional[str] = None 164 | disk_size: Optional[str] = None 165 | 166 | class ImageRef(BaseModel): 167 | """Reference to a VM image. 168 | 169 | Attributes: 170 | image: Name of the image 171 | tag: Tag version of the image 172 | registry: Registry hostname where image is stored 173 | organization: Organization or namespace in the registry 174 | """ 175 | image: str 176 | tag: str = "latest" 177 | registry: Optional[str] = "ghcr.io" 178 | organization: Optional[str] = "trycua" 179 | 180 | def model_dump(self, **kwargs): 181 | """Override model_dump to return just the image:tag format. 182 | 183 | Args: 184 | **kwargs: Keyword arguments (ignored) 185 | 186 | Returns: 187 | str: Image reference in "image:tag" format 188 | """ 189 | return f"{self.image}:{self.tag}" 190 | 191 | class CloneSpec(BaseModel): 192 | """Specification for cloning a VM. 193 | 194 | Attributes: 195 | name: Name of the source VM to clone 196 | new_name: Name for the new cloned VM 197 | """ 198 | name: str 199 | new_name: str = Field(alias="newName") 200 | 201 | class Config: 202 | populate_by_alias = True 203 | 204 | class ImageInfo(BaseModel): 205 | """Model for individual image information. 206 | 207 | Attributes: 208 | imageId: Unique identifier for the image 209 | """ 210 | imageId: str 211 | 212 | class ImageList(RootModel): 213 | """Response model for the images endpoint. 214 | 215 | A list-like container for ImageInfo objects that provides 216 | iteration and indexing capabilities. 217 | """ 218 | root: List[ImageInfo] 219 | 220 | def __iter__(self): 221 | """Iterate over the image list. 
222 | 223 | Returns: 224 | Iterator over ImageInfo objects 225 | """ 226 | return iter(self.root) 227 | 228 | def __getitem__(self, item): 229 | """Get an item from the image list by index. 230 | 231 | Args: 232 | item: Index or slice to retrieve 233 | 234 | Returns: 235 | ImageInfo or list of ImageInfo objects 236 | """ 237 | return self.root[item] 238 | 239 | def __len__(self): 240 | """Get the number of images in the list. 241 | 242 | Returns: 243 | int: Number of images in the list 244 | """ 245 | return len(self.root) ```