This is page 14 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── windows-sandbox.md ├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── 
Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ 
│ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift │ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ 
├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ ├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py 
│ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/handlers/linux.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Linux implementation of automation and accessibility handlers. 
3 | 4 | This implementation attempts to use pyautogui for GUI automation when available. 5 | If running in a headless environment without X11, it will fall back to simulated responses. 6 | To use GUI automation in a headless environment: 7 | 1. Install Xvfb: sudo apt-get install xvfb 8 | 2. Run with virtual display: xvfb-run python -m computer_server 9 | """ 10 | from typing import Dict, Any, List, Tuple, Optional 11 | import logging 12 | import subprocess 13 | import asyncio 14 | import base64 15 | import os 16 | import json 17 | from io import BytesIO 18 | 19 | # Configure logger 20 | logger = logging.getLogger(__name__) 21 | 22 | # Try to import pyautogui, but don't fail if it's not available 23 | # This allows the server to run in headless environments 24 | try: 25 | import pyautogui 26 | pyautogui.FAILSAFE = False 27 | 28 | logger.info("pyautogui successfully imported, GUI automation available") 29 | except Exception as e: 30 | logger.warning(f"pyautogui import failed: {str(e)}. GUI operations will be simulated.") 31 | 32 | from pynput.mouse import Button, Controller as MouseController 33 | from pynput.keyboard import Key, Controller as KeyboardController 34 | 35 | from .base import BaseAccessibilityHandler, BaseAutomationHandler 36 | 37 | class LinuxAccessibilityHandler(BaseAccessibilityHandler): 38 | """Linux implementation of accessibility handler.""" 39 | 40 | async def get_accessibility_tree(self) -> Dict[str, Any]: 41 | """Get the accessibility tree of the current window. 42 | 43 | Returns: 44 | Dict[str, Any]: A dictionary containing success status and a simulated tree structure 45 | since Linux doesn't have equivalent accessibility API like macOS. 46 | """ 47 | # Linux doesn't have equivalent accessibility API like macOS 48 | # Return a minimal dummy tree 49 | logger.info("Getting accessibility tree (simulated, no accessibility API available on Linux)") 50 | return { 51 | "success": True, 52 | "tree": { 53 | "role": "Window", 54 | "title": "Linux Window", 55 | "position": {"x": 0, "y": 0}, 56 | "size": {"width": 1920, "height": 1080}, 57 | "children": [] 58 | } 59 | } 60 | 61 | async def find_element(self, role: Optional[str] = None, 62 | title: Optional[str] = None, 63 | value: Optional[str] = None) -> Dict[str, Any]: 64 | """Find an element in the accessibility tree by criteria. 65 | 66 | Args: 67 | role: The role of the element to find. 68 | title: The title of the element to find. 69 | value: The value of the element to find. 70 | 71 | Returns: 72 | Dict[str, Any]: A dictionary indicating that element search is not supported on Linux. 73 | """ 74 | logger.info(f"Finding element with role={role}, title={title}, value={value} (not supported on Linux)") 75 | return { 76 | "success": False, 77 | "message": "Element search not supported on Linux" 78 | } 79 | 80 | def get_cursor_position(self) -> Tuple[int, int]: 81 | """Get the current cursor position. 82 | 83 | Returns: 84 | Tuple[int, int]: The x and y coordinates of the cursor position. 85 | Returns (0, 0) if pyautogui is not available. 86 | """ 87 | try: 88 | pos = pyautogui.position() 89 | return pos.x, pos.y 90 | except Exception as e: 91 | logger.warning(f"Failed to get cursor position with pyautogui: {e}") 92 | 93 | logger.info("Getting cursor position (simulated)") 94 | return 0, 0 95 | 96 | def get_screen_size(self) -> Tuple[int, int]: 97 | """Get the screen size. 98 | 99 | Returns: 100 | Tuple[int, int]: The width and height of the screen in pixels. 101 | Returns (1920, 1080) if pyautogui is not available. 
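        Example (added illustration, not from the original docstring; shows the
        documented fallback behaviour of this handler):
            handler = LinuxAccessibilityHandler()
            width, height = handler.get_screen_size()
            # Falls back to (1920, 1080) when pyautogui cannot reach a display.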
102 | """ 103 | try: 104 | size = pyautogui.size() 105 | return size.width, size.height 106 | except Exception as e: 107 | logger.warning(f"Failed to get screen size with pyautogui: {e}") 108 | 109 | logger.info("Getting screen size (simulated)") 110 | return 1920, 1080 111 | 112 | class LinuxAutomationHandler(BaseAutomationHandler): 113 | """Linux implementation of automation handler using pyautogui.""" 114 | keyboard = KeyboardController() 115 | mouse = MouseController() 116 | 117 | # Mouse Actions 118 | async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]: 119 | """Press and hold a mouse button at the specified coordinates. 120 | 121 | Args: 122 | x: The x coordinate to move to before pressing. If None, uses current position. 123 | y: The y coordinate to move to before pressing. If None, uses current position. 124 | button: The mouse button to press ("left", "right", or "middle"). 125 | 126 | Returns: 127 | Dict[str, Any]: A dictionary with success status and error message if failed. 128 | """ 129 | try: 130 | if x is not None and y is not None: 131 | pyautogui.moveTo(x, y) 132 | pyautogui.mouseDown(button=button) 133 | return {"success": True} 134 | except Exception as e: 135 | return {"success": False, "error": str(e)} 136 | 137 | async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]: 138 | """Release a mouse button at the specified coordinates. 139 | 140 | Args: 141 | x: The x coordinate to move to before releasing. If None, uses current position. 142 | y: The y coordinate to move to before releasing. If None, uses current position. 143 | button: The mouse button to release ("left", "right", or "middle"). 144 | 145 | Returns: 146 | Dict[str, Any]: A dictionary with success status and error message if failed. 147 | """ 148 | try: 149 | if x is not None and y is not None: 150 | pyautogui.moveTo(x, y) 151 | pyautogui.mouseUp(button=button) 152 | return {"success": True} 153 | except Exception as e: 154 | return {"success": False, "error": str(e)} 155 | 156 | async def move_cursor(self, x: int, y: int) -> Dict[str, Any]: 157 | """Move the cursor to the specified coordinates. 158 | 159 | Args: 160 | x: The x coordinate to move to. 161 | y: The y coordinate to move to. 162 | 163 | Returns: 164 | Dict[str, Any]: A dictionary with success status and error message if failed. 165 | """ 166 | try: 167 | pyautogui.moveTo(x, y) 168 | return {"success": True} 169 | except Exception as e: 170 | return {"success": False, "error": str(e)} 171 | 172 | async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: 173 | """Perform a left mouse click at the specified coordinates. 174 | 175 | Args: 176 | x: The x coordinate to click at. If None, clicks at current position. 177 | y: The y coordinate to click at. If None, clicks at current position. 178 | 179 | Returns: 180 | Dict[str, Any]: A dictionary with success status and error message if failed. 181 | """ 182 | try: 183 | if x is not None and y is not None: 184 | pyautogui.moveTo(x, y) 185 | pyautogui.click() 186 | return {"success": True} 187 | except Exception as e: 188 | return {"success": False, "error": str(e)} 189 | 190 | async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: 191 | """Perform a right mouse click at the specified coordinates. 192 | 193 | Args: 194 | x: The x coordinate to click at. If None, clicks at current position. 
195 | y: The y coordinate to click at. If None, clicks at current position. 196 | 197 | Returns: 198 | Dict[str, Any]: A dictionary with success status and error message if failed. 199 | """ 200 | try: 201 | if x is not None and y is not None: 202 | pyautogui.moveTo(x, y) 203 | pyautogui.rightClick() 204 | return {"success": True} 205 | except Exception as e: 206 | return {"success": False, "error": str(e)} 207 | 208 | async def double_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: 209 | """Perform a double click at the specified coordinates. 210 | 211 | Args: 212 | x: The x coordinate to double click at. If None, clicks at current position. 213 | y: The y coordinate to double click at. If None, clicks at current position. 214 | 215 | Returns: 216 | Dict[str, Any]: A dictionary with success status and error message if failed. 217 | """ 218 | try: 219 | if x is not None and y is not None: 220 | pyautogui.moveTo(x, y) 221 | pyautogui.doubleClick(interval=0.1) 222 | return {"success": True} 223 | except Exception as e: 224 | return {"success": False, "error": str(e)} 225 | 226 | async def click(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]: 227 | """Perform a mouse click with the specified button at the given coordinates. 228 | 229 | Args: 230 | x: The x coordinate to click at. If None, clicks at current position. 231 | y: The y coordinate to click at. If None, clicks at current position. 232 | button: The mouse button to click ("left", "right", or "middle"). 233 | 234 | Returns: 235 | Dict[str, Any]: A dictionary with success status and error message if failed. 236 | """ 237 | try: 238 | if x is not None and y is not None: 239 | pyautogui.moveTo(x, y) 240 | pyautogui.click(button=button) 241 | return {"success": True} 242 | except Exception as e: 243 | return {"success": False, "error": str(e)} 244 | 245 | async def drag_to(self, x: int, y: int, button: str = "left", duration: float = 0.5) -> Dict[str, Any]: 246 | """Drag from the current position to the specified coordinates. 247 | 248 | Args: 249 | x: The x coordinate to drag to. 250 | y: The y coordinate to drag to. 251 | button: The mouse button to use for dragging. 252 | duration: The time in seconds to take for the drag operation. 253 | 254 | Returns: 255 | Dict[str, Any]: A dictionary with success status and error message if failed. 256 | """ 257 | try: 258 | pyautogui.dragTo(x, y, duration=duration, button=button) 259 | return {"success": True} 260 | except Exception as e: 261 | return {"success": False, "error": str(e)} 262 | 263 | async def drag(self, start_x: int, start_y: int, end_x: int, end_y: int, button: str = "left") -> Dict[str, Any]: 264 | """Drag from start coordinates to end coordinates. 265 | 266 | Args: 267 | start_x: The starting x coordinate. 268 | start_y: The starting y coordinate. 269 | end_x: The ending x coordinate. 270 | end_y: The ending y coordinate. 271 | button: The mouse button to use for dragging. 272 | 273 | Returns: 274 | Dict[str, Any]: A dictionary with success status and error message if failed. 275 | """ 276 | try: 277 | pyautogui.moveTo(start_x, start_y) 278 | pyautogui.dragTo(end_x, end_y, duration=0.5, button=button) 279 | return {"success": True} 280 | except Exception as e: 281 | return {"success": False, "error": str(e)} 282 | 283 | async def drag_path(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> Dict[str, Any]: 284 | """Drag along a path defined by a list of coordinates. 
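        Example (added for illustration on a LinuxAutomationHandler instance;
        not part of the original docstring). Each element of path is an (x, y)
        pixel coordinate visited in order:

            await handler.drag_path([(100, 100), (200, 150), (300, 300)])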
285 | 286 | Args: 287 | path: A list of (x, y) coordinate tuples defining the drag path. 288 | button: The mouse button to use for dragging. 289 | duration: The time in seconds to take for each segment of the drag. 290 | 291 | Returns: 292 | Dict[str, Any]: A dictionary with success status and error message if failed. 293 | """ 294 | try: 295 | if not path: 296 | return {"success": False, "error": "Path is empty"} 297 | pyautogui.moveTo(*path[0]) 298 | for x, y in path[1:]: 299 | pyautogui.dragTo(x, y, duration=duration, button=button) 300 | return {"success": True} 301 | except Exception as e: 302 | return {"success": False, "error": str(e)} 303 | 304 | # Keyboard Actions 305 | async def key_down(self, key: str) -> Dict[str, Any]: 306 | """Press and hold a key. 307 | 308 | Args: 309 | key: The key to press down. 310 | 311 | Returns: 312 | Dict[str, Any]: A dictionary with success status and error message if failed. 313 | """ 314 | try: 315 | pyautogui.keyDown(key) 316 | return {"success": True} 317 | except Exception as e: 318 | return {"success": False, "error": str(e)} 319 | 320 | async def key_up(self, key: str) -> Dict[str, Any]: 321 | """Release a key. 322 | 323 | Args: 324 | key: The key to release. 325 | 326 | Returns: 327 | Dict[str, Any]: A dictionary with success status and error message if failed. 328 | """ 329 | try: 330 | pyautogui.keyUp(key) 331 | return {"success": True} 332 | except Exception as e: 333 | return {"success": False, "error": str(e)} 334 | 335 | async def type_text(self, text: str) -> Dict[str, Any]: 336 | """Type the specified text using the keyboard. 337 | 338 | Args: 339 | text: The text to type. 340 | 341 | Returns: 342 | Dict[str, Any]: A dictionary with success status and error message if failed. 343 | """ 344 | try: 345 | # use pynput for Unicode support 346 | self.keyboard.type(text) 347 | return {"success": True} 348 | except Exception as e: 349 | return {"success": False, "error": str(e)} 350 | 351 | async def press_key(self, key: str) -> Dict[str, Any]: 352 | """Press and release a key. 353 | 354 | Args: 355 | key: The key to press. 356 | 357 | Returns: 358 | Dict[str, Any]: A dictionary with success status and error message if failed. 359 | """ 360 | try: 361 | pyautogui.press(key) 362 | return {"success": True} 363 | except Exception as e: 364 | return {"success": False, "error": str(e)} 365 | 366 | async def hotkey(self, keys: List[str]) -> Dict[str, Any]: 367 | """Press a combination of keys simultaneously. 368 | 369 | Args: 370 | keys: A list of keys to press together as a hotkey combination. 371 | 372 | Returns: 373 | Dict[str, Any]: A dictionary with success status and error message if failed. 374 | """ 375 | try: 376 | pyautogui.hotkey(*keys) 377 | return {"success": True} 378 | except Exception as e: 379 | return {"success": False, "error": str(e)} 380 | 381 | # Scrolling Actions 382 | async def scroll(self, x: int, y: int) -> Dict[str, Any]: 383 | """Scroll the mouse wheel. 384 | 385 | Args: 386 | x: The horizontal scroll amount. 387 | y: The vertical scroll amount. 388 | 389 | Returns: 390 | Dict[str, Any]: A dictionary with success status and error message if failed. 391 | """ 392 | try: 393 | self.mouse.scroll(x, y) 394 | return {"success": True} 395 | except Exception as e: 396 | return {"success": False, "error": str(e)} 397 | 398 | async def scroll_down(self, clicks: int = 1) -> Dict[str, Any]: 399 | """Scroll down by the specified number of clicks. 400 | 401 | Args: 402 | clicks: The number of scroll clicks to perform downward. 
403 | 404 | Returns: 405 | Dict[str, Any]: A dictionary with success status and error message if failed. 406 | """ 407 | try: 408 | pyautogui.scroll(-clicks) 409 | return {"success": True} 410 | except Exception as e: 411 | return {"success": False, "error": str(e)} 412 | 413 | async def scroll_up(self, clicks: int = 1) -> Dict[str, Any]: 414 | """Scroll up by the specified number of clicks. 415 | 416 | Args: 417 | clicks: The number of scroll clicks to perform upward. 418 | 419 | Returns: 420 | Dict[str, Any]: A dictionary with success status and error message if failed. 421 | """ 422 | try: 423 | pyautogui.scroll(clicks) 424 | return {"success": True} 425 | except Exception as e: 426 | return {"success": False, "error": str(e)} 427 | 428 | # Screen Actions 429 | async def screenshot(self) -> Dict[str, Any]: 430 | """Take a screenshot of the current screen. 431 | 432 | Returns: 433 | Dict[str, Any]: A dictionary containing success status and base64-encoded image data, 434 | or error message if failed. 435 | """ 436 | try: 437 | from PIL import Image 438 | screenshot = pyautogui.screenshot() 439 | if not isinstance(screenshot, Image.Image): 440 | return {"success": False, "error": "Failed to capture screenshot"} 441 | buffered = BytesIO() 442 | screenshot.save(buffered, format="PNG", optimize=True) 443 | buffered.seek(0) 444 | image_data = base64.b64encode(buffered.getvalue()).decode() 445 | return {"success": True, "image_data": image_data} 446 | except Exception as e: 447 | return {"success": False, "error": f"Screenshot error: {str(e)}"} 448 | 449 | async def get_screen_size(self) -> Dict[str, Any]: 450 | """Get the size of the screen. 451 | 452 | Returns: 453 | Dict[str, Any]: A dictionary containing success status and screen dimensions, 454 | or error message if failed. 455 | """ 456 | try: 457 | size = pyautogui.size() 458 | return {"success": True, "size": {"width": size.width, "height": size.height}} 459 | except Exception as e: 460 | return {"success": False, "error": str(e)} 461 | 462 | async def get_cursor_position(self) -> Dict[str, Any]: 463 | """Get the current position of the cursor. 464 | 465 | Returns: 466 | Dict[str, Any]: A dictionary containing success status and cursor coordinates, 467 | or error message if failed. 468 | """ 469 | try: 470 | pos = pyautogui.position() 471 | return {"success": True, "position": {"x": pos.x, "y": pos.y}} 472 | except Exception as e: 473 | return {"success": False, "error": str(e)} 474 | 475 | # Clipboard Actions 476 | async def copy_to_clipboard(self) -> Dict[str, Any]: 477 | """Get the current content of the clipboard. 478 | 479 | Returns: 480 | Dict[str, Any]: A dictionary containing success status and clipboard content, 481 | or error message if failed. 482 | """ 483 | try: 484 | import pyperclip 485 | content = pyperclip.paste() 486 | return {"success": True, "content": content} 487 | except Exception as e: 488 | return {"success": False, "error": str(e)} 489 | 490 | async def set_clipboard(self, text: str) -> Dict[str, Any]: 491 | """Set the clipboard content to the specified text. 492 | 493 | Args: 494 | text: The text to copy to the clipboard. 495 | 496 | Returns: 497 | Dict[str, Any]: A dictionary with success status and error message if failed. 
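        Example (added illustration; assumes pyperclip is installed, as the
        implementation below requires, and a LinuxAutomationHandler instance):
            await handler.set_clipboard("hello")
            result = await handler.copy_to_clipboard()
            # result -> {"success": True, "content": "hello"}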
498 | """ 499 | try: 500 | import pyperclip 501 | pyperclip.copy(text) 502 | return {"success": True} 503 | except Exception as e: 504 | return {"success": False, "error": str(e)} 505 | 506 | # Command Execution 507 | async def run_command(self, command: str) -> Dict[str, Any]: 508 | """Execute a shell command asynchronously. 509 | 510 | Args: 511 | command: The shell command to execute. 512 | 513 | Returns: 514 | Dict[str, Any]: A dictionary containing success status, stdout, stderr, 515 | and return code, or error message if failed. 516 | """ 517 | try: 518 | # Create subprocess 519 | process = await asyncio.create_subprocess_shell( 520 | command, 521 | stdout=asyncio.subprocess.PIPE, 522 | stderr=asyncio.subprocess.PIPE 523 | ) 524 | # Wait for the subprocess to finish 525 | stdout, stderr = await process.communicate() 526 | # Return decoded output 527 | return { 528 | "success": True, 529 | "stdout": stdout.decode() if stdout else "", 530 | "stderr": stderr.decode() if stderr else "", 531 | "return_code": process.returncode 532 | } 533 | except Exception as e: 534 | return {"success": False, "error": str(e)} 535 | ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/winsandbox/provider.py: -------------------------------------------------------------------------------- ```python 1 | """Windows Sandbox VM provider implementation using pywinsandbox.""" 2 | 3 | import os 4 | import asyncio 5 | import logging 6 | import time 7 | from typing import Dict, Any, Optional, List 8 | from pathlib import Path 9 | 10 | from ..base import BaseVMProvider, VMProviderType 11 | 12 | # Setup logging 13 | logger = logging.getLogger(__name__) 14 | 15 | try: 16 | import winsandbox 17 | HAS_WINSANDBOX = True 18 | except ImportError: 19 | HAS_WINSANDBOX = False 20 | 21 | 22 | class WinSandboxProvider(BaseVMProvider): 23 | """Windows Sandbox VM provider implementation using pywinsandbox. 24 | 25 | This provider uses Windows Sandbox to create isolated Windows environments. 26 | Storage is always ephemeral with Windows Sandbox. 27 | """ 28 | 29 | def __init__( 30 | self, 31 | port: int = 7777, 32 | host: str = "localhost", 33 | storage: Optional[str] = None, 34 | verbose: bool = False, 35 | ephemeral: bool = True, # Windows Sandbox is always ephemeral 36 | memory_mb: int = 4096, 37 | networking: bool = True, 38 | **kwargs 39 | ): 40 | """Initialize the Windows Sandbox provider. 41 | 42 | Args: 43 | port: Port for the computer server (default: 7777) 44 | host: Host to use for connections (default: localhost) 45 | storage: Storage path (ignored - Windows Sandbox is always ephemeral) 46 | verbose: Enable verbose logging 47 | ephemeral: Always True for Windows Sandbox 48 | memory_mb: Memory allocation in MB (default: 4096) 49 | networking: Enable networking in sandbox (default: True) 50 | """ 51 | if not HAS_WINSANDBOX: 52 | raise ImportError( 53 | "pywinsandbox is required for WinSandboxProvider. " 54 | "Please install it with 'pip install pywinsandbox'" 55 | ) 56 | 57 | self.host = host 58 | self.port = port 59 | self.verbose = verbose 60 | self.memory_mb = memory_mb 61 | self.networking = networking 62 | 63 | # Windows Sandbox is always ephemeral 64 | if not ephemeral: 65 | logger.warning("Windows Sandbox storage is always ephemeral. 
Ignoring ephemeral=False.") 66 | self.ephemeral = True 67 | 68 | # Storage is always ephemeral for Windows Sandbox 69 | if storage and storage != "ephemeral": 70 | logger.warning("Windows Sandbox does not support persistent storage. Using ephemeral storage.") 71 | self.storage = "ephemeral" 72 | 73 | self.logger = logging.getLogger(__name__) 74 | 75 | # Track active sandboxes 76 | self._active_sandboxes: Dict[str, Any] = {} 77 | 78 | @property 79 | def provider_type(self) -> VMProviderType: 80 | """Get the provider type.""" 81 | return VMProviderType.WINSANDBOX 82 | 83 | async def __aenter__(self): 84 | """Enter async context manager.""" 85 | # Verify Windows Sandbox is available 86 | if not HAS_WINSANDBOX: 87 | raise ImportError("pywinsandbox is not available") 88 | 89 | return self 90 | 91 | async def __aexit__(self, exc_type, exc_val, exc_tb): 92 | """Exit async context manager.""" 93 | # Clean up any active sandboxes 94 | for name, sandbox in self._active_sandboxes.items(): 95 | try: 96 | sandbox.shutdown() 97 | self.logger.info(f"Terminated sandbox: {name}") 98 | except Exception as e: 99 | self.logger.error(f"Error terminating sandbox {name}: {e}") 100 | 101 | self._active_sandboxes.clear() 102 | 103 | async def get_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 104 | """Get VM information by name. 105 | 106 | Args: 107 | name: Name of the VM to get information for 108 | storage: Ignored for Windows Sandbox (always ephemeral) 109 | 110 | Returns: 111 | Dictionary with VM information including status, IP address, etc. 112 | """ 113 | if name not in self._active_sandboxes: 114 | return { 115 | "name": name, 116 | "status": "stopped", 117 | "ip_address": None, 118 | "storage": "ephemeral" 119 | } 120 | 121 | sandbox = self._active_sandboxes[name] 122 | 123 | # Check if sandbox is still running 124 | try: 125 | # Try to ping the sandbox to see if it's responsive 126 | try: 127 | sandbox.rpyc.modules.os.getcwd() 128 | sandbox_responsive = True 129 | except Exception: 130 | sandbox_responsive = False 131 | 132 | if not sandbox_responsive: 133 | return { 134 | "name": name, 135 | "status": "starting", 136 | "ip_address": None, 137 | "storage": "ephemeral", 138 | "memory_mb": self.memory_mb, 139 | "networking": self.networking 140 | } 141 | 142 | # Check for computer server address file 143 | server_address_file = r"C:\Users\WDAGUtilityAccount\Desktop\shared_windows_sandbox_dir\server_address" 144 | 145 | try: 146 | # Check if the server address file exists 147 | file_exists = sandbox.rpyc.modules.os.path.exists(server_address_file) 148 | 149 | if file_exists: 150 | # Read the server address file 151 | with sandbox.rpyc.builtin.open(server_address_file, 'r') as f: 152 | server_address = f.read().strip() 153 | 154 | if server_address and ':' in server_address: 155 | # Parse IP:port from the file 156 | ip_address, port = server_address.split(':', 1) 157 | 158 | # Verify the server is actually responding 159 | try: 160 | import socket 161 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 162 | sock.settimeout(3) 163 | result = sock.connect_ex((ip_address, int(port))) 164 | sock.close() 165 | 166 | if result == 0: 167 | # Server is responding 168 | status = "running" 169 | self.logger.debug(f"Computer server found at {ip_address}:{port}") 170 | else: 171 | # Server file exists but not responding 172 | status = "starting" 173 | ip_address = None 174 | except Exception as e: 175 | self.logger.debug(f"Error checking server connectivity: {e}") 176 | status = 
"starting" 177 | ip_address = None 178 | else: 179 | # File exists but doesn't contain valid address 180 | status = "starting" 181 | ip_address = None 182 | else: 183 | # Server address file doesn't exist yet 184 | status = "starting" 185 | ip_address = None 186 | 187 | except Exception as e: 188 | self.logger.debug(f"Error checking server address file: {e}") 189 | status = "starting" 190 | ip_address = None 191 | 192 | except Exception as e: 193 | self.logger.error(f"Error checking sandbox status: {e}") 194 | status = "error" 195 | ip_address = None 196 | 197 | return { 198 | "name": name, 199 | "status": status, 200 | "ip_address": ip_address, 201 | "storage": "ephemeral", 202 | "memory_mb": self.memory_mb, 203 | "networking": self.networking 204 | } 205 | 206 | async def list_vms(self) -> List[Dict[str, Any]]: 207 | """List all available VMs.""" 208 | vms = [] 209 | for name in self._active_sandboxes.keys(): 210 | vm_info = await self.get_vm(name) 211 | vms.append(vm_info) 212 | return vms 213 | 214 | async def run_vm(self, image: str, name: str, run_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]: 215 | """Run a VM with the given options. 216 | 217 | Args: 218 | image: Image name (ignored for Windows Sandbox - always uses host Windows) 219 | name: Name of the VM to run 220 | run_opts: Dictionary of run options (memory, cpu, etc.) 221 | storage: Ignored for Windows Sandbox (always ephemeral) 222 | 223 | Returns: 224 | Dictionary with VM run status and information 225 | """ 226 | if name in self._active_sandboxes: 227 | return { 228 | "success": False, 229 | "error": f"Sandbox {name} is already running" 230 | } 231 | 232 | try: 233 | # Extract options from run_opts 234 | memory_mb = run_opts.get("memory_mb", self.memory_mb) 235 | if isinstance(memory_mb, str): 236 | # Convert memory string like "4GB" to MB 237 | if memory_mb.upper().endswith("GB"): 238 | memory_mb = int(float(memory_mb[:-2]) * 1024) 239 | elif memory_mb.upper().endswith("MB"): 240 | memory_mb = int(memory_mb[:-2]) 241 | else: 242 | memory_mb = self.memory_mb 243 | 244 | networking = run_opts.get("networking", self.networking) 245 | 246 | # Create folder mappers; always map a persistent venv directory on host for caching packages 247 | folder_mappers = [] 248 | # Ensure host side persistent venv directory exists (Path.home()/wsb_venv) 249 | host_wsb_env = Path.home() / ".cua" / "wsb_cache" 250 | try: 251 | host_wsb_env.mkdir(parents=True, exist_ok=True) 252 | except Exception: 253 | # If cannot create, continue without persistent mapping 254 | host_wsb_env = None 255 | shared_directories = run_opts.get("shared_directories", []) 256 | for shared_dir in shared_directories: 257 | if isinstance(shared_dir, dict): 258 | host_path = shared_dir.get("hostPath", "") 259 | elif isinstance(shared_dir, str): 260 | host_path = shared_dir 261 | else: 262 | continue 263 | 264 | if host_path and os.path.exists(host_path): 265 | folder_mappers.append(winsandbox.FolderMapper(host_path)) 266 | 267 | # Add mapping for the persistent venv directory (read/write) so it appears in Sandbox Desktop 268 | if host_wsb_env is not None and host_wsb_env.exists(): 269 | try: 270 | folder_mappers.append( 271 | winsandbox.FolderMapper(str(host_wsb_env), read_only=False) 272 | ) 273 | except Exception as e: 274 | self.logger.warning(f"Failed to map host winsandbox_venv: {e}") 275 | 276 | self.logger.info(f"Creating Windows Sandbox: {name}") 277 | self.logger.info(f"Memory: {memory_mb}MB, Networking: {networking}") 278 | if 
folder_mappers: 279 | self.logger.info(f"Shared directories: {len(folder_mappers)}") 280 | 281 | # Create the sandbox without logon script 282 | try: 283 | # Try with memory_mb parameter (newer pywinsandbox version) 284 | sandbox = winsandbox.new_sandbox( 285 | memory_mb=str(memory_mb), 286 | networking=networking, 287 | folder_mappers=folder_mappers 288 | ) 289 | except TypeError as e: 290 | if "memory_mb" in str(e): 291 | # Fallback for older pywinsandbox version that doesn't support memory_mb 292 | self.logger.warning( 293 | f"Your pywinsandbox version doesn't support memory_mb parameter. " 294 | f"Using default memory settings. To use custom memory settings, " 295 | f"please update pywinsandbox: pip install -U git+https://github.com/karkason/pywinsandbox.git" 296 | ) 297 | sandbox = winsandbox.new_sandbox( 298 | networking=networking, 299 | folder_mappers=folder_mappers 300 | ) 301 | else: 302 | # Re-raise if it's a different TypeError 303 | raise 304 | 305 | # Store the sandbox 306 | self._active_sandboxes[name] = sandbox 307 | 308 | self.logger.info(f"Windows Sandbox {name} created successfully") 309 | 310 | venv_exists = (host_wsb_env / "venv" / "Lib" / "site-packages" / "computer_server").exists() if host_wsb_env else False 311 | 312 | # Setup the computer server in the sandbox 313 | await self._setup_computer_server(sandbox, name, wait_for_venv=(not venv_exists)) 314 | 315 | return { 316 | "success": True, 317 | "name": name, 318 | "status": "starting", 319 | "memory_mb": memory_mb, 320 | "networking": networking, 321 | "storage": "ephemeral" 322 | } 323 | 324 | except Exception as e: 325 | self.logger.error(f"Failed to create Windows Sandbox {name}: {e}") 326 | # stack trace 327 | import traceback 328 | self.logger.error(f"Stack trace: {traceback.format_exc()}") 329 | return { 330 | "success": False, 331 | "error": f"Failed to create sandbox: {str(e)}" 332 | } 333 | 334 | async def stop_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 335 | """Stop a running VM. 336 | 337 | Args: 338 | name: Name of the VM to stop 339 | storage: Ignored for Windows Sandbox 340 | 341 | Returns: 342 | Dictionary with stop status and information 343 | """ 344 | if name not in self._active_sandboxes: 345 | return { 346 | "success": False, 347 | "error": f"Sandbox {name} is not running" 348 | } 349 | 350 | try: 351 | sandbox = self._active_sandboxes[name] 352 | 353 | # Terminate the sandbox 354 | sandbox.shutdown() 355 | 356 | # Remove from active sandboxes 357 | del self._active_sandboxes[name] 358 | 359 | self.logger.info(f"Windows Sandbox {name} stopped successfully") 360 | 361 | return { 362 | "success": True, 363 | "name": name, 364 | "status": "stopped" 365 | } 366 | 367 | except Exception as e: 368 | self.logger.error(f"Failed to stop Windows Sandbox {name}: {e}") 369 | return { 370 | "success": False, 371 | "error": f"Failed to stop sandbox: {str(e)}" 372 | } 373 | 374 | async def update_vm(self, name: str, update_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]: 375 | """Update VM configuration. 376 | 377 | Note: Windows Sandbox does not support runtime configuration updates. 378 | The sandbox must be stopped and restarted with new configuration. 
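        (Added sketch of the stop-and-recreate workaround on a provider
        instance; the option values are placeholders, not recommendations, and
        the image argument is ignored by this provider:)

            await provider.stop_vm("my-sandbox")
            await provider.run_vm(image="", name="my-sandbox",
                                  run_opts={"memory_mb": 8192, "networking": True})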
379 | 380 | Args: 381 | name: Name of the VM to update 382 | update_opts: Dictionary of update options 383 | storage: Ignored for Windows Sandbox 384 | 385 | Returns: 386 | Dictionary with update status and information 387 | """ 388 | return { 389 | "success": False, 390 | "error": "Windows Sandbox does not support runtime configuration updates. " 391 | "Please stop and restart the sandbox with new configuration." 392 | } 393 | 394 | async def restart_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 395 | raise NotImplementedError("WinSandboxProvider does not support restarting VMs.") 396 | 397 | async def get_ip(self, name: str, storage: Optional[str] = None, retry_delay: int = 2) -> str: 398 | """Get the IP address of a VM, waiting indefinitely until it's available. 399 | 400 | Args: 401 | name: Name of the VM to get the IP for 402 | storage: Ignored for Windows Sandbox 403 | retry_delay: Delay between retries in seconds (default: 2) 404 | 405 | Returns: 406 | IP address of the VM when it becomes available 407 | """ 408 | total_attempts = 0 409 | 410 | # Loop indefinitely until we get a valid IP 411 | while True: 412 | total_attempts += 1 413 | 414 | # Log retry message but not on first attempt 415 | if total_attempts > 1: 416 | self.logger.info(f"Waiting for Windows Sandbox {name} IP address (attempt {total_attempts})...") 417 | 418 | try: 419 | # Get VM information 420 | vm_info = await self.get_vm(name, storage=storage) 421 | 422 | # Check if we got a valid IP 423 | ip = vm_info.get("ip_address", None) 424 | if ip and ip != "unknown" and not ip.startswith("0.0.0.0"): 425 | self.logger.info(f"Got valid Windows Sandbox IP address: {ip}") 426 | return ip 427 | 428 | # Check the VM status 429 | status = vm_info.get("status", "unknown") 430 | 431 | # If VM is not running yet, log and wait 432 | if status != "running": 433 | self.logger.info(f"Windows Sandbox is not running yet (status: {status}). Waiting...") 434 | # If VM is running but no IP yet, wait and retry 435 | else: 436 | self.logger.info("Windows Sandbox is running but no valid IP address yet. Waiting...") 437 | 438 | except Exception as e: 439 | self.logger.warning(f"Error getting Windows Sandbox {name} IP: {e}, continuing to wait...") 440 | 441 | # Wait before next retry 442 | await asyncio.sleep(retry_delay) 443 | 444 | # Add progress log every 10 attempts 445 | if total_attempts % 10 == 0: 446 | self.logger.info(f"Still waiting for Windows Sandbox {name} IP after {total_attempts} attempts...") 447 | 448 | async def _setup_computer_server(self, sandbox, name: str, visible: bool = False, wait_for_venv: bool = True): 449 | """Setup the computer server in the Windows Sandbox using RPyC. 
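        (Added summary of the implementation below: the bundled setup_script.ps1
        is written into the sandbox over RPyC, launched in the background with
        "powershell.exe -ExecutionPolicy Bypass -NoExit -File ...", and on a
        first run the provider sleeps to give the shared venv cache time to
        build before polling for the server address file.)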
450 | 451 | Args: 452 | sandbox: The Windows Sandbox instance 453 | name: Name of the sandbox 454 | visible: Whether the opened process should be visible (default: False) 455 | """ 456 | try: 457 | self.logger.info(f"Setting up computer server in sandbox {name}...") 458 | 459 | # Read the PowerShell setup script 460 | script_path = os.path.join(os.path.dirname(__file__), "setup_script.ps1") 461 | with open(script_path, 'r', encoding='utf-8') as f: 462 | setup_script_content = f.read() 463 | 464 | # Write the setup script to the sandbox using RPyC 465 | script_dest_path = r"C:\Users\WDAGUtilityAccount\setup_cua.ps1" 466 | 467 | self.logger.info(f"Writing setup script to {script_dest_path}") 468 | with sandbox.rpyc.builtin.open(script_dest_path, 'w') as f: 469 | f.write(setup_script_content) 470 | 471 | # Execute the PowerShell script in the background 472 | self.logger.info("Executing setup script in sandbox...") 473 | 474 | # Use subprocess to run PowerShell script 475 | import subprocess 476 | powershell_cmd = [ 477 | "powershell.exe", 478 | "-ExecutionPolicy", "Bypass", 479 | "-NoExit", # Keep window open after script completes 480 | "-File", script_dest_path 481 | ] 482 | 483 | # Set creation flags based on visibility preference 484 | if visible: 485 | # CREATE_NEW_CONSOLE - creates a new console window (visible) 486 | creation_flags = 0x00000010 487 | else: 488 | creation_flags = 0x08000000 # CREATE_NO_WINDOW 489 | 490 | # Start the process using RPyC 491 | process = sandbox.rpyc.modules.subprocess.Popen( 492 | powershell_cmd, 493 | creationflags=creation_flags, 494 | shell=False 495 | ) 496 | 497 | if wait_for_venv: 498 | print("Waiting for venv to be created for the first time setup of Windows Sandbox...") 499 | print("This may take a minute...") 500 | await asyncio.sleep(120) 501 | 502 | ip = await self.get_ip(name) 503 | self.logger.info(f"Sandbox IP: {ip}") 504 | self.logger.info(f"Setup script started in background in sandbox {name} with PID: {process.pid}") 505 | 506 | except Exception as e: 507 | self.logger.error(f"Failed to setup computer server in sandbox {name}: {e}") 508 | import traceback 509 | self.logger.error(f"Stack trace: {traceback.format_exc()}") 510 | ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/lume/provider.py: -------------------------------------------------------------------------------- ```python 1 | """Lume VM provider implementation using curl commands. 2 | 3 | This provider uses direct curl commands to interact with the Lume API, 4 | removing the dependency on the pylume Python package. 5 | """ 6 | 7 | import os 8 | import re 9 | import asyncio 10 | import json 11 | import logging 12 | import subprocess 13 | import urllib.parse 14 | from typing import Dict, Any, Optional, List, Tuple 15 | 16 | from ..base import BaseVMProvider, VMProviderType 17 | from ...logger import Logger, LogLevel 18 | from ..lume_api import ( 19 | lume_api_get, 20 | lume_api_run, 21 | lume_api_stop, 22 | lume_api_update, 23 | lume_api_pull, 24 | HAS_CURL, 25 | parse_memory 26 | ) 27 | 28 | # Setup logging 29 | logger = logging.getLogger(__name__) 30 | 31 | class LumeProvider(BaseVMProvider): 32 | """Lume VM provider implementation using direct curl commands. 33 | 34 | This provider uses curl to interact with the Lume API server, 35 | removing the dependency on the pylume Python package. 
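    Example (added illustration; the constructor arguments shown are the
    documented defaults and "my-vm" is a placeholder name):

        async with LumeProvider(port=7777, host="localhost") as provider:
            vms = await provider.list_vms()
            info = await provider.get_vm("my-vm")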
36 | """ 37 | 38 | def __init__( 39 | self, 40 | port: int = 7777, 41 | host: str = "localhost", 42 | storage: Optional[str] = None, 43 | verbose: bool = False, 44 | ephemeral: bool = False, 45 | ): 46 | """Initialize the Lume provider. 47 | 48 | Args: 49 | port: Port for the Lume API server (default: 7777) 50 | host: Host to use for API connections (default: localhost) 51 | storage: Path to store VM data 52 | verbose: Enable verbose logging 53 | """ 54 | if not HAS_CURL: 55 | raise ImportError( 56 | "curl is required for LumeProvider. " 57 | "Please ensure it is installed and in your PATH." 58 | ) 59 | 60 | self.host = host 61 | self.port = port # Default port for Lume API 62 | self.storage = storage 63 | self.verbose = verbose 64 | self.ephemeral = ephemeral # If True, VMs will be deleted after stopping 65 | 66 | # Base API URL for Lume API calls 67 | self.api_base_url = f"http://{self.host}:{self.port}" 68 | 69 | self.logger = logging.getLogger(__name__) 70 | 71 | @property 72 | def provider_type(self) -> VMProviderType: 73 | """Get the provider type.""" 74 | return VMProviderType.LUME 75 | 76 | async def __aenter__(self): 77 | """Enter async context manager.""" 78 | # No initialization needed, just return self 79 | return self 80 | 81 | async def __aexit__(self, exc_type, exc_val, exc_tb): 82 | """Exit async context manager.""" 83 | # No cleanup needed 84 | pass 85 | 86 | def _lume_api_get(self, vm_name: str = "", storage: Optional[str] = None, debug: bool = False) -> Dict[str, Any]: 87 | """Get VM information using shared lume_api function. 88 | 89 | Args: 90 | vm_name: Optional name of the VM to get info for. 91 | If empty, lists all VMs. 92 | storage: Optional storage path override. If provided, this will be used instead of self.storage 93 | debug: Whether to show debug output 94 | 95 | Returns: 96 | Dictionary with VM status information parsed from JSON response 97 | """ 98 | # Use the shared implementation from lume_api module 99 | return lume_api_get( 100 | vm_name=vm_name, 101 | host=self.host, 102 | port=self.port, 103 | storage=storage if storage is not None else self.storage, 104 | debug=debug, 105 | verbose=self.verbose 106 | ) 107 | 108 | def _lume_api_run(self, vm_name: str, run_opts: Dict[str, Any], debug: bool = False) -> Dict[str, Any]: 109 | """Run a VM using shared lume_api function. 110 | 111 | Args: 112 | vm_name: Name of the VM to run 113 | run_opts: Dictionary of run options 114 | debug: Whether to show debug output 115 | 116 | Returns: 117 | Dictionary with API response or error information 118 | """ 119 | # Use the shared implementation from lume_api module 120 | return lume_api_run( 121 | vm_name=vm_name, 122 | host=self.host, 123 | port=self.port, 124 | run_opts=run_opts, 125 | storage=self.storage, 126 | debug=debug, 127 | verbose=self.verbose 128 | ) 129 | 130 | def _lume_api_stop(self, vm_name: str, debug: bool = False) -> Dict[str, Any]: 131 | """Stop a VM using shared lume_api function. 
132 | 133 | Args: 134 | vm_name: Name of the VM to stop 135 | debug: Whether to show debug output 136 | 137 | Returns: 138 | Dictionary with API response or error information 139 | """ 140 | # Use the shared implementation from lume_api module 141 | return lume_api_stop( 142 | vm_name=vm_name, 143 | host=self.host, 144 | port=self.port, 145 | storage=self.storage, 146 | debug=debug, 147 | verbose=self.verbose 148 | ) 149 | 150 | def _lume_api_update(self, vm_name: str, update_opts: Dict[str, Any], debug: bool = False) -> Dict[str, Any]: 151 | """Update VM configuration using shared lume_api function. 152 | 153 | Args: 154 | vm_name: Name of the VM to update 155 | update_opts: Dictionary of update options 156 | debug: Whether to show debug output 157 | 158 | Returns: 159 | Dictionary with API response or error information 160 | """ 161 | # Use the shared implementation from lume_api module 162 | return lume_api_update( 163 | vm_name=vm_name, 164 | host=self.host, 165 | port=self.port, 166 | update_opts=update_opts, 167 | storage=self.storage, 168 | debug=debug, 169 | verbose=self.verbose 170 | ) 171 | 172 | async def get_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 173 | """Get VM information by name. 174 | 175 | Args: 176 | name: Name of the VM to get information for 177 | storage: Optional storage path override. If provided, this will be used 178 | instead of the provider's default storage path. 179 | 180 | Returns: 181 | Dictionary with VM information including status, IP address, etc. 182 | 183 | Note: 184 | If storage is not provided, the provider's default storage path will be used. 185 | The storage parameter allows overriding the storage location for this specific call. 186 | """ 187 | if not HAS_CURL: 188 | logger.error("curl is not available. 
Cannot get VM status.") 189 | return { 190 | "name": name, 191 | "status": "unavailable", 192 | "error": "curl is not available" 193 | } 194 | 195 | # First try to get detailed VM info from the API 196 | try: 197 | # Query the Lume API for VM status using the provider's storage_path 198 | vm_info = self._lume_api_get( 199 | vm_name=name, 200 | storage=storage if storage is not None else self.storage, 201 | debug=self.verbose 202 | ) 203 | 204 | # Check for API errors 205 | if "error" in vm_info: 206 | logger.debug(f"API request error: {vm_info['error']}") 207 | # If we got an error from the API, report the VM as not ready yet 208 | return { 209 | "name": name, 210 | "status": "starting", # VM is still starting - do not attempt to connect yet 211 | "api_status": "error", 212 | "error": vm_info["error"] 213 | } 214 | 215 | # Process the VM status information 216 | vm_status = vm_info.get("status", "unknown") 217 | 218 | # Check if VM is stopped or not running - don't wait for IP in this case 219 | if vm_status == "stopped": 220 | logger.info(f"VM {name} is in '{vm_status}' state - not waiting for IP address") 221 | # Return the status as-is without waiting for an IP 222 | result = { 223 | "name": name, 224 | "status": vm_status, 225 | **vm_info # Include all original fields from the API response 226 | } 227 | return result 228 | 229 | # Handle field name differences between APIs 230 | # Some APIs use camelCase, others use snake_case 231 | if "vncUrl" in vm_info: 232 | vnc_url = vm_info["vncUrl"] 233 | elif "vnc_url" in vm_info: 234 | vnc_url = vm_info["vnc_url"] 235 | else: 236 | vnc_url = "" 237 | 238 | if "ipAddress" in vm_info: 239 | ip_address = vm_info["ipAddress"] 240 | elif "ip_address" in vm_info: 241 | ip_address = vm_info["ip_address"] 242 | else: 243 | # If no IP address is provided and VM is supposed to be running, 244 | # report it as still starting 245 | ip_address = None 246 | logger.info(f"VM {name} is in '{vm_status}' state but no IP address found - reporting as still starting") 247 | 248 | logger.info(f"VM {name} status: {vm_status}") 249 | 250 | # Return the complete status information 251 | result = { 252 | "name": name, 253 | "status": vm_status if vm_status else "running", 254 | "ip_address": ip_address, 255 | "vnc_url": vnc_url, 256 | "api_status": "ok" 257 | } 258 | 259 | # Include all original fields from the API response 260 | if isinstance(vm_info, dict): 261 | for key, value in vm_info.items(): 262 | if key not in result: # Don't override our carefully processed fields 263 | result[key] = value 264 | 265 | return result 266 | 267 | except Exception as e: 268 | logger.error(f"Failed to get VM status: {e}") 269 | # Return a fallback status that indicates the VM is not ready yet 270 | return { 271 | "name": name, 272 | "status": "initializing", # VM is still initializing 273 | "error": f"Failed to get VM status: {str(e)}" 274 | } 275 | 276 | async def list_vms(self) -> List[Dict[str, Any]]: 277 | """List all available VMs.""" 278 | result = self._lume_api_get(debug=self.verbose) 279 | 280 | # Extract the VMs list from the response 281 | if "vms" in result and isinstance(result["vms"], list): 282 | return result["vms"] 283 | elif "error" in result: 284 | logger.error(f"Error listing VMs: {result['error']}") 285 | return [] 286 | else: 287 | return [] 288 | 289 | async def run_vm(self, image: str, name: str, run_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]: 290 | """Run a VM with the given options. 
291 | 292 | If the VM does not exist in the storage location, this will attempt to pull it 293 | from the Lume registry first. 294 | 295 | Args: 296 | image: Image name to use when pulling the VM if it doesn't exist 297 | name: Name of the VM to run 298 | run_opts: Dictionary of run options (memory, cpu, etc.) 299 | storage: Optional storage path override. If provided, this will be used 300 | instead of the provider's default storage path. 301 | 302 | Returns: 303 | Dictionary with VM run status and information 304 | """ 305 | # First check if VM exists by trying to get its info 306 | vm_info = await self.get_vm(name, storage=storage) 307 | 308 | if "error" in vm_info: 309 | # VM doesn't exist, try to pull it 310 | self.logger.info(f"VM {name} not found, attempting to pull image {image} from registry...") 311 | 312 | # Call pull_vm with the image parameter 313 | pull_result = await self.pull_vm( 314 | name=name, 315 | image=image, 316 | storage=storage 317 | ) 318 | 319 | # Check if pull was successful 320 | if "error" in pull_result: 321 | self.logger.error(f"Failed to pull VM image: {pull_result['error']}") 322 | return pull_result # Return the error from pull 323 | 324 | self.logger.info(f"Successfully pulled VM image {image} as {name}") 325 | 326 | # Now run the VM with the given options 327 | self.logger.info(f"Running VM {name} with options: {run_opts}") 328 | 329 | from ..lume_api import lume_api_run 330 | return lume_api_run( 331 | vm_name=name, 332 | host=self.host, 333 | port=self.port, 334 | run_opts=run_opts, 335 | storage=storage if storage is not None else self.storage, 336 | debug=self.verbose, 337 | verbose=self.verbose 338 | ) 339 | 340 | async def stop_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 341 | """Stop a running VM. 342 | 343 | If this provider was initialized with ephemeral=True, the VM will also 344 | be deleted after it is stopped. 345 | 346 | Args: 347 | name: Name of the VM to stop 348 | storage: Optional storage path override 349 | 350 | Returns: 351 | Dictionary with stop status and information 352 | """ 353 | # Stop the VM first 354 | stop_result = self._lume_api_stop(name, debug=self.verbose) 355 | 356 | # Log ephemeral status for debugging 357 | self.logger.info(f"Ephemeral mode status: {self.ephemeral}") 358 | 359 | # If ephemeral mode is enabled, delete the VM after stopping 360 | if self.ephemeral and (stop_result.get("success", False) or "error" not in stop_result): 361 | self.logger.info(f"Ephemeral mode enabled - deleting VM {name} after stopping") 362 | try: 363 | delete_result = await self.delete_vm(name, storage=storage) 364 | 365 | # Return combined result 366 | return { 367 | **stop_result, # Include all stop result info 368 | "deleted": True, 369 | "delete_result": delete_result 370 | } 371 | except Exception as e: 372 | self.logger.error(f"Failed to delete ephemeral VM {name}: {e}") 373 | # Include the error but still return stop result 374 | return { 375 | **stop_result, 376 | "deleted": False, 377 | "delete_error": str(e) 378 | } 379 | 380 | # Just return the stop result if not ephemeral 381 | return stop_result 382 | 383 | async def pull_vm( 384 | self, 385 | name: str, 386 | image: str, 387 | storage: Optional[str] = None, 388 | registry: str = "ghcr.io", 389 | organization: str = "trycua", 390 | pull_opts: Optional[Dict[str, Any]] = None, 391 | ) -> Dict[str, Any]: 392 | """Pull a VM image from the registry. 393 | 394 | Args: 395 | name: Name for the VM after pulling 396 | image: The image name to pull (e.g. 
'macos-sequoia-cua:latest') 397 | storage: Optional storage path to use 398 | registry: Registry to pull from (default: ghcr.io) 399 | organization: Organization in registry (default: trycua) 400 | pull_opts: Additional options for pulling the VM (optional) 401 | 402 | Returns: 403 | Dictionary with information about the pulled VM 404 | 405 | Raises: 406 | RuntimeError: If pull operation fails or image is not provided 407 | """ 408 | # Validate image parameter 409 | if not image: 410 | raise ValueError("Image parameter is required for pull_vm") 411 | 412 | self.logger.info(f"Pulling VM image '{image}' as '{name}'") 413 | self.logger.info("You can check the pull progress using: lume logs -f") 414 | 415 | # Set default pull_opts if not provided 416 | if pull_opts is None: 417 | pull_opts = {} 418 | 419 | # Log information about the operation 420 | self.logger.debug(f"Pull storage location: {storage or 'default'}") 421 | 422 | try: 423 | # Call the lume_api_pull function from lume_api.py 424 | from ..lume_api import lume_api_pull 425 | 426 | result = lume_api_pull( 427 | image=image, 428 | name=name, 429 | host=self.host, 430 | port=self.port, 431 | storage=storage if storage is not None else self.storage, 432 | registry=registry, 433 | organization=organization, 434 | debug=self.verbose, 435 | verbose=self.verbose 436 | ) 437 | 438 | # Check for errors in the result 439 | if "error" in result: 440 | self.logger.error(f"Failed to pull VM image: {result['error']}") 441 | return result 442 | 443 | self.logger.info(f"Successfully pulled VM image '{image}' as '{name}'") 444 | return result 445 | except Exception as e: 446 | self.logger.error(f"Failed to pull VM image '{image}': {e}") 447 | return {"error": f"Failed to pull VM: {str(e)}"} 448 | 449 | async def delete_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 450 | """Delete a VM permanently. 451 | 452 | Args: 453 | name: Name of the VM to delete 454 | storage: Optional storage path override 455 | 456 | Returns: 457 | Dictionary with delete status and information 458 | """ 459 | self.logger.info(f"Deleting VM {name}...") 460 | 461 | try: 462 | # Call the lume_api_delete function we created 463 | from ..lume_api import lume_api_delete 464 | 465 | result = lume_api_delete( 466 | vm_name=name, 467 | host=self.host, 468 | port=self.port, 469 | storage=storage if storage is not None else self.storage, 470 | debug=self.verbose, 471 | verbose=self.verbose 472 | ) 473 | 474 | # Check for errors in the result 475 | if "error" in result: 476 | self.logger.error(f"Failed to delete VM: {result['error']}") 477 | return result 478 | 479 | self.logger.info(f"Successfully deleted VM '{name}'") 480 | return result 481 | except Exception as e: 482 | self.logger.error(f"Failed to delete VM '{name}': {e}") 483 | return {"error": f"Failed to delete VM: {str(e)}"} 484 | 485 | async def update_vm(self, name: str, update_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]: 486 | """Update VM configuration.""" 487 | return self._lume_api_update(name, update_opts, debug=self.verbose) 488 | 489 | async def restart_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 490 | raise NotImplementedError("LumeProvider does not support restarting VMs.") 491 | 492 | async def get_ip(self, name: str, storage: Optional[str] = None, retry_delay: int = 2) -> str: 493 | """Get the IP address of a VM, waiting indefinitely until it's available. 
494 | 495 | Args: 496 | name: Name of the VM to get the IP for 497 | storage: Optional storage path override 498 | retry_delay: Delay between retries in seconds (default: 2) 499 | 500 | Returns: 501 | IP address of the VM when it becomes available 502 | """ 503 | # Track total attempts for logging purposes 504 | total_attempts = 0 505 | 506 | # Loop indefinitely until we get a valid IP 507 | while True: 508 | total_attempts += 1 509 | 510 | # Log retry message but not on first attempt 511 | if total_attempts > 1: 512 | self.logger.info(f"Waiting for VM {name} IP address (attempt {total_attempts})...") 513 | 514 | try: 515 | # Get VM information 516 | vm_info = await self.get_vm(name, storage=storage) 517 | 518 | # Check if we got a valid IP 519 | ip = vm_info.get("ip_address", None) 520 | if ip and ip != "unknown" and not ip.startswith("0.0.0.0"): 521 | self.logger.info(f"Got valid VM IP address: {ip}") 522 | return ip 523 | 524 | # Check the VM status 525 | status = vm_info.get("status", "unknown") 526 | 527 | # If VM is not running yet, log and wait 528 | if status != "running": 529 | self.logger.info(f"VM is not running yet (status: {status}). Waiting...") 530 | # If VM is running but no IP yet, wait and retry 531 | else: 532 | self.logger.info("VM is running but no valid IP address yet. Waiting...") 533 | 534 | except Exception as e: 535 | self.logger.warning(f"Error getting VM {name} IP: {e}, continuing to wait...") 536 | 537 | # Wait before next retry 538 | await asyncio.sleep(retry_delay) 539 | 540 | # Add progress log every 10 attempts 541 | if total_attempts % 10 == 0: 542 | self.logger.info(f"Still waiting for VM {name} IP after {total_attempts} attempts...") 543 | 544 | 545 | ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/docker/provider.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Docker VM provider implementation. 3 | 4 | This provider uses Docker containers running the CUA Ubuntu image to create 5 | Linux VMs with computer-server. It handles VM lifecycle operations through Docker 6 | commands and container management. 7 | """ 8 | 9 | import logging 10 | import json 11 | import asyncio 12 | from typing import Dict, List, Optional, Any 13 | import subprocess 14 | import time 15 | import re 16 | 17 | from ..base import BaseVMProvider, VMProviderType 18 | 19 | # Setup logging 20 | logger = logging.getLogger(__name__) 21 | 22 | # Check if Docker is available 23 | try: 24 | subprocess.run(["docker", "--version"], capture_output=True, check=True) 25 | HAS_DOCKER = True 26 | except (subprocess.SubprocessError, FileNotFoundError): 27 | HAS_DOCKER = False 28 | 29 | 30 | class DockerProvider(BaseVMProvider): 31 | """ 32 | Docker VM Provider implementation using Docker containers. 33 | 34 | This provider uses Docker to run containers with the CUA Ubuntu image 35 | that includes computer-server for remote computer use. 36 | """ 37 | 38 | def __init__( 39 | self, 40 | port: Optional[int] = 8000, 41 | host: str = "localhost", 42 | storage: Optional[str] = None, 43 | shared_path: Optional[str] = None, 44 | image: str = "trycua/cua-ubuntu:latest", 45 | verbose: bool = False, 46 | ephemeral: bool = False, 47 | vnc_port: Optional[int] = 6901, 48 | ): 49 | """Initialize the Docker VM Provider. 
50 | 51 | Args: 52 | port: Currently unused (VM provider port) 53 | host: Hostname for the API server (default: localhost) 54 | storage: Path for persistent VM storage 55 | shared_path: Path for shared folder between host and container 56 | image: Docker image to use (default: "trycua/cua-ubuntu:latest") 57 | Supported images: 58 | - "trycua/cua-ubuntu:latest" (Kasm-based) 59 | - "trycua/cua-docker-xfce:latest" (vanilla XFCE) 60 | verbose: Enable verbose logging 61 | ephemeral: Use ephemeral (temporary) storage 62 | vnc_port: Port for VNC interface (default: 6901) 63 | """ 64 | self.host = host 65 | self.api_port = 8000 66 | self.vnc_port = vnc_port 67 | self.ephemeral = ephemeral 68 | 69 | # Handle ephemeral storage (temporary directory) 70 | if ephemeral: 71 | self.storage = "ephemeral" 72 | else: 73 | self.storage = storage 74 | 75 | self.shared_path = shared_path 76 | self.image = image 77 | self.verbose = verbose 78 | self._container_id = None 79 | self._running_containers = {} # Track running containers by name 80 | 81 | # Detect image type and configure user directory accordingly 82 | self._detect_image_config() 83 | 84 | def _detect_image_config(self): 85 | """Detect image type and configure paths accordingly.""" 86 | # Detect if this is a docker-xfce image or Kasm image 87 | if "docker-xfce" in self.image.lower() or "xfce" in self.image.lower(): 88 | self._home_dir = "/home/cua" 89 | self._image_type = "docker-xfce" 90 | logger.info(f"Detected docker-xfce image: using {self._home_dir}") 91 | else: 92 | # Default to Kasm configuration 93 | self._home_dir = "/home/kasm-user" 94 | self._image_type = "kasm" 95 | logger.info(f"Detected Kasm image: using {self._home_dir}") 96 | 97 | @property 98 | def provider_type(self) -> VMProviderType: 99 | """Return the provider type.""" 100 | return VMProviderType.DOCKER 101 | 102 | def _parse_memory(self, memory_str: str) -> str: 103 | """Parse memory string to Docker format. 104 | 105 | Examples: 106 | "8GB" -> "8g" 107 | "1024MB" -> "1024m" 108 | "512" -> "512m" 109 | """ 110 | if isinstance(memory_str, int): 111 | return f"{memory_str}m" 112 | 113 | if isinstance(memory_str, str): 114 | # Extract number and unit 115 | match = re.match(r"(\d+)([A-Za-z]*)", memory_str) 116 | if match: 117 | value, unit = match.groups() 118 | unit = unit.upper() 119 | 120 | if unit == "GB" or unit == "G": 121 | return f"{value}g" 122 | elif unit == "MB" or unit == "M" or unit == "": 123 | return f"{value}m" 124 | 125 | # Default fallback 126 | logger.warning(f"Could not parse memory string '{memory_str}', using 4g default") 127 | return "4g" # Default to 4GB 128 | 129 | async def get_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 130 | """Get VM information by name. 131 | 132 | Args: 133 | name: Name of the VM to get information for 134 | storage: Optional storage path override. If provided, this will be used 135 | instead of the provider's default storage path. 136 | 137 | Returns: 138 | Dictionary with VM information including status, IP address, etc. 
139 | """ 140 | try: 141 | # Check if container exists and get its status 142 | cmd = ["docker", "inspect", name] 143 | result = subprocess.run(cmd, capture_output=True, text=True) 144 | 145 | if result.returncode != 0: 146 | # Container doesn't exist 147 | return { 148 | "name": name, 149 | "status": "not_found", 150 | "ip_address": None, 151 | "ports": {}, 152 | "image": self.image, 153 | "provider": "docker" 154 | } 155 | 156 | # Parse container info 157 | container_info = json.loads(result.stdout)[0] 158 | state = container_info["State"] 159 | network_settings = container_info["NetworkSettings"] 160 | 161 | # Determine status 162 | if state["Running"]: 163 | status = "running" 164 | elif state["Paused"]: 165 | status = "paused" 166 | else: 167 | status = "stopped" 168 | 169 | # Get IP address 170 | ip_address = network_settings.get("IPAddress", "") 171 | if not ip_address and "Networks" in network_settings: 172 | # Try to get IP from bridge network 173 | for network_name, network_info in network_settings["Networks"].items(): 174 | if network_info.get("IPAddress"): 175 | ip_address = network_info["IPAddress"] 176 | break 177 | 178 | # Get port mappings 179 | ports = {} 180 | if "Ports" in network_settings and network_settings["Ports"]: 181 | # network_settings["Ports"] is a dict like: 182 | # {'6901/tcp': [{'HostIp': '0.0.0.0', 'HostPort': '6901'}, ...], ...} 183 | for container_port, port_mappings in network_settings["Ports"].items(): 184 | if port_mappings: # Check if there are any port mappings 185 | # Take the first mapping (usually the IPv4 one) 186 | for mapping in port_mappings: 187 | if mapping.get("HostPort"): 188 | ports[container_port] = mapping["HostPort"] 189 | break # Use the first valid mapping 190 | 191 | return { 192 | "name": name, 193 | "status": status, 194 | "ip_address": ip_address or "127.0.0.1", # Use localhost if no IP 195 | "ports": ports, 196 | "image": container_info["Config"]["Image"], 197 | "provider": "docker", 198 | "container_id": container_info["Id"][:12], # Short ID 199 | "created": container_info["Created"], 200 | "started": state.get("StartedAt", ""), 201 | } 202 | 203 | except Exception as e: 204 | logger.error(f"Error getting VM info for {name}: {e}") 205 | import traceback 206 | traceback.print_exc() 207 | return { 208 | "name": name, 209 | "status": "error", 210 | "error": str(e), 211 | "provider": "docker" 212 | } 213 | 214 | async def list_vms(self) -> List[Dict[str, Any]]: 215 | """List all Docker containers managed by this provider.""" 216 | try: 217 | # List all containers (running and stopped) with the CUA image 218 | cmd = ["docker", "ps", "-a", "--filter", f"ancestor={self.image}", "--format", "json"] 219 | result = subprocess.run(cmd, capture_output=True, text=True, check=True) 220 | 221 | containers = [] 222 | if result.stdout.strip(): 223 | for line in result.stdout.strip().split('\n'): 224 | if line.strip(): 225 | container_data = json.loads(line) 226 | vm_info = await self.get_vm(container_data["Names"]) 227 | containers.append(vm_info) 228 | 229 | return containers 230 | 231 | except subprocess.CalledProcessError as e: 232 | logger.error(f"Error listing containers: {e.stderr}") 233 | return [] 234 | except Exception as e: 235 | logger.error(f"Error listing VMs: {e}") 236 | import traceback 237 | traceback.print_exc() 238 | return [] 239 | 240 | async def run_vm(self, image: str, name: str, run_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]: 241 | """Run a VM with the given options. 
242 | 243 | Args: 244 | image: Name/tag of the Docker image to use 245 | name: Name of the container to run 246 | run_opts: Options for running the VM, including: 247 | - memory: Memory limit (e.g., "4GB", "2048MB") 248 | - cpu: CPU limit (e.g., 2 for 2 cores) 249 | - vnc_port: Specific port for VNC interface 250 | - api_port: Specific port for computer-server API 251 | 252 | Returns: 253 | Dictionary with VM status information 254 | """ 255 | try: 256 | # Check if container already exists 257 | existing_vm = await self.get_vm(name, storage) 258 | if existing_vm["status"] == "running": 259 | logger.info(f"Container {name} is already running") 260 | return existing_vm 261 | elif existing_vm["status"] in ["stopped", "paused"]: 262 | # Start existing container 263 | logger.info(f"Starting existing container {name}") 264 | start_cmd = ["docker", "start", name] 265 | result = subprocess.run(start_cmd, capture_output=True, text=True, check=True) 266 | 267 | # Wait for container to be ready 268 | await self._wait_for_container_ready(name) 269 | return await self.get_vm(name, storage) 270 | 271 | # Use provided image or default 272 | docker_image = image if image != "default" else self.image 273 | 274 | # Build docker run command 275 | cmd = ["docker", "run", "-d", "--name", name] 276 | 277 | # Add memory limit if specified 278 | if "memory" in run_opts: 279 | memory_limit = self._parse_memory(run_opts["memory"]) 280 | cmd.extend(["--memory", memory_limit]) 281 | 282 | # Add CPU limit if specified 283 | if "cpu" in run_opts: 284 | cpu_count = str(run_opts["cpu"]) 285 | cmd.extend(["--cpus", cpu_count]) 286 | 287 | # Add port mappings 288 | vnc_port = run_opts.get("vnc_port", self.vnc_port) 289 | api_port = run_opts.get("api_port", self.api_port) 290 | 291 | if vnc_port: 292 | cmd.extend(["-p", f"{vnc_port}:6901"]) # VNC port 293 | if api_port: 294 | cmd.extend(["-p", f"{api_port}:8000"]) # computer-server API port 295 | 296 | # Add volume mounts if storage is specified 297 | storage_path = storage or self.storage 298 | if storage_path and storage_path != "ephemeral": 299 | # Mount storage directory using detected home directory 300 | cmd.extend(["-v", f"{storage_path}:{self._home_dir}/storage"]) 301 | 302 | # Add shared path if specified 303 | if self.shared_path: 304 | # Mount shared directory using detected home directory 305 | cmd.extend(["-v", f"{self.shared_path}:{self._home_dir}/shared"]) 306 | 307 | # Add environment variables 308 | cmd.extend(["-e", "VNC_PW=password"]) # Set VNC password 309 | cmd.extend(["-e", "VNCOPTIONS=-disableBasicAuth"]) # Disable VNC basic auth 310 | 311 | # Add the image 312 | cmd.append(docker_image) 313 | 314 | logger.info(f"Running Docker container with command: {' '.join(cmd)}") 315 | 316 | # Run the container 317 | result = subprocess.run(cmd, capture_output=True, text=True, check=True) 318 | container_id = result.stdout.strip() 319 | 320 | logger.info(f"Container {name} started with ID: {container_id[:12]}") 321 | 322 | # Store container info 323 | self._container_id = container_id 324 | self._running_containers[name] = container_id 325 | 326 | # Wait for container to be ready 327 | await self._wait_for_container_ready(name) 328 | 329 | # Return VM info 330 | vm_info = await self.get_vm(name, storage) 331 | vm_info["container_id"] = container_id[:12] 332 | 333 | return vm_info 334 | 335 | except subprocess.CalledProcessError as e: 336 | error_msg = f"Failed to run container {name}: {e.stderr}" 337 | logger.error(error_msg) 338 | return { 339 | "name": name, 
340 | "status": "error", 341 | "error": error_msg, 342 | "provider": "docker" 343 | } 344 | except Exception as e: 345 | error_msg = f"Error running VM {name}: {e}" 346 | logger.error(error_msg) 347 | return { 348 | "name": name, 349 | "status": "error", 350 | "error": error_msg, 351 | "provider": "docker" 352 | } 353 | 354 | async def _wait_for_container_ready(self, container_name: str, timeout: int = 60) -> bool: 355 | """Wait for the Docker container to be fully ready. 356 | 357 | Args: 358 | container_name: Name of the Docker container to check 359 | timeout: Maximum time to wait in seconds (default: 60 seconds) 360 | 361 | Returns: 362 | True if the container is running and ready 363 | """ 364 | logger.info(f"Waiting for container {container_name} to be ready...") 365 | 366 | start_time = time.time() 367 | while time.time() - start_time < timeout: 368 | try: 369 | # Check if container is running 370 | vm_info = await self.get_vm(container_name) 371 | if vm_info["status"] == "running": 372 | logger.info(f"Container {container_name} is running") 373 | 374 | # Additional check: try to connect to computer-server API 375 | # This is optional - we'll just wait a bit more for services to start 376 | await asyncio.sleep(5) 377 | return True 378 | 379 | except Exception as e: 380 | logger.debug(f"Container {container_name} not ready yet: {e}") 381 | 382 | await asyncio.sleep(2) 383 | 384 | logger.warning(f"Container {container_name} did not become ready within {timeout} seconds") 385 | return False 386 | 387 | async def stop_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 388 | """Stop a running VM by stopping the Docker container.""" 389 | try: 390 | logger.info(f"Stopping container {name}") 391 | 392 | # Stop the container 393 | cmd = ["docker", "stop", name] 394 | result = subprocess.run(cmd, capture_output=True, text=True, check=True) 395 | 396 | # Remove from running containers tracking 397 | if name in self._running_containers: 398 | del self._running_containers[name] 399 | 400 | logger.info(f"Container {name} stopped successfully") 401 | 402 | return { 403 | "name": name, 404 | "status": "stopped", 405 | "message": "Container stopped successfully", 406 | "provider": "docker" 407 | } 408 | 409 | except subprocess.CalledProcessError as e: 410 | error_msg = f"Failed to stop container {name}: {e.stderr}" 411 | logger.error(error_msg) 412 | return { 413 | "name": name, 414 | "status": "error", 415 | "error": error_msg, 416 | "provider": "docker" 417 | } 418 | except Exception as e: 419 | error_msg = f"Error stopping VM {name}: {e}" 420 | logger.error(error_msg) 421 | return { 422 | "name": name, 423 | "status": "error", 424 | "error": error_msg, 425 | "provider": "docker" 426 | } 427 | 428 | async def restart_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]: 429 | raise NotImplementedError("DockerProvider does not support restarting VMs.") 430 | 431 | async def update_vm(self, name: str, update_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any]: 432 | """Update VM configuration. 433 | 434 | Note: Docker containers cannot be updated while running. 435 | This method will return an error suggesting to recreate the container. 436 | """ 437 | return { 438 | "name": name, 439 | "status": "error", 440 | "error": "Docker containers cannot be updated while running. 
Please stop and recreate the container with new options.", 441 | "provider": "docker" 442 | } 443 | 444 | async def get_ip(self, name: str, storage: Optional[str] = None, retry_delay: int = 2) -> str: 445 | """Get the IP address of a VM, waiting indefinitely until it's available. 446 | 447 | Args: 448 | name: Name of the VM to get the IP for 449 | storage: Optional storage path override 450 | retry_delay: Delay between retries in seconds (default: 2) 451 | 452 | Returns: 453 | IP address of the VM when it becomes available 454 | """ 455 | logger.info(f"Getting IP address for container {name}") 456 | 457 | total_attempts = 0 458 | while True: 459 | total_attempts += 1 460 | 461 | try: 462 | vm_info = await self.get_vm(name, storage) 463 | 464 | if vm_info["status"] == "error": 465 | raise Exception(f"VM is in error state: {vm_info.get('error', 'Unknown error')}") 466 | 467 | # TODO: for now, return localhost 468 | # it seems the docker container is not accessible from the host 469 | # on WSL2, unless you port forward? not sure 470 | if True: 471 | logger.warning("Overriding container IP with localhost") 472 | return "localhost" 473 | 474 | # Check if we got a valid IP 475 | ip = vm_info.get("ip_address", None) 476 | if ip and ip != "unknown" and not ip.startswith("0.0.0.0"): 477 | logger.info(f"Got valid container IP address: {ip}") 478 | return ip 479 | 480 | # For Docker containers, we can also use localhost if ports are mapped 481 | if vm_info["status"] == "running" and vm_info.get("ports"): 482 | logger.info(f"Container is running with port mappings, using localhost") 483 | return "127.0.0.1" 484 | 485 | # Check the container status 486 | status = vm_info.get("status", "unknown") 487 | 488 | if status == "stopped": 489 | logger.info(f"Container status is {status}, but still waiting for it to start") 490 | elif status != "running": 491 | logger.info(f"Container is not running yet (status: {status}). Waiting...") 492 | else: 493 | logger.info("Container is running but no valid IP address yet. Waiting...") 494 | 495 | except Exception as e: 496 | logger.warning(f"Error getting container {name} IP: {e}, continuing to wait...") 497 | 498 | # Wait before next retry 499 | await asyncio.sleep(retry_delay) 500 | 501 | # Add progress log every 10 attempts 502 | if total_attempts % 10 == 0: 503 | logger.info(f"Still waiting for container {name} IP after {total_attempts} attempts...") 504 | 505 | async def __aenter__(self): 506 | """Async context manager entry.""" 507 | logger.debug("Entering DockerProvider context") 508 | return self 509 | 510 | async def __aexit__(self, exc_type, exc_val, exc_tb): 511 | """Async context manager exit. 512 | 513 | This method handles cleanup of running containers if needed. 
514 | """ 515 | logger.debug(f"Exiting DockerProvider context, handling exceptions: {exc_type}") 516 | try: 517 | # Optionally stop running containers on context exit 518 | # For now, we'll leave containers running as they might be needed 519 | # Users can manually stop them if needed 520 | pass 521 | except Exception as e: 522 | logger.error(f"Error during DockerProvider cleanup: {e}") 523 | if exc_type is None: 524 | raise 525 | return False 526 | ``` -------------------------------------------------------------------------------- /blog/build-your-own-operator-on-macos-1.md: -------------------------------------------------------------------------------- ```markdown 1 | # Build Your Own Operator on macOS - Part 1 2 | 3 | *Published on March 31, 2025 by Francesco Bonacci* 4 | 5 | In this first blogpost, we'll learn how to build our own Computer-Use Operator using OpenAI's `computer-use-preview` model. But first, let's understand what some common terms mean: 6 | 7 | - A **Virtual Machine (VM)** is like a computer within your computer - a safe, isolated environment where the AI can work without affecting your main system. 8 | - **computer-use-preview** is OpenAI's specialized language model trained to understand and interact with computer interfaces through screenshots. 9 | - A **Computer-Use Agent** is an AI agent that can control a computer just like a human would - clicking buttons, typing text, and interacting with applications. 10 | 11 | Our Operator will run in an isolated macOS VM, by making use of our [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer) package and [lume virtualization CLI](https://github.com/trycua/cua/tree/main/libs/lume). 12 | 13 | Check out what it looks like to use your own Operator from a Gradio app: 14 | 15 | <div align="center"> 16 | <video src="https://github.com/user-attachments/assets/a2cf69ad-2ab2-4eb9-8e1a-45606dd7eec6" width="600" controls></video> 17 | </div> 18 | 19 | ## What You'll Learn 20 | 21 | By the end of this tutorial, you'll be able to: 22 | - Set up a macOS virtual machine for AI automation 23 | - Connect OpenAI's computer-use model to your VM 24 | - Create a basic loop for the AI to interact with your VM 25 | - Handle different types of computer actions (clicking, typing, etc.) 26 | - Implement safety checks and error handling 27 | 28 | **Prerequisites:** 29 | - macOS Sonoma (14.0) or later 30 | - 8GB RAM minimum (16GB recommended) 31 | - OpenAI API access (Tier 3+) 32 | - Basic Python knowledge 33 | - Familiarity with terminal commands 34 | 35 | **Estimated Time:** 45-60 minutes 36 | 37 | ## Introduction to Computer-Use Agents 38 | 39 | Last March OpenAI released a fine-tuned version of GPT-4o, namely [CUA](https://openai.com/index/computer-using-agent/), introducing pixel-level vision capabilities with advanced reasoning through reinforcement learning. This fine-tuning enables the computer-use model to interpret screenshots and interact with graphical user interfaces on a pixel-level such as buttons, menus, and text fields - mimicking human interactions on a computer screen. It scores a remarkable 38.1% success rate on [OSWorld](https://os-world.github.io) - a benchmark for Computer-Use agents on Linux and Windows. This is the 2nd available model after Anthropic's [Claude 3.5 Sonnet](https://www.anthropic.com/news/3-5-models-and-computer-use) to support computer-use capabilities natively with no external models (e.g. accessory [SoM (Set-of-Mark)](https://arxiv.org/abs/2310.11441) and OCR runs). 
40 | 41 | Professor Ethan Mollick provides an excellent explanation of computer-use agents in this article: [When you give a Claude a mouse](https://www.oneusefulthing.org/p/when-you-give-a-claude-a-mouse). 42 | 43 | ### ChatGPT Operator 44 | OpenAI's computer-use model powers [ChatGPT Operator](https://openai.com/index/introducing-operator), a Chromium-based interface exclusively available to ChatGPT Pro subscribers. Users leverage this functionality to automate web-based tasks such as online shopping, expense report submission, and booking reservations by interacting with websites in a human-like manner. 45 | 46 | ## Benefits of Custom Operators 47 | 48 | ### Why Build Your Own? 49 | While OpenAI's Operator uses a controlled Chromium VM instance, there are scenarios where you may want to use your own VM with full desktop capabilities. Here are some examples: 50 | 51 | - Automating native macOS apps like Finder, Xcode 52 | - Managing files, changing settings, and running terminal commands 53 | - Testing desktop software and applications 54 | - Creating workflows that combine web and desktop tasks 55 | - Automating media editing in apps like Final Cut Pro and Blender 56 | 57 | This gives you more control and flexibility to automate tasks beyond just web browsing, with full access to interact with native applications and system-level operations. Additionally, running your own VM locally provides better privacy for sensitive user files and delivers superior performance by leveraging your own hardware instead of renting expensive Cloud VMs. 58 | 59 | ## Access Requirements 60 | 61 | ### Model Availability 62 | As we speak, the **computer-use-preview** model has limited availability: 63 | - Only accessible to OpenAI tier 3+ users 64 | - Additional application process may be required even for eligible users 65 | - Cannot be used in the OpenAI Playground 66 | - Outside of ChatGPT Operator, usage is restricted to the new **Responses API** 67 | 68 | ## Understanding the OpenAI API 69 | 70 | ### Responses API Overview 71 | Let's start with the basics. In our case, we'll use OpenAI's Responses API to communicate with their computer-use model. 72 | 73 | Think of it like this: 74 | 1. We send the model a screenshot of our VM and tell it what we want it to do 75 | 2. The model looks at the screenshot and decides what actions to take 76 | 3. It sends back instructions (like "click here" or "type this") 77 | 4. We execute those instructions in our VM 78 | 79 | The [Responses API](https://platform.openai.com/docs/guides/responses) is OpenAI's newest way to interact with their AI models. It comes with several built-in tools: 80 | - **Web search**: Let the AI search the internet 81 | - **File search**: Help the AI find documents 82 | - **Computer use**: Allow the AI to control a computer (what we'll be using) 83 | 84 | As we speak, the computer-use model is only available through the Responses API. 85 | 86 | ### Responses API Examples 87 | Let's look at some simple examples. We'll start with the traditional way of using OpenAI's API with Chat Completions, then show the new Responses API primitive. 
88 | 89 | Chat Completions: 90 | ```python 91 | # The old way required managing conversation history manually 92 | messages = [{"role": "user", "content": "Hello"}] 93 | response = client.chat.completions.create( 94 | model="gpt-4", 95 | messages=messages # We had to track all messages ourselves 96 | ) 97 | messages.append(response.choices[0].message) # Manual message tracking 98 | ``` 99 | 100 | Responses API: 101 | ```python 102 | # Example 1: Simple web search 103 | # The API handles all the complexity for us 104 | response = client.responses.create( 105 | model="gpt-4", 106 | input=[{ 107 | "role": "user", 108 | "content": "What's the latest news about AI?" 109 | }], 110 | tools=[{ 111 | "type": "web_search", # Tell the API to use web search 112 | "search_query": "latest AI news" 113 | }] 114 | ) 115 | 116 | # Example 2: File search 117 | # Looking for specific documents becomes easy 118 | response = client.responses.create( 119 | model="gpt-4", 120 | input=[{ 121 | "role": "user", 122 | "content": "Find documents about project X" 123 | }], 124 | tools=[{ 125 | "type": "file_search", 126 | "query": "project X", 127 | "file_types": ["pdf", "docx"] # Specify which file types to look for 128 | }] 129 | ) 130 | ``` 131 | 132 | ### Computer-Use Model Setup 133 | For our operator, we'll use the computer-use model. Here's how we set it up: 134 | 135 | ```python 136 | # Set up the computer-use model to control our VM 137 | response = client.responses.create( 138 | model="computer-use-preview", # Special model for computer control 139 | tools=[{ 140 | "type": "computer_use_preview", 141 | "display_width": 1024, # Size of our VM screen 142 | "display_height": 768, 143 | "environment": "mac" # Tell it we're using macOS. 144 | }], 145 | input=[ 146 | { 147 | "role": "user", 148 | "content": [ 149 | # What we want the AI to do 150 | {"type": "input_text", "text": "Open Safari and go to google.com"}, 151 | # Current screenshot of our VM 152 | {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_base64}"} 153 | ] 154 | } 155 | ], 156 | truncation="auto" # Let OpenAI handle message length 157 | ) 158 | ``` 159 | 160 | ### Understanding the Response 161 | When we send a request, the API sends back a response that looks like this: 162 | 163 | ```json 164 | "output": [ 165 | { 166 | "type": "reasoning", # The AI explains what it's thinking 167 | "id": "rs_67cc...", 168 | "summary": [ 169 | { 170 | "type": "summary_text", 171 | "text": "Clicking on the browser address bar." 172 | } 173 | ] 174 | }, 175 | { 176 | "type": "computer_call", # The actual action to perform 177 | "id": "cu_67cc...", 178 | "call_id": "call_zw3...", 179 | "action": { 180 | "type": "click", # What kind of action (click, type, etc.) 181 | "button": "left", # Which mouse button to use 182 | "x": 156, # Where to click (coordinates) 183 | "y": 50 184 | }, 185 | "pending_safety_checks": [], # Any safety warnings to consider 186 | "status": "completed" # Whether the action was successful 187 | } 188 | ] 189 | ``` 190 | 191 | Each response contains: 192 | 1. **Reasoning**: The AI's explanation of what it's doing 193 | 2. **Action**: The specific computer action to perform 194 | 3. **Safety Checks**: Any potential risks to review 195 | 4. **Status**: Whether everything worked as planned 196 | 197 | ## CUA-Computer Interface 198 | 199 | ### Architecture Overview 200 | Let's break down the main components of our system and how they work together: 201 | 202 | 1. 
**The Virtual Machine (VM)** 203 | - Think of this as a safe playground for our AI 204 | - It's a complete macOS system running inside your computer 205 | - Anything the AI does stays inside this VM, keeping your main system safe 206 | - We use `lume` to create and manage this VM 207 | 208 | 2. **The Computer Interface (CUI)** 209 | - This is how we control the VM 210 | - It can move the mouse, type text, and take screenshots 211 | - Works like a remote control for the VM 212 | - Built using our `cua-computer` package 213 | 214 | 3. **The OpenAI Model** 215 | - This is the brain of our operator 216 | - It looks at screenshots of the VM 217 | - Decides what actions to take 218 | - Sends back instructions like "click here" or "type this" 219 | 220 | Here's how they all work together: 221 | 222 | ```mermaid 223 | sequenceDiagram 224 | participant User as You 225 | participant CUI as Computer Interface 226 | participant VM as Virtual Machine 227 | participant AI as OpenAI API 228 | 229 | Note over User,AI: The Main Loop 230 | User->>CUI: Start the operator 231 | CUI->>VM: Create macOS sandbox 232 | activate VM 233 | VM-->>CUI: VM is ready 234 | 235 | loop Action Loop 236 | Note over CUI,AI: Each iteration 237 | CUI->>VM: Take a screenshot 238 | VM-->>CUI: Return current screen 239 | CUI->>AI: Send screenshot + instructions 240 | AI-->>CUI: Return next action 241 | 242 | Note over CUI,VM: Execute the action 243 | alt Mouse Click 244 | CUI->>VM: Move and click mouse 245 | else Type Text 246 | CUI->>VM: Type characters 247 | else Scroll Screen 248 | CUI->>VM: Scroll window 249 | else Press Keys 250 | CUI->>VM: Press keyboard keys 251 | else Wait 252 | CUI->>VM: Pause for a moment 253 | end 254 | end 255 | 256 | VM-->>CUI: Task finished 257 | deactivate VM 258 | CUI-->>User: All done! 259 | ``` 260 | 261 | The diagram above shows how information flows through our system: 262 | 1. You start the operator 263 | 2. The Computer Interface creates a virtual macOS 264 | 3. Then it enters a loop: 265 | - Take a picture of the VM screen 266 | - Send it to OpenAI with instructions 267 | - Get back an action to perform 268 | - Execute that action in the VM 269 | - Repeat until the task is done 270 | 271 | This design keeps everything organized and safe. The AI can only interact with the VM through our controlled interface, and the VM keeps the AI's actions isolated from your main system. 272 | 273 | --- 274 | 275 | ## Implementation Guide 276 | 277 | ### Prerequisites 278 | 279 | 1. **Lume CLI Setup** 280 | For installing the standalone lume binary, run the following command from a terminal, or download the [latest pkg](https://github.com/trycua/cua/releases/latest/download/lume.pkg.tar.gz). 
281 | 282 | ```bash 283 | sudo /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" 284 | ``` 285 | 286 | **Important Storage Notes:** 287 | - Initial download requires 80GB of free space 288 | - After first run, space usage reduces to ~30GB due to macOS's sparse file system 289 | - VMs are stored in `~/.lume` 290 | - Cached images are stored in `~/.lume/cache` 291 | 292 | You can check your downloaded VM images anytime: 293 | ```bash 294 | lume ls 295 | ``` 296 | 297 | Example output: 298 | 299 | | name | os | cpu | memory | disk | display | status | ip | vnc | 300 | |--------------------------|---------|-------|---------|----------------|-----------|-----------|----------------|---------------------------------------------------| 301 | | macos-sequoia-cua:latest | macOS | 12 | 16.00G | 64.5GB/80.0GB | 1024x768 | running | 192.168.64.78 | vnc://:[email protected]:56085 | 302 | 303 | After checking your available images, you can run the VM to ensure everything is working correctly: 304 | ```bash 305 | lume run macos-sequoia-cua:latest 306 | ``` 307 | 308 | 2. **Python Environment Setup** 309 | **Note**: The `cua-computer` package requires Python 3.10 or later. We recommend creating a dedicated Python environment: 310 | 311 | **Using venv:** 312 | ```bash 313 | python -m venv cua-env 314 | source cua-env/bin/activate 315 | ``` 316 | 317 | **Using conda:** 318 | ```bash 319 | conda create -n cua-env python=3.10 320 | conda activate cua-env 321 | ``` 322 | 323 | Then install the required packages: 324 | 325 | ```bash 326 | pip install openai 327 | pip install cua-computer 328 | ``` 329 | 330 | Ensure you have an OpenAI API key (set as an environment variable or in your OpenAI configuration). 331 | 332 | ### Building the Operator 333 | 334 | #### Importing Required Modules 335 | With the prerequisites installed and configured, we're ready to build our first operator. 336 | The following example uses asynchronous Python (async/await). You can run it either in a VS Code Notebook or as a standalone Python script. 337 | 338 | ```python 339 | import asyncio 340 | import base64 341 | import openai 342 | 343 | from computer import Computer 344 | ``` 345 | 346 | #### Mapping API Actions to CUA Methods 347 | The following helper function converts a `computer_call` action from the OpenAI Responses API into corresponding commands on the CUI interface. For example, if the API instructs a `click` action, we move the cursor and perform a left click on the lume VM Sandbox. We will use the computer interface to execute the actions. 
348 | 349 | ```python 350 | async def execute_action(computer, action): 351 | action_type = action.type 352 | 353 | if action_type == "click": 354 | x = action.x 355 | y = action.y 356 | button = action.button 357 | print(f"Executing click at ({x}, {y}) with button '{button}'") 358 | await computer.interface.move_cursor(x, y) 359 | if button == "right": 360 | await computer.interface.right_click() 361 | else: 362 | await computer.interface.left_click() 363 | 364 | elif action_type == "type": 365 | text = action.text 366 | print(f"Typing text: {text}") 367 | await computer.interface.type_text(text) 368 | 369 | elif action_type == "scroll": 370 | x = action.x 371 | y = action.y 372 | scroll_x = action.scroll_x 373 | scroll_y = action.scroll_y 374 | print(f"Scrolling at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})") 375 | await computer.interface.move_cursor(x, y) 376 | await computer.interface.scroll(scroll_y) # Using vertical scroll only 377 | 378 | elif action_type == "keypress": 379 | keys = action.keys 380 | for key in keys: 381 | print(f"Pressing key: {key}") 382 | # Map common key names to CUA equivalents 383 | if key.lower() == "enter": 384 | await computer.interface.press_key("return") 385 | elif key.lower() == "space": 386 | await computer.interface.press_key("space") 387 | else: 388 | await computer.interface.press_key(key) 389 | 390 | elif action_type == "wait": 391 | wait_time = action.time 392 | print(f"Waiting for {wait_time} seconds") 393 | await asyncio.sleep(wait_time) 394 | 395 | elif action_type == "screenshot": 396 | print("Taking screenshot") 397 | # This is handled automatically in the main loop, but we can take an extra one if requested 398 | screenshot = await computer.interface.screenshot() 399 | return screenshot 400 | 401 | else: 402 | print(f"Unrecognized action: {action_type}") 403 | ``` 404 | 405 | #### Implementing the Computer-Use Loop 406 | This section defines a loop that: 407 | 408 | 1. Initializes the cua-computer instance (connecting to a macOS sandbox). 409 | 2. Captures a screenshot of the current state. 410 | 3. Sends the screenshot (with a user prompt) to the OpenAI Responses API using the `computer-use-preview` model. 411 | 4. Processes the returned `computer_call` action and executes it using our helper function. 412 | 5. Captures an updated screenshot after the action (this example runs one iteration, but you can wrap it in a loop). 413 | 414 | For a full loop, you would repeat these steps until no further actions are returned. 
415 | 416 | ```python 417 | async def cua_openai_loop(): 418 | # Initialize the lume computer instance (macOS sandbox) 419 | async with Computer( 420 | display="1024x768", 421 | memory="4GB", 422 | cpu="2", 423 | os_type="macos" 424 | ) as computer: 425 | await computer.run() # Start the lume VM 426 | 427 | # Capture the initial screenshot 428 | screenshot = await computer.interface.screenshot() 429 | screenshot_base64 = base64.b64encode(screenshot).decode('utf-8') 430 | 431 | # Initial request to start the loop 432 | response = openai.responses.create( 433 | model="computer-use-preview", 434 | tools=[{ 435 | "type": "computer_use_preview", 436 | "display_width": 1024, 437 | "display_height": 768, 438 | "environment": "mac" 439 | }], 440 | input=[ 441 | { 442 | "role": "user", 443 | "content": [ 444 | {"type": "input_text", "text": "Open Safari, download and install Cursor."}, 445 | {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_base64}"} 446 | ] 447 | } 448 | ], 449 | truncation="auto" 450 | ) 451 | 452 | # Continue the loop until no more computer_call actions 453 | while True: 454 | # Check for computer_call actions 455 | computer_calls = [item for item in response.output if item and item.type == "computer_call"] 456 | if not computer_calls: 457 | print("No more computer calls. Loop complete.") 458 | break 459 | 460 | # Get the first computer call 461 | call = computer_calls[0] 462 | last_call_id = call.call_id 463 | action = call.action 464 | print("Received action from OpenAI Responses API:", action) 465 | 466 | # Handle any pending safety checks 467 | if call.pending_safety_checks: 468 | print("Safety checks pending:", call.pending_safety_checks) 469 | # In a real implementation, you would want to get user confirmation here 470 | acknowledged_checks = call.pending_safety_checks 471 | else: 472 | acknowledged_checks = [] 473 | 474 | # Execute the action 475 | await execute_action(computer, action) 476 | await asyncio.sleep(1) # Allow time for changes to take effect 477 | 478 | # Capture new screenshot after action 479 | new_screenshot = await computer.interface.screenshot() 480 | new_screenshot_base64 = base64.b64encode(new_screenshot).decode('utf-8') 481 | 482 | # Send the screenshot back as computer_call_output 483 | response = openai.responses.create( 484 | model="computer-use-preview", 485 | tools=[{ 486 | "type": "computer_use_preview", 487 | "display_width": 1024, 488 | "display_height": 768, 489 | "environment": "mac" 490 | }], 491 | input=[{ 492 | "type": "computer_call_output", 493 | "call_id": last_call_id, 494 | "acknowledged_safety_checks": acknowledged_checks, 495 | "output": { 496 | "type": "input_image", 497 | "image_url": f"data:image/png;base64,{new_screenshot_base64}" 498 | } 499 | }], 500 | truncation="auto" 501 | ) 502 | 503 | # End the session 504 | await computer.stop() 505 | 506 | # Run the loop 507 | if __name__ == "__main__": 508 | asyncio.run(cua_openai_loop()) 509 | ``` 510 | 511 | You can find the full code in our [notebook](https://github.com/trycua/cua/blob/main/notebooks/blog/build-your-own-operator-on-macos-1.ipynb). 512 | 513 | #### Request Handling Differences 514 | The first request to the OpenAI Responses API is special in that it includes the initial screenshot and prompt. Subsequent requests are handled differently, using the `computer_call_output` type to provide feedback on the executed action. 
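In schematic form, here is a minimal side-by-side sketch of the two `input` payload shapes already used in the loop above (placeholder values only; the screenshot strings and `call_id` come from your own session):

```python
# Schematic recap of the two input payloads used above (placeholder values only).

# Initial request: a user message carrying the prompt and the first screenshot.
initial_input = [
    {
        "role": "user",
        "content": [
            {"type": "input_text", "text": "Open Safari, download and install Cursor."},
            {"type": "input_image", "image_url": "data:image/png;base64,<screenshot_base64>"},
        ],
    }
]

# Follow-up request: a computer_call_output tied to the previous action via call_id.
followup_input = [
    {
        "type": "computer_call_output",
        "call_id": "<call_id from the previous computer_call>",
        "acknowledged_safety_checks": [],  # include any checks the user approved
        "output": {
            "type": "input_image",
            "image_url": "data:image/png;base64,<new_screenshot_base64>",
        },
    }
]
```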
515 | 516 | ##### Initial Request Format 517 | - We use `role: "user"` with `content` that contains both `input_text` (the prompt) and `input_image` (the screenshot) 518 | 519 | ##### Subsequent Request Format 520 | - We use `type: "computer_call_output"` instead of the user role 521 | - We include the `call_id` to link the output to the specific previous action that was executed 522 | - We provide any `acknowledged_safety_checks` that were approved 523 | - We include the new screenshot in the `output` field 524 | 525 | This structured approach allows the API to maintain context and continuity throughout the interaction session. 526 | 527 | **Note**: For multi-turn conversations, you should include the `previous_response_id` in your initial requests when starting a new conversation with prior context. However, when using `computer_call_output` for action feedback, you don't need to explicitly manage the conversation history - OpenAI's API automatically tracks the context using the `call_id`. The `previous_response_id` is primarily important when the user provides additional instructions or when starting a new request that should continue from a previous session. 528 | 529 | ## Conclusion 530 | 531 | ### Summary 532 | This blogpost demonstrates a single iteration of a OpenAI Computer-Use loop where: 533 | 534 | - A macOS sandbox is controlled using the CUA interface. 535 | - A screenshot and prompt are sent to the OpenAI Responses API. 536 | - The returned action (e.g. a click or type command) is executed via the CUI interface. 537 | 538 | In a production setting, you would wrap the action-response cycle in a loop, handling multiple actions and safety checks as needed. 539 | 540 | ### Next Steps 541 | In the next blogpost, we'll introduce our Agent framework which abstracts away all these tedious implementation steps. This framework provides a higher-level API that handles the interaction loop between OpenAI's computer-use model and the macOS sandbox, allowing you to focus on building sophisticated applications rather than managing the low-level details we've explored here. Can't wait? Check out the [cua-agent](https://github.com/trycua/cua/tree/main/libs/agent) package! 542 | 543 | ### Resources 544 | - [OpenAI Computer-Use docs](https://platform.openai.com/docs/guides/tools-computer-use) 545 | - [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer) 546 | - [lume](https://github.com/trycua/cua/tree/main/libs/lume) 547 | ``` -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/diorama/diorama.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 2 | """Diorama: A virtual desktop manager for macOS""" 3 | 4 | import os 5 | import asyncio 6 | import logging 7 | import sys 8 | import io 9 | from typing import Union 10 | from PIL import Image, ImageDraw 11 | 12 | from computer_server.diorama.draw import capture_all_apps, AppActivationContext, get_frontmost_and_active_app, get_all_windows, get_running_apps 13 | 14 | from computer_server.diorama.diorama_computer import DioramaComputer 15 | from computer_server.handlers.macos import * 16 | 17 | # simple, nicely formatted logging 18 | logger = logging.getLogger(__name__) 19 | 20 | automation_handler = MacOSAutomationHandler() 21 | 22 | class Diorama: 23 | """Virtual desktop manager that provides automation capabilities for macOS applications. 
24 | 25 | Manages application windows and provides an interface for taking screenshots, 26 | mouse interactions, keyboard input, and coordinate transformations between 27 | screenshot space and screen space. 28 | """ 29 | _scheduler_queue = None 30 | _scheduler_task = None 31 | _loop = None 32 | _scheduler_started = False 33 | 34 | @classmethod 35 | def create_from_apps(cls, *args) -> DioramaComputer: 36 | """Create a DioramaComputer instance from a list of application names. 37 | 38 | Args: 39 | *args: Variable number of application names to include in the desktop 40 | 41 | Returns: 42 | DioramaComputer: A computer interface for the specified applications 43 | """ 44 | cls._ensure_scheduler() 45 | return cls(args).computer 46 | 47 | # Dictionary to store cursor positions for each unique app_list hash 48 | _cursor_positions = {} 49 | 50 | def __init__(self, app_list): 51 | """Initialize a Diorama instance for the specified applications. 52 | 53 | Args: 54 | app_list: List of application names to manage 55 | """ 56 | self.app_list = app_list 57 | self.interface = self.Interface(self) 58 | self.computer = DioramaComputer(self) 59 | self.focus_context = None 60 | 61 | # Create a hash for this app_list to use as a key 62 | self.app_list_hash = hash(tuple(sorted(app_list))) 63 | 64 | # Initialize cursor position for this app_list if it doesn't exist 65 | if self.app_list_hash not in Diorama._cursor_positions: 66 | Diorama._cursor_positions[self.app_list_hash] = (0, 0) 67 | 68 | @classmethod 69 | def _ensure_scheduler(cls): 70 | """Ensure the async scheduler loop is running. 71 | 72 | Creates and starts the scheduler task if it hasn't been started yet. 73 | """ 74 | if not cls._scheduler_started: 75 | logger.info("Starting Diorama scheduler loop…") 76 | cls._scheduler_queue = asyncio.Queue() 77 | cls._loop = asyncio.get_event_loop() 78 | cls._scheduler_task = cls._loop.create_task(cls._scheduler_loop()) 79 | cls._scheduler_started = True 80 | 81 | @classmethod 82 | async def _scheduler_loop(cls): 83 | """Main scheduler loop that processes automation commands. 84 | 85 | Continuously processes commands from the scheduler queue, handling 86 | screenshots, mouse actions, keyboard input, and scrolling operations. 
87 | """ 88 | while True: 89 | cmd = await cls._scheduler_queue.get() 90 | action = cmd.get("action") 91 | args = cmd.get("arguments", {}) 92 | future = cmd.get("future") 93 | logger.info(f"Processing command: {action} | args={args}") 94 | 95 | app_whitelist = args.get("app_list", []) 96 | 97 | all_windows = get_all_windows() 98 | running_apps = get_running_apps() 99 | frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist) 100 | focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger) 101 | 102 | with focus_context: 103 | try: 104 | if action == "screenshot": 105 | logger.info(f"Taking screenshot for apps: {app_whitelist}") 106 | result, img = capture_all_apps( 107 | app_whitelist=app_whitelist, 108 | save_to_disk=False, 109 | take_focus=False 110 | ) 111 | logger.info("Screenshot complete.") 112 | if future: 113 | future.set_result((result, img)) 114 | # Mouse actions 115 | elif action in ["left_click", "right_click", "double_click", "move_cursor", "drag_to"]: 116 | x = args.get("x") 117 | y = args.get("y") 118 | 119 | duration = args.get("duration", 0.5) 120 | if action == "left_click": 121 | await automation_handler.left_click(x, y) 122 | elif action == "right_click": 123 | await automation_handler.right_click(x, y) 124 | elif action == "double_click": 125 | await automation_handler.double_click(x, y) 126 | elif action == "move_cursor": 127 | await automation_handler.move_cursor(x, y) 128 | elif action == "drag_to": 129 | await automation_handler.drag_to(x, y, duration=duration) 130 | if future: 131 | future.set_result(None) 132 | elif action in ["scroll_up", "scroll_down"]: 133 | x = args.get("x") 134 | y = args.get("y") 135 | if x is not None and y is not None: 136 | await automation_handler.move_cursor(x, y) 137 | 138 | clicks = args.get("clicks", 1) 139 | if action == "scroll_up": 140 | await automation_handler.scroll_up(clicks) 141 | else: 142 | await automation_handler.scroll_down(clicks) 143 | if future: 144 | future.set_result(None) 145 | # Keyboard actions 146 | elif action == "type_text": 147 | text = args.get("text") 148 | await automation_handler.type_text(text) 149 | if future: 150 | future.set_result(None) 151 | elif action == "press_key": 152 | key = args.get("key") 153 | await automation_handler.press_key(key) 154 | if future: 155 | future.set_result(None) 156 | elif action == "hotkey": 157 | keys = args.get("keys", []) 158 | await automation_handler.hotkey(keys) 159 | if future: 160 | future.set_result(None) 161 | elif action == "get_cursor_position": 162 | pos = await automation_handler.get_cursor_position() 163 | if future: 164 | future.set_result(pos) 165 | else: 166 | logger.warning(f"Unknown action: {action}") 167 | if future: 168 | future.set_exception(ValueError(f"Unknown action: {action}")) 169 | except Exception as e: 170 | logger.error(f"Exception during {action}: {e}", exc_info=True) 171 | if future: 172 | future.set_exception(e) 173 | 174 | class Interface(): 175 | """Interface for interacting with the virtual desktop. 176 | 177 | Provides methods for taking screenshots, mouse interactions, keyboard input, 178 | and coordinate transformations between screenshot and screen coordinates. 179 | """ 180 | 181 | def __init__(self, diorama): 182 | """Initialize the interface with a reference to the parent Diorama instance. 
183 | 184 | Args: 185 | diorama: The parent Diorama instance 186 | """ 187 | self._diorama = diorama 188 | 189 | self._scene_hitboxes = [] 190 | self._scene_size = None 191 | 192 | async def _send_cmd(self, action, arguments=None): 193 | """Send a command to the scheduler queue. 194 | 195 | Args: 196 | action (str): The action to perform 197 | arguments (dict, optional): Arguments for the action 198 | 199 | Returns: 200 | The result of the command execution 201 | """ 202 | Diorama._ensure_scheduler() 203 | loop = asyncio.get_event_loop() 204 | future = loop.create_future() 205 | logger.info(f"Enqueuing {action} command for apps: {self._diorama.app_list}") 206 | await Diorama._scheduler_queue.put({ 207 | "action": action, 208 | "arguments": {"app_list": self._diorama.app_list, **(arguments or {})}, 209 | "future": future 210 | }) 211 | try: 212 | return await future 213 | except asyncio.CancelledError: 214 | logger.warning(f"Command was cancelled: {action}") 215 | return None 216 | 217 | async def screenshot(self, as_bytes: bool = True) -> Union[str, Image.Image]: 218 | """Take a screenshot of the managed applications. 219 | 220 | Args: 221 | as_bytes (bool): If True, return base64-encoded bytes; if False, return PIL Image 222 | 223 | Returns: 224 | Union[str, Image.Image]: Base64-encoded PNG bytes or PIL Image object 225 | """ 226 | import base64 227 | result, img = await self._send_cmd("screenshot") 228 | self._scene_hitboxes = result.get("hitboxes", []) 229 | self._scene_size = img.size 230 | 231 | if as_bytes: 232 | # PIL Image to bytes, then base64 encode for JSON 233 | import io 234 | img_byte_arr = io.BytesIO() 235 | img.save(img_byte_arr, format="PNG") 236 | img_bytes = img_byte_arr.getvalue() 237 | img_b64 = base64.b64encode(img_bytes).decode("ascii") 238 | return img_b64 239 | else: 240 | return img 241 | 242 | async def left_click(self, x, y): 243 | """Perform a left mouse click at the specified coordinates. 244 | 245 | Args: 246 | x (int): X coordinate in screenshot space (or None to use last position) 247 | y (int): Y coordinate in screenshot space (or None to use last position) 248 | """ 249 | # Get last cursor position for this app_list hash 250 | app_list_hash = hash(tuple(sorted(self._diorama.app_list))) 251 | last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) 252 | x, y = x or last_pos[0], y or last_pos[1] 253 | # Update cursor position for this app_list hash 254 | Diorama._cursor_positions[app_list_hash] = (x, y) 255 | 256 | sx, sy = await self.to_screen_coordinates(x, y) 257 | await self._send_cmd("left_click", {"x": sx, "y": sy}) 258 | 259 | async def right_click(self, x, y): 260 | """Perform a right mouse click at the specified coordinates. 261 | 262 | Args: 263 | x (int): X coordinate in screenshot space (or None to use last position) 264 | y (int): Y coordinate in screenshot space (or None to use last position) 265 | """ 266 | # Get last cursor position for this app_list hash 267 | app_list_hash = hash(tuple(sorted(self._diorama.app_list))) 268 | last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) 269 | x, y = x or last_pos[0], y or last_pos[1] 270 | # Update cursor position for this app_list hash 271 | Diorama._cursor_positions[app_list_hash] = (x, y) 272 | 273 | sx, sy = await self.to_screen_coordinates(x, y) 274 | await self._send_cmd("right_click", {"x": sx, "y": sy}) 275 | 276 | async def double_click(self, x, y): 277 | """Perform a double mouse click at the specified coordinates. 
278 | 279 | Args: 280 | x (int): X coordinate in screenshot space (or None to use last position) 281 | y (int): Y coordinate in screenshot space (or None to use last position) 282 | """ 283 | # Get last cursor position for this app_list hash 284 | app_list_hash = hash(tuple(sorted(self._diorama.app_list))) 285 | last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) 286 | x, y = x or last_pos[0], y or last_pos[1] 287 | # Update cursor position for this app_list hash 288 | Diorama._cursor_positions[app_list_hash] = (x, y) 289 | 290 | sx, sy = await self.to_screen_coordinates(x, y) 291 | await self._send_cmd("double_click", {"x": sx, "y": sy}) 292 | 293 | async def move_cursor(self, x, y): 294 | """Move the mouse cursor to the specified coordinates. 295 | 296 | Args: 297 | x (int): X coordinate in screenshot space (or None to use last position) 298 | y (int): Y coordinate in screenshot space (or None to use last position) 299 | """ 300 | # Get last cursor position for this app_list hash 301 | app_list_hash = hash(tuple(sorted(self._diorama.app_list))) 302 | last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) 303 | x, y = x or last_pos[0], y or last_pos[1] 304 | # Update cursor position for this app_list hash 305 | Diorama._cursor_positions[app_list_hash] = (x, y) 306 | 307 | sx, sy = await self.to_screen_coordinates(x, y) 308 | await self._send_cmd("move_cursor", {"x": sx, "y": sy}) 309 | 310 | async def drag_to(self, x, y, duration=0.5): 311 | """Drag the mouse from current position to the specified coordinates. 312 | 313 | Args: 314 | x (int): X coordinate in screenshot space (or None to use last position) 315 | y (int): Y coordinate in screenshot space (or None to use last position) 316 | duration (float): Duration of the drag operation in seconds 317 | """ 318 | # Get last cursor position for this app_list hash 319 | app_list_hash = hash(tuple(sorted(self._diorama.app_list))) 320 | last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) 321 | x, y = x or last_pos[0], y or last_pos[1] 322 | # Update cursor position for this app_list hash 323 | Diorama._cursor_positions[app_list_hash] = (x, y) 324 | 325 | sx, sy = await self.to_screen_coordinates(x, y) 326 | await self._send_cmd("drag_to", {"x": sx, "y": sy, "duration": duration}) 327 | 328 | async def get_cursor_position(self): 329 | """Get the current cursor position in screen coordinates. 330 | 331 | Returns: 332 | tuple: (x, y) coordinates of the cursor in screen space 333 | """ 334 | return await self._send_cmd("get_cursor_position") 335 | 336 | async def type_text(self, text): 337 | """Type the specified text using the keyboard. 338 | 339 | Args: 340 | text (str): The text to type 341 | """ 342 | await self._send_cmd("type_text", {"text": text}) 343 | 344 | async def press_key(self, key): 345 | """Press a single key on the keyboard. 346 | 347 | Args: 348 | key (str): The key to press 349 | """ 350 | await self._send_cmd("press_key", {"key": key}) 351 | 352 | async def hotkey(self, keys): 353 | """Press a combination of keys simultaneously. 354 | 355 | Args: 356 | keys (list): List of keys to press together 357 | """ 358 | await self._send_cmd("hotkey", {"keys": list(keys)}) 359 | 360 | async def scroll_up(self, clicks: int = 1): 361 | """Scroll up at the current cursor position. 
362 | 363 | Args: 364 | clicks (int): Number of scroll clicks to perform 365 | """ 366 | # Get last cursor position for this app_list hash 367 | app_list_hash = hash(tuple(sorted(self._diorama.app_list))) 368 | last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) 369 | x, y = last_pos[0], last_pos[1] 370 | 371 | await self._send_cmd("scroll_up", {"clicks": clicks, "x": x, "y": y}) 372 | 373 | async def scroll_down(self, clicks: int = 1): 374 | """Scroll down at the current cursor position. 375 | 376 | Args: 377 | clicks (int): Number of scroll clicks to perform 378 | """ 379 | # Get last cursor position for this app_list hash 380 | app_list_hash = hash(tuple(sorted(self._diorama.app_list))) 381 | last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) 382 | x, y = last_pos[0], last_pos[1] 383 | 384 | await self._send_cmd("scroll_down", {"clicks": clicks, "x": x, "y": y}) 385 | 386 | async def get_screen_size(self) -> dict[str, int]: 387 | """Get the size of the screenshot area. 388 | 389 | Returns: 390 | dict[str, int]: Dictionary with 'width' and 'height' keys 391 | """ 392 | if not self._scene_size: 393 | await self.screenshot() 394 | return { "width": self._scene_size[0], "height": self._scene_size[1] } 395 | 396 | async def to_screen_coordinates(self, x: float, y: float) -> tuple[float, float]: 397 | """Convert screenshot coordinates to screen coordinates. 398 | 399 | Args: 400 | x: X absolute coordinate in screenshot space 401 | y: Y absolute coordinate in screenshot space 402 | 403 | Returns: 404 | tuple[float, float]: (x, y) absolute coordinates in screen space 405 | """ 406 | if not self._scene_hitboxes: 407 | await self.screenshot() # get hitboxes 408 | # Try all hitboxes 409 | for h in self._scene_hitboxes[::-1]: 410 | rect_from = h.get("hitbox") 411 | rect_to = h.get("target") 412 | if not rect_from or len(rect_from) != 4: 413 | continue 414 | 415 | # check if (x, y) is inside rect_from 416 | x0, y0, x1, y1 = rect_from 417 | if x0 <= x <= x1 and y0 <= y <= y1: 418 | logger.info(f"Found hitbox: {h}") 419 | # remap (x, y) to rect_to 420 | tx0, ty0, tx1, ty1 = rect_to 421 | 422 | # calculate offset from x0, y0 423 | offset_x = x - x0 424 | offset_y = y - y0 425 | 426 | # remap offset to rect_to 427 | tx = tx0 + offset_x 428 | ty = ty0 + offset_y 429 | 430 | return tx, ty 431 | return x, y 432 | 433 | async def to_screenshot_coordinates(self, x: float, y: float) -> tuple[float, float]: 434 | """Convert screen coordinates to screenshot coordinates. 
435 | 436 | Args: 437 | x: X absolute coordinate in screen space 438 | y: Y absolute coordinate in screen space 439 | 440 | Returns: 441 | tuple[float, float]: (x, y) absolute coordinates in screenshot space 442 | """ 443 | if not self._scene_hitboxes: 444 | await self.screenshot() # get hitboxes 445 | # Try all hitboxes 446 | for h in self._scene_hitboxes[::-1]: 447 | rect_from = h.get("target") 448 | rect_to = h.get("hitbox") 449 | if not rect_from or len(rect_from) != 4: 450 | continue 451 | 452 | # check if (x, y) is inside rect_from 453 | x0, y0, x1, y1 = rect_from 454 | if x0 <= x <= x1 and y0 <= y <= y1: 455 | # remap (x, y) to rect_to 456 | tx0, ty0, tx1, ty1 = rect_to 457 | 458 | # calculate offset from x0, y0 459 | offset_x = x - x0 460 | offset_y = y - y0 461 | 462 | # remap offset to rect_to 463 | tx = tx0 + offset_x 464 | ty = ty0 + offset_y 465 | 466 | return tx, ty 467 | return x, y 468 | 469 | import pyautogui 470 | import time 471 | 472 | async def main(): 473 | """Main function demonstrating Diorama usage with multiple desktops and mouse tracking.""" 474 | desktop1 = Diorama.create_from_apps(["Discord", "Notes"]) 475 | desktop2 = Diorama.create_from_apps(["Terminal"]) 476 | 477 | img1 = await desktop1.interface.screenshot(as_bytes=False) 478 | img2 = await desktop2.interface.screenshot(as_bytes=False) 479 | 480 | img1.save("app_screenshots/desktop1.png") 481 | img2.save("app_screenshots/desktop2.png") 482 | # Initialize Diorama desktop 483 | desktop3 = Diorama.create_from_apps("Safari") 484 | screen_size = await desktop3.interface.get_screen_size() 485 | print(screen_size) 486 | 487 | # Take initial screenshot 488 | img = await desktop3.interface.screenshot(as_bytes=False) 489 | img.save("app_screenshots/desktop3.png") 490 | 491 | # Prepare hitboxes and draw on the single screenshot 492 | hitboxes = desktop3.interface._scene_hitboxes[::-1] 493 | base_img = img.copy() 494 | draw = ImageDraw.Draw(base_img) 495 | for h in hitboxes: 496 | rect = h.get("hitbox") 497 | if not rect or len(rect) != 4: 498 | continue 499 | draw.rectangle(rect, outline="red", width=2) 500 | 501 | # Track and draw mouse position in real time (single screenshot size) 502 | last_mouse_pos = None 503 | print("Tracking mouse... 
Press Ctrl+C to stop.") 504 | try: 505 | while True: 506 | mouse_x, mouse_y = pyautogui.position() 507 | if last_mouse_pos != (mouse_x, mouse_y): 508 | last_mouse_pos = (mouse_x, mouse_y) 509 | # Map to screenshot coordinates 510 | sx, sy = await desktop3.interface.to_screenshot_coordinates(mouse_x, mouse_y) 511 | # Draw on a copy of the screenshot 512 | frame = base_img.copy() 513 | frame_draw = ImageDraw.Draw(frame) 514 | frame_draw.ellipse((sx-5, sy-5, sx+5, sy+5), fill="blue", outline="blue") 515 | # Save the frame 516 | frame.save("app_screenshots/desktop3_mouse.png") 517 | print(f"Mouse at screen ({mouse_x}, {mouse_y}) -> screenshot ({sx:.1f}, {sy:.1f})") 518 | time.sleep(0.05) # Throttle updates to ~20 FPS 519 | except KeyboardInterrupt: 520 | print("Stopped tracking.") 521 | 522 | draw.text((rect[0], rect[1]), str(idx), fill="red") 523 | 524 | canvas.save("app_screenshots/desktop3_hitboxes.png") 525 | 526 | 527 | 528 | # move mouse in a square spiral around the screen 529 | import math 530 | import random 531 | 532 | step = 20 # pixels per move 533 | dot_radius = 10 534 | width = screen_size["width"] 535 | height = screen_size["height"] 536 | x, y = 0, 10 537 | 538 | while x < width and y < height: 539 | await desktop3.interface.move_cursor(x, y) 540 | img = await desktop3.interface.screenshot(as_bytes=False) 541 | draw = ImageDraw.Draw(img) 542 | draw.ellipse((x-dot_radius, y-dot_radius, x+dot_radius, y+dot_radius), fill="red") 543 | img.save("current.png") 544 | await asyncio.sleep(0.03) 545 | x += step 546 | y = math.sin(x / width * math.pi * 2) * 50 + 25 547 | 548 | if __name__ == "__main__": 549 | asyncio.run(main()) 550 | ``` -------------------------------------------------------------------------------- /libs/lume/src/Server/Server.swift: -------------------------------------------------------------------------------- ```swift 1 | import Darwin 2 | import Foundation 3 | import Network 4 | 5 | // MARK: - Error Types 6 | enum PortError: Error, LocalizedError { 7 | case alreadyInUse(port: UInt16) 8 | 9 | var errorDescription: String? 
{ 10 | switch self { 11 | case .alreadyInUse(let port): 12 | return "Port \(port) is already in use by another process" 13 | } 14 | } 15 | } 16 | 17 | // MARK: - Server Class 18 | @MainActor 19 | final class Server { 20 | 21 | // MARK: - Route Type 22 | private struct Route { 23 | let method: String 24 | let path: String 25 | let handler: (HTTPRequest) async throws -> HTTPResponse 26 | 27 | func matches(_ request: HTTPRequest) -> Bool { 28 | if method != request.method { return false } 29 | 30 | // Handle path parameters 31 | let routeParts = path.split(separator: "/") 32 | let requestParts = request.path.split(separator: "/") 33 | 34 | if routeParts.count != requestParts.count { return false } 35 | 36 | for (routePart, requestPart) in zip(routeParts, requestParts) { 37 | if routePart.hasPrefix(":") { continue } // Path parameter 38 | if routePart != requestPart { return false } 39 | } 40 | 41 | return true 42 | } 43 | 44 | func extractParams(_ request: HTTPRequest) -> [String: String] { 45 | var params: [String: String] = [:] 46 | let routeParts = path.split(separator: "/") 47 | 48 | // Split request path to remove query parameters 49 | let requestPathOnly = request.path.split(separator: "?", maxSplits: 1)[0] 50 | let requestParts = requestPathOnly.split(separator: "/") 51 | 52 | for (routePart, requestPart) in zip(routeParts, requestParts) { 53 | if routePart.hasPrefix(":") { 54 | let paramName = String(routePart.dropFirst()) 55 | params[paramName] = String(requestPart) 56 | } 57 | } 58 | 59 | return params 60 | } 61 | } 62 | 63 | // MARK: - Properties 64 | private let port: NWEndpoint.Port 65 | private let controller: LumeController 66 | private var isRunning = false 67 | private var listener: NWListener? 68 | private var routes: [Route] 69 | 70 | // MARK: - Initialization 71 | init(port: UInt16 = 7777) { 72 | self.port = NWEndpoint.Port(rawValue: port)! 
73 | self.controller = LumeController() 74 | self.routes = [] 75 | 76 | // Define API routes after self is fully initialized 77 | self.setupRoutes() 78 | } 79 | 80 | // MARK: - Route Setup 81 | private func setupRoutes() { 82 | routes = [ 83 | Route( 84 | method: "GET", path: "/lume/vms", 85 | handler: { [weak self] request in 86 | guard let self else { throw HTTPError.internalError } 87 | // Extract storage from query params if present 88 | let storage = self.extractQueryParam(request: request, name: "storage") 89 | return try await self.handleListVMs(storage: storage) 90 | }), 91 | Route( 92 | method: "GET", path: "/lume/vms/:name", 93 | handler: { [weak self] request in 94 | guard let self else { throw HTTPError.internalError } 95 | let params = Route( 96 | method: "GET", path: "/lume/vms/:name", 97 | handler: { _ in 98 | HTTPResponse(statusCode: .ok, body: "") 99 | } 100 | ).extractParams(request) 101 | guard let name = params["name"] else { 102 | return HTTPResponse(statusCode: .badRequest, body: "Missing VM name") 103 | } 104 | 105 | // Extract storage from query params if present 106 | let storage = self.extractQueryParam(request: request, name: "storage") 107 | 108 | return try await self.handleGetVM(name: name, storage: storage) 109 | }), 110 | Route( 111 | method: "DELETE", path: "/lume/vms/:name", 112 | handler: { [weak self] request in 113 | guard let self else { throw HTTPError.internalError } 114 | let params = Route( 115 | method: "DELETE", path: "/lume/vms/:name", 116 | handler: { _ in 117 | HTTPResponse(statusCode: .ok, body: "") 118 | } 119 | ).extractParams(request) 120 | guard let name = params["name"] else { 121 | return HTTPResponse(statusCode: .badRequest, body: "Missing VM name") 122 | } 123 | 124 | // Extract storage from query params if present 125 | let storage = self.extractQueryParam(request: request, name: "storage") 126 | 127 | return try await self.handleDeleteVM(name: name, storage: storage) 128 | }), 129 | Route( 130 | method: "POST", path: "/lume/vms", 131 | handler: { [weak self] request in 132 | guard let self else { throw HTTPError.internalError } 133 | return try await self.handleCreateVM(request.body) 134 | }), 135 | Route( 136 | method: "POST", path: "/lume/vms/clone", 137 | handler: { [weak self] request in 138 | guard let self else { throw HTTPError.internalError } 139 | return try await self.handleCloneVM(request.body) 140 | }), 141 | Route( 142 | method: "PATCH", path: "/lume/vms/:name", 143 | handler: { [weak self] request in 144 | guard let self else { throw HTTPError.internalError } 145 | let params = Route( 146 | method: "PATCH", path: "/lume/vms/:name", 147 | handler: { _ in 148 | HTTPResponse(statusCode: .ok, body: "") 149 | } 150 | ).extractParams(request) 151 | guard let name = params["name"] else { 152 | return HTTPResponse(statusCode: .badRequest, body: "Missing VM name") 153 | } 154 | return try await self.handleSetVM(name: name, body: request.body) 155 | }), 156 | Route( 157 | method: "POST", path: "/lume/vms/:name/run", 158 | handler: { [weak self] request in 159 | guard let self else { throw HTTPError.internalError } 160 | let params = Route( 161 | method: "POST", path: "/lume/vms/:name/run", 162 | handler: { _ in 163 | HTTPResponse(statusCode: .ok, body: "") 164 | } 165 | ).extractParams(request) 166 | guard let name = params["name"] else { 167 | return HTTPResponse(statusCode: .badRequest, body: "Missing VM name") 168 | } 169 | return try await self.handleRunVM(name: name, body: request.body) 170 | }), 171 | Route( 172 | method: 
"POST", path: "/lume/vms/:name/stop", 173 | handler: { [weak self] request in 174 | guard let self else { throw HTTPError.internalError } 175 | let params = Route( 176 | method: "POST", path: "/lume/vms/:name/stop", 177 | handler: { _ in 178 | HTTPResponse(statusCode: .ok, body: "") 179 | } 180 | ).extractParams(request) 181 | guard let name = params["name"] else { 182 | return HTTPResponse(statusCode: .badRequest, body: "Missing VM name") 183 | } 184 | 185 | Logger.info("Processing stop VM request", metadata: ["method": request.method, "path": request.path]) 186 | 187 | // Extract storage from the request body 188 | var storage: String? = nil 189 | if let bodyData = request.body, !bodyData.isEmpty { 190 | do { 191 | if let json = try JSONSerialization.jsonObject(with: bodyData) as? [String: Any], 192 | let bodyStorage = json["storage"] as? String { 193 | storage = bodyStorage 194 | Logger.info("Extracted storage from request body", metadata: ["storage": bodyStorage]) 195 | } 196 | } catch { 197 | Logger.error("Failed to parse request body JSON", metadata: ["error": error.localizedDescription]) 198 | } 199 | } 200 | 201 | return try await self.handleStopVM(name: name, storage: storage) 202 | }), 203 | Route( 204 | method: "GET", path: "/lume/ipsw", 205 | handler: { [weak self] _ in 206 | guard let self else { throw HTTPError.internalError } 207 | return try await self.handleIPSW() 208 | }), 209 | Route( 210 | method: "POST", path: "/lume/pull", 211 | handler: { [weak self] request in 212 | guard let self else { throw HTTPError.internalError } 213 | return try await self.handlePull(request.body) 214 | }), 215 | Route( 216 | method: "POST", path: "/lume/prune", 217 | handler: { [weak self] _ in 218 | guard let self else { throw HTTPError.internalError } 219 | return try await self.handlePruneImages() 220 | }), 221 | Route( 222 | method: "GET", path: "/lume/images", 223 | handler: { [weak self] request in 224 | guard let self else { throw HTTPError.internalError } 225 | return try await self.handleGetImages(request) 226 | }), 227 | // New config endpoint 228 | Route( 229 | method: "GET", path: "/lume/config", 230 | handler: { [weak self] _ in 231 | guard let self else { throw HTTPError.internalError } 232 | return try await self.handleGetConfig() 233 | }), 234 | Route( 235 | method: "POST", path: "/lume/config", 236 | handler: { [weak self] request in 237 | guard let self else { throw HTTPError.internalError } 238 | return try await self.handleUpdateConfig(request.body) 239 | }), 240 | Route( 241 | method: "GET", path: "/lume/config/locations", 242 | handler: { [weak self] _ in 243 | guard let self else { throw HTTPError.internalError } 244 | return try await self.handleGetLocations() 245 | }), 246 | Route( 247 | method: "POST", path: "/lume/config/locations", 248 | handler: { [weak self] request in 249 | guard let self else { throw HTTPError.internalError } 250 | return try await self.handleAddLocation(request.body) 251 | }), 252 | Route( 253 | method: "DELETE", path: "/lume/config/locations/:name", 254 | handler: { [weak self] request in 255 | guard let self else { throw HTTPError.internalError } 256 | let params = Route( 257 | method: "DELETE", path: "/lume/config/locations/:name", 258 | handler: { _ in 259 | HTTPResponse(statusCode: .ok, body: "") 260 | } 261 | ).extractParams(request) 262 | guard let name = params["name"] else { 263 | return HTTPResponse(statusCode: .badRequest, body: "Missing location name") 264 | } 265 | return try await self.handleRemoveLocation(name) 266 | }), 267 | 
268 | // Logs retrieval route 269 | Route( 270 | method: "GET", path: "/lume/logs", 271 | handler: { [weak self] request in 272 | guard let self else { throw HTTPError.internalError } 273 | 274 | // Extract query parameters 275 | let type = self.extractQueryParam(request: request, name: "type") // "info", "error", or "all" 276 | let linesParam = self.extractQueryParam(request: request, name: "lines") 277 | let lines = linesParam.flatMap { Int($0) } // Convert to Int if present 278 | 279 | return try await self.handleGetLogs(type: type, lines: lines) 280 | }), 281 | Route( 282 | method: "POST", path: "/lume/config/locations/default/:name", 283 | handler: { [weak self] request in 284 | guard let self else { throw HTTPError.internalError } 285 | let params = Route( 286 | method: "POST", path: "/lume/config/locations/default/:name", 287 | handler: { _ in 288 | HTTPResponse(statusCode: .ok, body: "") 289 | } 290 | ).extractParams(request) 291 | guard let name = params["name"] else { 292 | return HTTPResponse(statusCode: .badRequest, body: "Missing location name") 293 | } 294 | return try await self.handleSetDefaultLocation(name) 295 | }), 296 | Route( 297 | method: "POST", path: "/lume/vms/push", 298 | handler: { [weak self] request in 299 | guard let self else { throw HTTPError.internalError } 300 | return try await self.handlePush(request.body) 301 | }), 302 | ] 303 | } 304 | 305 | // Helper to extract query parameters from the URL 306 | private func extractQueryParam(request: HTTPRequest, name: String) -> String? { 307 | // Extract only the query part by splitting on '?' 308 | let parts = request.path.split(separator: "?", maxSplits: 1) 309 | guard parts.count > 1 else { return nil } // No query parameters 310 | 311 | let queryString = String(parts[1]) 312 | // Create a placeholder URL with the query string 313 | if let urlComponents = URLComponents(string: "http://placeholder.com?"+queryString), 314 | let queryItems = urlComponents.queryItems 315 | { 316 | return queryItems.first(where: { $0.name == name })?.value?.removingPercentEncoding 317 | } 318 | return nil 319 | } 320 | 321 | // MARK: - Port Utilities 322 | private func isPortAvailable(port: Int) async -> Bool { 323 | // Create a socket 324 | let socketFD = socket(AF_INET, SOCK_STREAM, 0) 325 | if socketFD == -1 { 326 | return false 327 | } 328 | 329 | // Set socket options to allow reuse 330 | var value: Int32 = 1 331 | if setsockopt( 332 | socketFD, SOL_SOCKET, SO_REUSEADDR, &value, socklen_t(MemoryLayout<Int32>.size)) == -1 333 | { 334 | close(socketFD) 335 | return false 336 | } 337 | 338 | // Set up the address structure 339 | var addr = sockaddr_in() 340 | addr.sin_family = sa_family_t(AF_INET) 341 | addr.sin_port = UInt16(port).bigEndian 342 | addr.sin_addr.s_addr = INADDR_ANY.bigEndian 343 | 344 | // Bind to the port 345 | let bindResult = withUnsafePointer(to: &addr) { addrPtr in 346 | addrPtr.withMemoryRebound(to: sockaddr.self, capacity: 1) { addrPtr in 347 | Darwin.bind(socketFD, addrPtr, socklen_t(MemoryLayout<sockaddr_in>.size)) 348 | } 349 | } 350 | 351 | // Clean up 352 | close(socketFD) 353 | 354 | // If bind failed, the port is in use 355 | return bindResult == 0 356 | } 357 | 358 | // MARK: - Server Lifecycle 359 | func start() async throws { 360 | // First check if the port is already in use 361 | if !(await isPortAvailable(port: Int(port.rawValue))) { 362 | // Don't log anything here, just throw the error 363 | throw PortError.alreadyInUse(port: port.rawValue) 364 | } 365 | 366 | let parameters = NWParameters.tcp 
367 | listener = try NWListener(using: parameters, on: port) 368 | 369 | // Create an actor to safely manage state transitions 370 | actor StartupState { 371 | var error: Error? 372 | var isComplete = false 373 | 374 | func setError(_ error: Error) { 375 | self.error = error 376 | self.isComplete = true 377 | } 378 | 379 | func setComplete() { 380 | self.isComplete = true 381 | } 382 | 383 | func checkStatus() -> (isComplete: Bool, error: Error?) { 384 | return (isComplete, error) 385 | } 386 | } 387 | 388 | let startupState = StartupState() 389 | 390 | // Set up a state update handler to detect port binding errors 391 | listener?.stateUpdateHandler = { state in 392 | Task { 393 | switch state { 394 | case .setup: 395 | // Initial state, no action needed 396 | Logger.info("Listener setup", metadata: ["port": "\(self.port.rawValue)"]) 397 | break 398 | case .waiting(let error): 399 | // Log the full error details to see what we're getting 400 | Logger.error( 401 | "Listener waiting", 402 | metadata: [ 403 | "error": error.localizedDescription, 404 | "debugDescription": error.debugDescription, 405 | "localizedDescription": error.localizedDescription, 406 | "port": "\(self.port.rawValue)", 407 | ]) 408 | 409 | // Check for different port in use error messages 410 | if error.debugDescription.contains("Address already in use") 411 | || error.localizedDescription.contains("in use") 412 | || error.localizedDescription.contains("address already in use") 413 | { 414 | Logger.error( 415 | "Port conflict detected", metadata: ["port": "\(self.port.rawValue)"]) 416 | await startupState.setError( 417 | PortError.alreadyInUse(port: self.port.rawValue)) 418 | } else { 419 | // Wait for a short period to see if the listener recovers 420 | // Some network errors are transient 421 | try? await Task.sleep(nanoseconds: 1_000_000_000) // 1 second 422 | 423 | // If we're still waiting after delay, consider it an error 424 | if case .waiting = await self.listener?.state { 425 | await startupState.setError(error) 426 | } 427 | } 428 | case .failed(let error): 429 | // Log the full error details 430 | Logger.error( 431 | "Listener failed", 432 | metadata: [ 433 | "error": error.localizedDescription, 434 | "debugDescription": error.debugDescription, 435 | "port": "\(self.port.rawValue)", 436 | ]) 437 | await startupState.setError(error) 438 | case .ready: 439 | // Listener successfully bound to port 440 | Logger.info("Listener ready", metadata: ["port": "\(self.port.rawValue)"]) 441 | await startupState.setComplete() 442 | case .cancelled: 443 | // Listener was cancelled 444 | Logger.info("Listener cancelled", metadata: ["port": "\(self.port.rawValue)"]) 445 | break 446 | @unknown default: 447 | Logger.info( 448 | "Unknown listener state", 449 | metadata: ["state": "\(state)", "port": "\(self.port.rawValue)"]) 450 | break 451 | } 452 | } 453 | } 454 | 455 | listener?.newConnectionHandler = { [weak self] connection in 456 | Task { @MainActor [weak self] in 457 | guard let self else { return } 458 | self.handleConnection(connection) 459 | } 460 | } 461 | 462 | listener?.start(queue: .main) 463 | 464 | // Wait for either successful startup or an error 465 | var status: (isComplete: Bool, error: Error?) 
= (false, nil) 466 | repeat { 467 | try await Task.sleep(nanoseconds: 100_000_000) // 100ms 468 | status = await startupState.checkStatus() 469 | } while !status.isComplete 470 | 471 | // If there was a startup error, throw it 472 | if let error = status.error { 473 | self.stop() 474 | throw error 475 | } 476 | 477 | isRunning = true 478 | 479 | Logger.info("Server started", metadata: ["port": "\(port.rawValue)"]) 480 | 481 | // Keep the server running 482 | while isRunning { 483 | try await Task.sleep(nanoseconds: 1_000_000_000) 484 | } 485 | } 486 | 487 | func stop() { 488 | isRunning = false 489 | listener?.cancel() 490 | } 491 | 492 | // MARK: - Connection Handling 493 | private func handleConnection(_ connection: NWConnection) { 494 | connection.stateUpdateHandler = { [weak self] state in 495 | switch state { 496 | case .ready: 497 | Task { @MainActor [weak self] in 498 | guard let self else { return } 499 | self.receiveData(connection) 500 | } 501 | case .failed(let error): 502 | Logger.error("Connection failed", metadata: ["error": error.localizedDescription]) 503 | connection.cancel() 504 | case .cancelled: 505 | // Connection is already cancelled, no need to cancel again 506 | break 507 | default: 508 | break 509 | } 510 | } 511 | connection.start(queue: .main) 512 | } 513 | 514 | private func receiveData(_ connection: NWConnection) { 515 | connection.receive(minimumIncompleteLength: 1, maximumLength: 65536) { 516 | [weak self] content, _, isComplete, error in 517 | if let error = error { 518 | Logger.error("Receive error", metadata: ["error": error.localizedDescription]) 519 | connection.cancel() 520 | return 521 | } 522 | 523 | guard let data = content, !data.isEmpty else { 524 | if isComplete { 525 | connection.cancel() 526 | } 527 | return 528 | } 529 | 530 | Task { @MainActor [weak self] in 531 | guard let self else { return } 532 | do { 533 | let response = try await self.handleRequest(data) 534 | self.send(response, on: connection) 535 | } catch { 536 | let errorResponse = self.errorResponse(error) 537 | self.send(errorResponse, on: connection) 538 | } 539 | } 540 | } 541 | } 542 | 543 | private func send(_ response: HTTPResponse, on connection: NWConnection) { 544 | let data = response.serialize() 545 | Logger.info( 546 | "Serialized response", metadata: ["data": String(data: data, encoding: .utf8) ?? ""]) 547 | connection.send( 548 | content: data, 549 | completion: .contentProcessed { [weak connection] error in 550 | if let error = error { 551 | Logger.error( 552 | "Failed to send response", metadata: ["error": error.localizedDescription]) 553 | } else { 554 | Logger.info("Response sent successfully") 555 | } 556 | if connection?.state != .cancelled { 557 | connection?.cancel() 558 | } 559 | }) 560 | } 561 | 562 | // MARK: - Request Handling 563 | private func handleRequest(_ data: Data) async throws -> HTTPResponse { 564 | Logger.info( 565 | "Received request data", metadata: ["data": String(data: data, encoding: .utf8) ?? ""]) 566 | 567 | guard let request = HTTPRequest(data: data) else { 568 | Logger.error("Failed to parse request") 569 | return HTTPResponse(statusCode: .badRequest, body: "Invalid request") 570 | } 571 | 572 | Logger.info( 573 | "Parsed request", 574 | metadata: [ 575 | "method": request.method, 576 | "path": request.path, 577 | "headers": "\(request.headers)", 578 | "body": String(data: request.body ?? Data(), encoding: .utf8) ?? 
"", 579 | ]) 580 | 581 | // Find matching route 582 | guard let route = routes.first(where: { $0.matches(request) }) else { 583 | return HTTPResponse(statusCode: .notFound, body: "Not found") 584 | } 585 | 586 | // Handle the request 587 | let response = try await route.handler(request) 588 | 589 | Logger.info( 590 | "Sending response", 591 | metadata: [ 592 | "statusCode": "\(response.statusCode.rawValue)", 593 | "headers": "\(response.headers)", 594 | "body": String(data: response.body ?? Data(), encoding: .utf8) ?? "", 595 | ]) 596 | 597 | return response 598 | } 599 | 600 | private func errorResponse(_ error: Error) -> HTTPResponse { 601 | HTTPResponse( 602 | statusCode: .internalServerError, 603 | headers: ["Content-Type": "application/json"], 604 | body: try! JSONEncoder().encode(APIError(message: error.localizedDescription)) 605 | ) 606 | } 607 | } 608 | ```