This is page 13 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── 
windows-sandbox.md ├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx 
│ │ │ │ ├── installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── 
som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ │ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift 
│ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py 
│ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ ├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ 
│ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ 
├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py │ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── 
pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /examples/som_examples.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 2 | """ 3 | Example script demonstrating the usage of OmniParser's UI element detection functionality. 4 | This script shows how to: 5 | 1. Initialize the OmniParser 6 | 2. Load and process images 7 | 3. Visualize detection results 8 | 4. 
Compare performance between CPU and MPS (Apple Silicon) 9 | """ 10 | 11 | import argparse 12 | import logging 13 | import sys 14 | from pathlib import Path 15 | import time 16 | from PIL import Image 17 | from typing import Dict, Any, List, Optional 18 | import numpy as np 19 | import io 20 | import base64 21 | import glob 22 | import os 23 | 24 | # Load environment variables from .env file 25 | project_root = Path(__file__).parent.parent 26 | env_file = project_root / ".env" 27 | print(f"Loading environment from: {env_file}") 28 | from dotenv import load_dotenv 29 | 30 | load_dotenv(env_file) 31 | 32 | # Add paths to sys.path if needed 33 | pythonpath = os.environ.get("PYTHONPATH", "") 34 | for path in pythonpath.split(":"): 35 | if path and path not in sys.path: 36 | sys.path.append(path) 37 | print(f"Added to sys.path: {path}") 38 | 39 | # Add the libs directory to the path to find som 40 | libs_path = project_root / "libs" 41 | if str(libs_path) not in sys.path: 42 | sys.path.append(str(libs_path)) 43 | print(f"Added to sys.path: {libs_path}") 44 | 45 | from som import OmniParser, ParseResult, IconElement, TextElement 46 | from som.models import UIElement, ParserMetadata, BoundingBox 47 | 48 | # Configure logging 49 | logging.basicConfig( 50 | level=logging.INFO, 51 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 52 | datefmt="%Y-%m-%d %H:%M:%S", 53 | ) 54 | logger = logging.getLogger(__name__) 55 | 56 | 57 | def setup_logging(): 58 | """Configure logging with a nice format.""" 59 | logging.basicConfig( 60 | level=logging.INFO, 61 | format="%(asctime)s - %(levelname)s - %(message)s", 62 | datefmt="%Y-%m-%d %H:%M:%S", 63 | ) 64 | 65 | 66 | class Timer: 67 | """Enhanced context manager for timing code blocks.""" 68 | 69 | def __init__(self, name: str, logger): 70 | self.name = name 71 | self.logger = logger 72 | self.start_time: float = 0.0 73 | self.elapsed_time: float = 0.0 74 | 75 | def __enter__(self): 76 | self.start_time = time.time() 77 | 
return self 78 | 79 | def __exit__(self, *args): 80 | self.elapsed_time = time.time() - self.start_time 81 | self.logger.info(f"{self.name}: {self.elapsed_time:.3f}s") 82 | return False 83 | 84 | 85 | def image_to_bytes(image: Image.Image) -> bytes: 86 | """Convert PIL Image to PNG bytes.""" 87 | buf = io.BytesIO() 88 | image.save(buf, format="PNG") 89 | return buf.getvalue() 90 | 91 | 92 | def process_image( 93 | parser: OmniParser, image_path: str, output_dir: Path, use_ocr: bool = False 94 | ) -> None: 95 | """Process a single image and save the result.""" 96 | try: 97 | # Load image 98 | logger.info(f"Processing image: {image_path}") 99 | image = Image.open(image_path).convert("RGB") 100 | logger.info(f"Image loaded successfully, size: {image.size}") 101 | 102 | # Create output filename 103 | input_filename = Path(image_path).stem 104 | output_path = output_dir / f"{input_filename}_analyzed.png" 105 | 106 | # Convert image to PNG bytes 107 | image_bytes = image_to_bytes(image) 108 | 109 | # Process image 110 | with Timer(f"Processing {input_filename}", logger): 111 | result = parser.parse(image_bytes, use_ocr=use_ocr) 112 | logger.info( 113 | f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements" 114 | ) 115 | 116 | # Save the annotated image 117 | logger.info(f"Saving annotated image to: {output_path}") 118 | try: 119 | # Save image from base64 120 | img_data = base64.b64decode(result.annotated_image_base64) 121 | img = Image.open(io.BytesIO(img_data)) 122 | img.save(output_path) 123 | 124 | # Print detailed results 125 | logger.info("\nDetected Elements:") 126 | for elem in result.elements: 127 | if isinstance(elem, IconElement): 128 | logger.info( 129 | f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}" 130 | ) 131 | elif isinstance(elem, TextElement): 132 | logger.info( 133 | f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}" 134 | ) 135 | 136 | # Verify file exists 
and log size 137 | if output_path.exists(): 138 | logger.info( 139 | f"Successfully saved image. File size: {output_path.stat().st_size} bytes" 140 | ) 141 | else: 142 | logger.error(f"Failed to verify file at {output_path}") 143 | except Exception as e: 144 | logger.error(f"Error saving image: {str(e)}", exc_info=True) 145 | 146 | except Exception as e: 147 | logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True) 148 | 149 | 150 | def run_detection_benchmark( 151 | input_path: str, 152 | output_dir: Path, 153 | use_ocr: bool = False, 154 | box_threshold: float = 0.01, 155 | iou_threshold: float = 0.1, 156 | ): 157 | """Run detection benchmark on images.""" 158 | logger.info( 159 | f"Starting benchmark with OCR enabled: {use_ocr}, box_threshold: {box_threshold}, iou_threshold: {iou_threshold}" 160 | ) 161 | 162 | try: 163 | # Initialize parser 164 | logger.info("Initializing OmniParser...") 165 | parser = OmniParser() 166 | 167 | # Create output directory 168 | output_dir.mkdir(parents=True, exist_ok=True) 169 | logger.info(f"Output directory created at: {output_dir}") 170 | 171 | # Get list of PNG files 172 | if os.path.isdir(input_path): 173 | image_files = glob.glob(os.path.join(input_path, "*.png")) 174 | else: 175 | image_files = [input_path] 176 | 177 | logger.info(f"Found {len(image_files)} images to process") 178 | 179 | # Process each image with specified thresholds 180 | for image_path in image_files: 181 | try: 182 | # Load image 183 | logger.info(f"Processing image: {image_path}") 184 | image = Image.open(image_path).convert("RGB") 185 | logger.info(f"Image loaded successfully, size: {image.size}") 186 | 187 | # Create output filename 188 | input_filename = Path(image_path).stem 189 | output_path = output_dir / f"{input_filename}_analyzed.png" 190 | 191 | # Convert image to PNG bytes 192 | image_bytes = image_to_bytes(image) 193 | 194 | # Process image with specified thresholds 195 | with Timer(f"Processing {input_filename}", 
logger): 196 | result = parser.parse( 197 | image_bytes, 198 | use_ocr=use_ocr, 199 | box_threshold=box_threshold, 200 | iou_threshold=iou_threshold, 201 | ) 202 | logger.info( 203 | f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements" 204 | ) 205 | 206 | # Save the annotated image 207 | logger.info(f"Saving annotated image to: {output_path}") 208 | try: 209 | # Save image from base64 210 | img_data = base64.b64decode(result.annotated_image_base64) 211 | img = Image.open(io.BytesIO(img_data)) 212 | img.save(output_path) 213 | 214 | # Print detailed results 215 | logger.info("\nDetected Elements:") 216 | for elem in result.elements: 217 | if isinstance(elem, IconElement): 218 | logger.info( 219 | f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}" 220 | ) 221 | elif isinstance(elem, TextElement): 222 | logger.info( 223 | f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}" 224 | ) 225 | 226 | # Verify file exists and log size 227 | if output_path.exists(): 228 | logger.info( 229 | f"Successfully saved image. 
File size: {output_path.stat().st_size} bytes" 230 | ) 231 | else: 232 | logger.error(f"Failed to verify file at {output_path}") 233 | except Exception as e: 234 | logger.error(f"Error saving image: {str(e)}", exc_info=True) 235 | 236 | except Exception as e: 237 | logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True) 238 | 239 | except Exception as e: 240 | logger.error(f"Benchmark failed: {str(e)}", exc_info=True) 241 | raise 242 | 243 | 244 | def run_experiments(input_path: str, output_dir: Path, use_ocr: bool = False): 245 | """Run experiments with different threshold combinations.""" 246 | # Define threshold values to test 247 | box_thresholds = [0.01, 0.05, 0.1, 0.3] 248 | iou_thresholds = [0.05, 0.1, 0.2, 0.5] 249 | 250 | logger.info("Starting threshold experiments...") 251 | logger.info("Box thresholds to test: %s", box_thresholds) 252 | logger.info("IOU thresholds to test: %s", iou_thresholds) 253 | 254 | # Create results directory for this experiment 255 | timestamp = time.strftime("%Y%m%d-%H%M%S") 256 | ocr_suffix = "_ocr" if use_ocr else "_no_ocr" 257 | exp_dir = output_dir / f"experiment_{timestamp}{ocr_suffix}" 258 | exp_dir.mkdir(parents=True, exist_ok=True) 259 | 260 | # Create a summary file 261 | summary_file = exp_dir / "results_summary.txt" 262 | with open(summary_file, "w") as f: 263 | f.write("Threshold Experiments Results\n") 264 | f.write("==========================\n\n") 265 | f.write(f"Input: {input_path}\n") 266 | f.write(f"OCR Enabled: {use_ocr}\n") 267 | f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n") 268 | f.write("Results:\n") 269 | f.write("-" * 80 + "\n") 270 | f.write( 271 | f"{'Box Thresh':^10} | {'IOU Thresh':^10} | {'Num Icons':^10} | {'Num Text':^10} | {'Time (s)':^10}\n" 272 | ) 273 | f.write("-" * 80 + "\n") 274 | 275 | # Initialize parser once for all experiments 276 | parser = OmniParser() 277 | 278 | # Run experiments with each combination 279 | for box_thresh in box_thresholds: 280 | for 
iou_thresh in iou_thresholds: 281 | logger.info(f"\nTesting box_threshold={box_thresh}, iou_threshold={iou_thresh}") 282 | 283 | # Create directory for this combination 284 | combo_dir = exp_dir / f"box_{box_thresh}_iou_{iou_thresh}" 285 | combo_dir.mkdir(exist_ok=True) 286 | 287 | try: 288 | # Process each image 289 | if os.path.isdir(input_path): 290 | image_files = glob.glob(os.path.join(input_path, "*.png")) 291 | else: 292 | image_files = [input_path] 293 | 294 | total_icons = 0 295 | total_text = 0 296 | total_time = 0 297 | 298 | for image_path in image_files: 299 | # Load and process image 300 | image = Image.open(image_path).convert("RGB") 301 | image_bytes = image_to_bytes(image) 302 | 303 | # Process with current thresholds 304 | with Timer(f"Processing {Path(image_path).stem}", logger) as t: 305 | result = parser.parse( 306 | image_bytes, 307 | use_ocr=use_ocr, 308 | box_threshold=box_thresh, 309 | iou_threshold=iou_thresh, 310 | ) 311 | 312 | # Save annotated image 313 | output_path = combo_dir / f"{Path(image_path).stem}_analyzed.png" 314 | img_data = base64.b64decode(result.annotated_image_base64) 315 | img = Image.open(io.BytesIO(img_data)) 316 | img.save(output_path) 317 | 318 | # Update totals 319 | total_icons += result.metadata.num_icons 320 | total_text += result.metadata.num_text 321 | 322 | # Log detailed results 323 | detail_file = combo_dir / f"{Path(image_path).stem}_details.txt" 324 | with open(detail_file, "w") as detail_f: 325 | detail_f.write(f"Results for {Path(image_path).name}\n") 326 | detail_f.write("-" * 40 + "\n") 327 | detail_f.write(f"Number of icons: {result.metadata.num_icons}\n") 328 | detail_f.write( 329 | f"Number of text elements: {result.metadata.num_text}\n\n" 330 | ) 331 | 332 | detail_f.write("Icon Detections:\n") 333 | icon_count = 1 334 | text_count = ( 335 | result.metadata.num_icons + 1 336 | ) # Text boxes start after icons 337 | 338 | # First list all icons 339 | for elem in result.elements: 340 | if 
isinstance(elem, IconElement): 341 | detail_f.write(f"Box #{icon_count}: Icon\n") 342 | detail_f.write(f" - Confidence: {elem.confidence:.3f}\n") 343 | detail_f.write( 344 | f" - Coordinates: {elem.bbox.coordinates}\n" 345 | ) 346 | icon_count += 1 347 | 348 | if use_ocr: 349 | detail_f.write("\nText Detections:\n") 350 | for elem in result.elements: 351 | if isinstance(elem, TextElement): 352 | detail_f.write(f"Box #{text_count}: Text\n") 353 | detail_f.write(f" - Content: '{elem.content}'\n") 354 | detail_f.write( 355 | f" - Confidence: {elem.confidence:.3f}\n" 356 | ) 357 | detail_f.write( 358 | f" - Coordinates: {elem.bbox.coordinates}\n" 359 | ) 360 | text_count += 1 361 | 362 | # Update timing totals 363 | total_time += t.elapsed_time 364 | 365 | # Write summary for this combination 366 | avg_time = total_time / len(image_files) 367 | f.write( 368 | f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {total_icons:^10d} | {total_text:^10d} | {avg_time:^10.3f}\n" 369 | ) 370 | 371 | except Exception as e: 372 | logger.error( 373 | f"Error in experiment box={box_thresh}, iou={iou_thresh}: {str(e)}" 374 | ) 375 | f.write( 376 | f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {'ERROR':^10s} | {'ERROR':^10s} | {'ERROR':^10s}\n" 377 | ) 378 | 379 | # Write summary footer 380 | f.write("-" * 80 + "\n") 381 | f.write("\nExperiment completed successfully!\n") 382 | 383 | logger.info(f"\nExperiment results saved to {exp_dir}") 384 | logger.info(f"Summary file: {summary_file}") 385 | 386 | 387 | def main(): 388 | """Main entry point.""" 389 | parser = argparse.ArgumentParser(description="Run OmniParser benchmark") 390 | parser.add_argument("input_path", help="Path to input image or directory containing images") 391 | parser.add_argument( 392 | "--output-dir", default="examples/output", help="Output directory for annotated images" 393 | ) 394 | parser.add_argument( 395 | "--ocr", 396 | choices=["none", "easyocr"], 397 | default="none", 398 | help="OCR engine to use (default: 
none)", 399 | ) 400 | parser.add_argument( 401 | "--mode", 402 | choices=["single", "experiment"], 403 | default="single", 404 | help="Run mode: single run or threshold experiments (default: single)", 405 | ) 406 | parser.add_argument( 407 | "--box-threshold", 408 | type=float, 409 | default=0.01, 410 | help="Confidence threshold for detection (default: 0.01)", 411 | ) 412 | parser.add_argument( 413 | "--iou-threshold", 414 | type=float, 415 | default=0.1, 416 | help="IOU threshold for Non-Maximum Suppression (default: 0.1)", 417 | ) 418 | args = parser.parse_args() 419 | 420 | logger.info(f"Starting OmniParser with arguments: {args}") 421 | use_ocr = args.ocr != "none" 422 | output_dir = Path(args.output_dir) 423 | 424 | try: 425 | if args.mode == "experiment": 426 | run_experiments(args.input_path, output_dir, use_ocr) 427 | else: 428 | run_detection_benchmark( 429 | args.input_path, output_dir, use_ocr, args.box_threshold, args.iou_threshold 430 | ) 431 | except Exception as e: 432 | logger.error(f"Process failed: {str(e)}", exc_info=True) 433 | return 1 434 | 435 | return 0 436 | 437 | 438 | if __name__ == "__main__": 439 | sys.exit(main()) 440 | ``` -------------------------------------------------------------------------------- /libs/python/som/som/detect.py: -------------------------------------------------------------------------------- ```python 1 | from pathlib import Path 2 | from typing import Union, List, Dict, Any, Tuple, Optional, cast 3 | import logging 4 | import torch 5 | import torchvision.ops 6 | import cv2 7 | import numpy as np 8 | import time 9 | import torchvision.transforms as T 10 | from PIL import Image 11 | import io 12 | import base64 13 | import argparse 14 | import signal 15 | from contextlib import contextmanager 16 | 17 | from ultralytics import YOLO 18 | from huggingface_hub import hf_hub_download 19 | import supervision as sv 20 | from supervision.detection.core import Detections 21 | 22 | from .detection import DetectionProcessor 
23 | from .ocr import OCRProcessor 24 | from .visualization import BoxAnnotator 25 | from .models import BoundingBox, UIElement, IconElement, TextElement, ParserMetadata, ParseResult 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class TimeoutException(Exception): 31 | pass 32 | 33 | 34 | @contextmanager 35 | def timeout(seconds: int): 36 | def timeout_handler(signum, frame): 37 | raise TimeoutException("OCR process timed out") 38 | 39 | # Register the signal handler 40 | original_handler = signal.signal(signal.SIGALRM, timeout_handler) 41 | signal.alarm(seconds) 42 | 43 | try: 44 | yield 45 | finally: 46 | signal.alarm(0) 47 | signal.signal(signal.SIGALRM, original_handler) 48 | 49 | 50 | def process_text_box(box, image): 51 | """Process a single text box with OCR.""" 52 | try: 53 | import easyocr 54 | from typing import List, Tuple, Any, Sequence 55 | 56 | x1 = int(min(point[0] for point in box)) 57 | y1 = int(min(point[1] for point in box)) 58 | x2 = int(max(point[0] for point in box)) 59 | y2 = int(max(point[1] for point in box)) 60 | 61 | # Add padding 62 | pad = 2 63 | x1 = max(0, x1 - pad) 64 | y1 = max(0, y1 - pad) 65 | x2 = min(image.shape[1], x2 + pad) 66 | y2 = min(image.shape[0], y2 + pad) 67 | 68 | region = image[y1:y2, x1:x2] 69 | if region.size > 0: 70 | reader = easyocr.Reader(["en"]) 71 | results = reader.readtext(region) 72 | if results and len(results) > 0: 73 | # EasyOCR returns a list of tuples (bbox, text, confidence) 74 | first_result = results[0] 75 | if isinstance(first_result, (list, tuple)) and len(first_result) >= 3: 76 | text = str(first_result[1]) 77 | confidence = float(first_result[2]) 78 | if confidence > 0.5: 79 | return text, [x1, y1, x2, y2], confidence 80 | except Exception: 81 | pass 82 | return None 83 | 84 | 85 | def check_ocr_box(image_path: Union[str, Path]) -> Tuple[List[str], List[List[float]]]: 86 | """Check OCR box using EasyOCR.""" 87 | # Read image once 88 | if isinstance(image_path, str): 89 | 
image_path = Path(image_path) 90 | 91 | # Read image into memory 92 | image_cv = cv2.imread(str(image_path)) 93 | if image_cv is None: 94 | logger.error(f"Failed to read image: {image_path}") 95 | return [], [] 96 | 97 | # Get image dimensions 98 | img_height, img_width = image_cv.shape[:2] 99 | confidence_threshold = 0.5 100 | 101 | # Use EasyOCR 102 | import ssl 103 | import easyocr 104 | 105 | # Create unverified SSL context for development 106 | ssl._create_default_https_context = ssl._create_unverified_context 107 | try: 108 | reader = easyocr.Reader(["en"]) 109 | with timeout(5): # 5 second timeout for EasyOCR 110 | results = reader.readtext(image_cv, paragraph=False, text_threshold=0.5) 111 | except TimeoutException: 112 | logger.warning("EasyOCR timed out, returning no results") 113 | return [], [] 114 | except Exception as e: 115 | logger.warning(f"EasyOCR failed: {str(e)}") 116 | return [], [] 117 | finally: 118 | # Restore default SSL context 119 | ssl._create_default_https_context = ssl.create_default_context 120 | 121 | texts = [] 122 | boxes = [] 123 | 124 | for box, text, conf in results: 125 | # Convert box format to [x1, y1, x2, y2] 126 | x1 = min(point[0] for point in box) 127 | y1 = min(point[1] for point in box) 128 | x2 = max(point[0] for point in box) 129 | y2 = max(point[1] for point in box) 130 | 131 | if float(conf) > 0.5: # Only keep higher confidence detections 132 | texts.append(text) 133 | boxes.append([x1, y1, x2, y2]) 134 | 135 | return texts, boxes 136 | 137 | 138 | class OmniParser: 139 | """Enhanced UI parser using computer vision and OCR for detecting interactive elements.""" 140 | 141 | def __init__( 142 | self, 143 | model_path: Optional[Union[str, Path]] = None, 144 | cache_dir: Optional[Union[str, Path]] = None, 145 | force_device: Optional[str] = None, 146 | ): 147 | """Initialize the OmniParser. 
148 | 149 | Args: 150 | model_path: Optional path to the YOLO model 151 | cache_dir: Optional directory to cache model files 152 | force_device: Force specific device (cpu/cuda/mps) 153 | """ 154 | self.detector = DetectionProcessor( 155 | model_path=Path(model_path) if model_path else None, 156 | cache_dir=Path(cache_dir) if cache_dir else None, 157 | force_device=force_device, 158 | ) 159 | self.ocr = OCRProcessor() 160 | self.visualizer = BoxAnnotator() 161 | 162 | def process_image( 163 | self, 164 | image: Image.Image, 165 | box_threshold: float = 0.3, 166 | iou_threshold: float = 0.1, 167 | use_ocr: bool = True, 168 | ) -> Tuple[Image.Image, List[UIElement]]: 169 | """Process an image to detect UI elements and optionally text. 170 | 171 | Args: 172 | image: Input PIL Image 173 | box_threshold: Confidence threshold for detection 174 | iou_threshold: IOU threshold for NMS 175 | use_ocr: Whether to enable OCR processing 176 | 177 | Returns: 178 | Tuple of (annotated image, list of detections) 179 | """ 180 | try: 181 | logger.info("Starting UI element detection...") 182 | 183 | # Detect icons 184 | icon_detections = self.detector.detect_icons( 185 | image=image, box_threshold=box_threshold, iou_threshold=iou_threshold 186 | ) 187 | logger.info(f"Found {len(icon_detections)} interactive elements") 188 | 189 | # Convert icon detections to typed objects 190 | elements: List[UIElement] = cast( 191 | List[UIElement], 192 | [ 193 | IconElement( 194 | id=i + 1, 195 | bbox=BoundingBox( 196 | x1=det["bbox"][0], 197 | y1=det["bbox"][1], 198 | x2=det["bbox"][2], 199 | y2=det["bbox"][3], 200 | ), 201 | confidence=det["confidence"], 202 | scale=det.get("scale"), 203 | ) 204 | for i, det in enumerate(icon_detections) 205 | ], 206 | ) 207 | 208 | # Run OCR if enabled 209 | if use_ocr: 210 | logger.info("Running OCR detection...") 211 | text_detections = self.ocr.detect_text(image=image, confidence_threshold=0.5) 212 | if text_detections is None: 213 | text_detections = [] 214 
| logger.info(f"Found {len(text_detections)} text regions") 215 | 216 | # Convert text detections to typed objects 217 | text_elements = cast( 218 | List[UIElement], 219 | [ 220 | TextElement( 221 | id=len(elements) + i + 1, 222 | bbox=BoundingBox( 223 | x1=det["bbox"][0], 224 | y1=det["bbox"][1], 225 | x2=det["bbox"][2], 226 | y2=det["bbox"][3], 227 | ), 228 | content=det["content"], 229 | confidence=det["confidence"], 230 | ) 231 | for i, det in enumerate(text_detections) 232 | ], 233 | ) 234 | 235 | if elements and text_elements: 236 | # Filter out non-OCR elements that have OCR elements with center points colliding with them 237 | filtered_elements = [] 238 | for elem in elements: # elements at this point contains only non-OCR elements 239 | should_keep = True 240 | for text_elem in text_elements: 241 | # Calculate center point of the text element 242 | center_x = (text_elem.bbox.x1 + text_elem.bbox.x2) / 2 243 | center_y = (text_elem.bbox.y1 + text_elem.bbox.y2) / 2 244 | 245 | # Check if this center point is inside the non-OCR element 246 | if (center_x >= elem.bbox.x1 and center_x <= elem.bbox.x2 and 247 | center_y >= elem.bbox.y1 and center_y <= elem.bbox.y2): 248 | should_keep = False 249 | break 250 | 251 | if should_keep: 252 | filtered_elements.append(elem) 253 | elements = filtered_elements 254 | 255 | # Merge detections using NMS 256 | all_elements = elements + text_elements 257 | boxes = torch.tensor([elem.bbox.coordinates for elem in all_elements]) 258 | scores = torch.tensor([elem.confidence for elem in all_elements]) 259 | keep_indices = torchvision.ops.nms(boxes, scores, iou_threshold) 260 | elements = [all_elements[i] for i in keep_indices] 261 | else: 262 | # Just add text elements to the list if IOU doesn't need to be applied 263 | elements.extend(text_elements) 264 | 265 | # Calculate drawing parameters based on image size 266 | box_overlay_ratio = max(image.size) / 3200 267 | draw_config = { 268 | "font_size": int(12 * box_overlay_ratio), 
269 | "box_thickness": max(int(2 * box_overlay_ratio), 1), 270 | "text_padding": max(int(3 * box_overlay_ratio), 1), 271 | } 272 | 273 | # Convert elements back to dict format for visualization 274 | detection_dicts = [ 275 | { 276 | "type": elem.type, 277 | "bbox": elem.bbox.coordinates, 278 | "confidence": elem.confidence, 279 | "content": elem.content if isinstance(elem, TextElement) else None, 280 | } 281 | for elem in elements 282 | ] 283 | 284 | # Create visualization 285 | logger.info("Creating visualization...") 286 | annotated_image = self.visualizer.draw_boxes( 287 | image=image.copy(), detections=detection_dicts, draw_config=draw_config 288 | ) 289 | logger.info("Visualization complete") 290 | 291 | return annotated_image, elements 292 | 293 | except Exception as e: 294 | logger.error(f"Error in process_image: {str(e)}") 295 | import traceback 296 | 297 | logger.error(traceback.format_exc()) 298 | raise 299 | 300 | def parse( 301 | self, 302 | screenshot_data: Union[bytes, str], 303 | box_threshold: float = 0.3, 304 | iou_threshold: float = 0.1, 305 | use_ocr: bool = True, 306 | ) -> ParseResult: 307 | """Parse a UI screenshot to detect interactive elements and text. 
308 | 309 | Args: 310 | screenshot_data: Raw bytes or base64 string of the screenshot 311 | box_threshold: Confidence threshold for detection 312 | iou_threshold: IOU threshold for NMS 313 | use_ocr: Whether to enable OCR processing 314 | 315 | Returns: 316 | ParseResult object containing elements, annotated image, and metadata 317 | """ 318 | try: 319 | start_time = time.time() 320 | 321 | # Convert input to PIL Image 322 | if isinstance(screenshot_data, str): 323 | screenshot_data = base64.b64decode(screenshot_data) 324 | image = Image.open(io.BytesIO(screenshot_data)).convert("RGB") 325 | 326 | # Process image 327 | annotated_image, elements = self.process_image( 328 | image=image, 329 | box_threshold=box_threshold, 330 | iou_threshold=iou_threshold, 331 | use_ocr=use_ocr, 332 | ) 333 | 334 | # Convert annotated image to base64 335 | buffered = io.BytesIO() 336 | annotated_image.save(buffered, format="PNG") 337 | annotated_image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") 338 | 339 | # Generate screen info text 340 | screen_info = [] 341 | parsed_content_list = [] 342 | 343 | # Set element IDs and generate human-readable descriptions 344 | for i, elem in enumerate(elements): 345 | # Set the ID (1-indexed) 346 | elem.id = i + 1 347 | 348 | if isinstance(elem, IconElement): 349 | screen_info.append( 350 | f"Box #{i+1}: Icon (confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates})" 351 | ) 352 | parsed_content_list.append( 353 | { 354 | "id": i + 1, 355 | "type": "icon", 356 | "bbox": elem.bbox.coordinates, 357 | "confidence": elem.confidence, 358 | "content": None, 359 | } 360 | ) 361 | elif isinstance(elem, TextElement): 362 | screen_info.append( 363 | f"Box #{i+1}: Text '{elem.content}' (confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates})" 364 | ) 365 | parsed_content_list.append( 366 | { 367 | "id": i + 1, 368 | "type": "text", 369 | "bbox": elem.bbox.coordinates, 370 | "confidence": elem.confidence, 371 | "content": 
elem.content, 372 | } 373 | ) 374 | 375 | # Calculate metadata 376 | latency = time.time() - start_time 377 | width, height = image.size 378 | 379 | # Create ParseResult object with enhanced properties 380 | result = ParseResult( 381 | elements=elements, 382 | annotated_image_base64=annotated_image_base64, 383 | screen_info=screen_info, 384 | parsed_content_list=parsed_content_list, 385 | metadata=ParserMetadata( 386 | image_size=(width, height), 387 | num_icons=len([e for e in elements if isinstance(e, IconElement)]), 388 | num_text=len([e for e in elements if isinstance(e, TextElement)]), 389 | device=self.detector.device, 390 | ocr_enabled=use_ocr, 391 | latency=latency, 392 | ), 393 | ) 394 | 395 | # Return the ParseResult object directly 396 | return result 397 | 398 | except Exception as e: 399 | logger.error(f"Error in parse: {str(e)}") 400 | import traceback 401 | 402 | logger.error(traceback.format_exc()) 403 | raise 404 | 405 | 406 | def main(): 407 | """Command line interface for UI element detection.""" 408 | parser = argparse.ArgumentParser(description="Detect UI elements and text in images") 409 | parser.add_argument("image_path", help="Path to the input image") 410 | parser.add_argument("--model-path", help="Path to YOLO model") 411 | parser.add_argument( 412 | "--box-threshold", type=float, default=0.3, help="Box confidence threshold (default: 0.3)" 413 | ) 414 | parser.add_argument( 415 | "--iou-threshold", type=float, default=0.1, help="IOU threshold (default: 0.1)" 416 | ) 417 | parser.add_argument( 418 | "--ocr", action="store_true", default=True, help="Enable OCR processing (default: True)" 419 | ) 420 | parser.add_argument("--output", help="Output path for annotated image") 421 | args = parser.parse_args() 422 | 423 | # Setup logging 424 | logging.basicConfig(level=logging.INFO) 425 | 426 | try: 427 | # Initialize parser 428 | parser = OmniParser(model_path=args.model_path) 429 | 430 | # Load and process image 431 | logger.info(f"Loading image 
from: {args.image_path}") 432 | image = Image.open(args.image_path).convert("RGB") 433 | logger.info(f"Image loaded successfully, size: {image.size}") 434 | 435 | # Process image 436 | annotated_image, elements = parser.process_image( 437 | image=image, 438 | box_threshold=args.box_threshold, 439 | iou_threshold=args.iou_threshold, 440 | use_ocr=args.ocr, 441 | ) 442 | 443 | # Save output image 444 | output_path = args.output or str( 445 | Path(args.image_path).parent 446 | / f"{Path(args.image_path).stem}_analyzed{Path(args.image_path).suffix}" 447 | ) 448 | logger.info(f"Saving annotated image to: {output_path}") 449 | 450 | Path(output_path).parent.mkdir(parents=True, exist_ok=True) 451 | annotated_image.save(output_path) 452 | logger.info(f"Image saved successfully to {output_path}") 453 | 454 | # Print detections 455 | logger.info("\nDetections:") 456 | for i, elem in enumerate(elements): 457 | if isinstance(elem, IconElement): 458 | logger.info( 459 | f"Interactive element {i}: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}" 460 | ) 461 | elif isinstance(elem, TextElement): 462 | logger.info(f"Text {i}: '{elem.content}', bbox={elem.bbox.coordinates}") 463 | 464 | except Exception as e: 465 | logger.error(f"Error processing image: {str(e)}") 466 | import traceback 467 | 468 | logger.error(traceback.format_exc()) 469 | return 1 470 | 471 | return 0 472 | 473 | 474 | if __name__ == "__main__": 475 | import sys 476 | 477 | sys.exit(main()) 478 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/cli.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | CLI chat interface for agent - Computer Use Agent 3 | 4 | Usage: 5 | python -m agent.cli <model_string> 6 | 7 | Examples: 8 | python -m agent.cli openai/computer-use-preview 9 | python -m agent.cli anthropic/claude-3-5-sonnet-20241022 10 | python -m agent.cli 
omniparser+anthropic/claude-3-5-sonnet-20241022 11 | """ 12 | 13 | try: 14 | import asyncio 15 | import argparse 16 | import os 17 | import sys 18 | import json 19 | from typing import List, Dict, Any 20 | import dotenv 21 | import base64 22 | import time 23 | import platform 24 | from pathlib import Path 25 | try: 26 | from PIL import Image, ImageDraw 27 | PIL_AVAILABLE = True 28 | except Exception: 29 | PIL_AVAILABLE = False 30 | from yaspin import yaspin 31 | except ImportError: 32 | if __name__ == "__main__": 33 | raise ImportError( 34 | "CLI dependencies not found. " 35 | "Please install with: pip install \"cua-agent[cli]\"" 36 | ) 37 | 38 | # Load environment variables 39 | dotenv.load_dotenv() 40 | 41 | # Color codes for terminal output 42 | class Colors: 43 | RESET = '\033[0m' 44 | BOLD = '\033[1m' 45 | DIM = '\033[2m' 46 | 47 | # Text colors 48 | RED = '\033[31m' 49 | GREEN = '\033[32m' 50 | YELLOW = '\033[33m' 51 | BLUE = '\033[34m' 52 | MAGENTA = '\033[35m' 53 | CYAN = '\033[36m' 54 | WHITE = '\033[37m' 55 | GRAY = '\033[90m' 56 | 57 | # Background colors 58 | BG_RED = '\033[41m' 59 | BG_GREEN = '\033[42m' 60 | BG_YELLOW = '\033[43m' 61 | BG_BLUE = '\033[44m' 62 | 63 | def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n", right: str = ""): 64 | """Print colored text to terminal with optional right-aligned text.""" 65 | prefix = "" 66 | if bold: 67 | prefix += Colors.BOLD 68 | if dim: 69 | prefix += Colors.DIM 70 | if color: 71 | prefix += color 72 | 73 | if right: 74 | # Get terminal width (default to 80 if unable to determine) 75 | try: 76 | import shutil 77 | terminal_width = shutil.get_terminal_size().columns 78 | except: 79 | terminal_width = 80 80 | 81 | # Add right margin 82 | terminal_width -= 1 83 | 84 | # Calculate padding needed 85 | # Account for ANSI escape codes not taking visual space 86 | visible_left_len = len(text) 87 | visible_right_len = len(right) 88 | padding = terminal_width - 
visible_left_len - visible_right_len 89 | 90 | if padding > 0: 91 | output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}" 92 | else: 93 | # If not enough space, just put a single space between 94 | output = f"{prefix}{text} {right}{Colors.RESET}" 95 | else: 96 | output = f"{prefix}{text}{Colors.RESET}" 97 | 98 | print(output, end=end) 99 | 100 | 101 | def print_action(action_type: str, details: Dict[str, Any], total_cost: float): 102 | """Print computer action with nice formatting.""" 103 | # Format action details 104 | args_str = "" 105 | if action_type == "click" and "x" in details and "y" in details: 106 | args_str = f"_{details.get('button', 'left')}({details['x']}, {details['y']})" 107 | elif action_type == "type" and "text" in details: 108 | text = details["text"] 109 | if len(text) > 50: 110 | text = text[:47] + "..." 111 | args_str = f'("{text}")' 112 | elif action_type == "key" and "text" in details: 113 | args_str = f"('{details['text']}')" 114 | elif action_type == "scroll" and "x" in details and "y" in details: 115 | args_str = f"({details['x']}, {details['y']})" 116 | 117 | if total_cost > 0: 118 | print_colored(f"🛠️ {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}") 119 | else: 120 | print_colored(f"🛠️ {action_type}{args_str}", dim=True) 121 | 122 | def print_welcome(model: str, agent_loop: str, container_name: str): 123 | """Print welcome message.""" 124 | print_colored(f"Connected to {container_name} ({model}, {agent_loop})") 125 | print_colored("Type 'exit' to quit.", dim=True) 126 | 127 | async def ainput(prompt: str = ""): 128 | return await asyncio.to_thread(input, prompt) 129 | 130 | async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True): 131 | """Main chat loop with the agent.""" 132 | print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name) 133 | 134 | history = [] 135 | 136 | if initial_prompt: 137 | history.append({"role": "user", 
                        "content": initial_prompt})

    total_cost = 0

    # Interactive REPL: alternate between collecting user input and streaming
    # agent output until the user types an exit command.
    while True:
        if len(history) == 0 or history[-1].get("role") != "user":
            # Get user input with prompt
            print_colored("> ", end="")
            user_input = await ainput()

            if user_input.lower() in ['exit', 'quit', 'q']:
                print_colored("\n👋 Goodbye!")
                break

            if not user_input:
                continue

            # Add user message to history
            history.append({"role": "user", "content": user_input})

        # Stream responses from the agent with spinner
        with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
            spinner.hide()

            async for result in agent.run(history):
                # Add agent responses to history
                history.extend(result.get("output", []))

                if show_usage:
                    # NOTE(review): assumes "response_cost" is numeric when
                    # present; a missing usage entry counts as zero cost.
                    total_cost += result.get("usage", {}).get("response_cost", 0)

                # Process and display the output
                for item in result.get("output", []):
                    if item.get("type") == "message" and item.get("role") == "assistant":
                        # Display agent text response
                        content = item.get("content", [])
                        for content_part in content:
                            if content_part.get("text"):
                                text = content_part.get("text", "").strip()
                                if text:
                                    spinner.hide()
                                    print_colored(text)

                    elif item.get("type") == "computer_call":
                        # Display computer action
                        action = item.get("action", {})
                        action_type = action.get("type", "")
                        if action_type:
                            spinner.hide()
                            print_action(action_type, action, total_cost)
                            spinner.text = f"Performing {action_type}..."
                            spinner.show()

                    elif item.get("type") == "function_call":
                        # Display function call
                        function_name = item.get("name", "")
                        spinner.hide()
                        print_colored(f"🔧 Calling function: {function_name}", dim=True)
                        spinner.text = f"Calling {function_name}..."
                        spinner.show()

                    elif item.get("type") == "function_call_output":
                        # Display function output (dimmed)
                        output = item.get("output", "")
                        if output and len(output.strip()) > 0:
                            spinner.hide()
                            print_colored(f"📤 {output}", dim=True)

            spinner.hide()
        # Show the running total after each agent turn.
        if show_usage and total_cost > 0:
            print_colored(f"Total cost: ${total_cost:.2f}", dim=True)


async def main():
    """Main CLI function."""
    parser = argparse.ArgumentParser(
        description="CUA Agent CLI - Interactive computer use assistant",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m agent.cli openai/computer-use-preview
  python -m agent.cli anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
        """
    )

    parser.add_argument(
        "model",
        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
    )

    parser.add_argument(
        "--provider",
        choices=["cloud", "lume", "winsandbox", "docker"],
        default="cloud",
        help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
    )

    parser.add_argument(
        "--images",
        type=int,
        default=3,
        help="Number of recent images to keep in context (default: 3)"
    )

    parser.add_argument(
        "--trajectory",
        action="store_true",
        help="Save trajectory for debugging"
    )

    parser.add_argument(
        "--budget",
        type=float,
        help="Maximum budget for the session (in dollars)"
    )

    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )

    parser.add_argument(
        "-p", "--prompt",
        type=str,
        help="Initial prompt to
send to the agent. Leave blank for interactive mode." 265 | ) 266 | 267 | parser.add_argument( 268 | "--prompt-file", 269 | type=Path, 270 | help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt." 271 | ) 272 | 273 | parser.add_argument( 274 | "--predict-click", 275 | dest="predict_click", 276 | type=str, 277 | help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it." 278 | ) 279 | 280 | parser.add_argument( 281 | "-c", "--cache", 282 | action="store_true", 283 | help="Tell the API to enable caching" 284 | ) 285 | 286 | parser.add_argument( 287 | "-u", "--usage", 288 | action="store_true", 289 | help="Show total cost of the agent runs" 290 | ) 291 | 292 | parser.add_argument( 293 | "-r", "--max-retries", 294 | type=int, 295 | default=3, 296 | help="Maximum number of retries for the LLM API calls" 297 | ) 298 | 299 | args = parser.parse_args() 300 | 301 | # Check for required environment variables 302 | container_name = os.getenv("CUA_CONTAINER_NAME") 303 | cua_api_key = os.getenv("CUA_API_KEY") 304 | 305 | # Prompt for missing environment variables (container name always required) 306 | if not container_name: 307 | if args.provider == "cloud": 308 | print_colored("CUA_CONTAINER_NAME not set.", dim=True) 309 | print_colored("You can get a CUA container at https://www.trycua.com/", dim=True) 310 | container_name = input("Enter your CUA container name: ").strip() 311 | if not container_name: 312 | print_colored("❌ Container name is required.") 313 | sys.exit(1) 314 | else: 315 | container_name = "cli-sandbox" 316 | 317 | # Only require API key for cloud provider 318 | if args.provider == "cloud" and not cua_api_key: 319 | print_colored("CUA_API_KEY not set.", dim=True) 320 | cua_api_key = input("Enter your CUA API key: ").strip() 321 | if not cua_api_key: 322 | print_colored("❌ API key is required for cloud provider.") 323 | sys.exit(1) 
324 | 325 | # Check for provider-specific API keys based on model 326 | provider_api_keys = { 327 | "openai/": "OPENAI_API_KEY", 328 | "anthropic/": "ANTHROPIC_API_KEY", 329 | } 330 | 331 | # Find matching provider and check for API key 332 | for prefix, env_var in provider_api_keys.items(): 333 | if prefix in args.model: 334 | if not os.getenv(env_var): 335 | print_colored(f"{env_var} not set.", dim=True) 336 | api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip() 337 | if not api_key: 338 | print_colored(f"❌ {env_var.replace('_', ' ').title()} is required.") 339 | sys.exit(1) 340 | # Set the environment variable for the session 341 | os.environ[env_var] = api_key 342 | break 343 | 344 | # Import here to avoid import errors if dependencies are missing 345 | try: 346 | from agent import ComputerAgent 347 | from computer import Computer 348 | except ImportError as e: 349 | print_colored(f"❌ Import error: {e}", Colors.RED, bold=True) 350 | print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW) 351 | sys.exit(1) 352 | 353 | # Resolve provider -> os_type, provider_type, api key requirement 354 | provider_map = { 355 | "cloud": ("linux", "cloud", True), 356 | "lume": ("macos", "lume", False), 357 | "winsandbox": ("windows", "winsandbox", False), 358 | "docker": ("linux", "docker", False), 359 | } 360 | os_type, provider_type, needs_api_key = provider_map[args.provider] 361 | 362 | computer_kwargs = { 363 | "os_type": os_type, 364 | "provider_type": provider_type, 365 | "name": container_name, 366 | } 367 | if needs_api_key: 368 | computer_kwargs["api_key"] = cua_api_key # type: ignore 369 | 370 | # Create computer instance 371 | async with Computer(**computer_kwargs) as computer: # type: ignore 372 | 373 | # Create agent 374 | agent_kwargs = { 375 | "model": args.model, 376 | "tools": [computer], 377 | "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA) 378 | "verbosity": 20 if 
args.verbose else 30, # DEBUG vs WARNING 379 | "max_retries": args.max_retries 380 | } 381 | 382 | if args.images > 0: 383 | agent_kwargs["only_n_most_recent_images"] = args.images 384 | 385 | if args.trajectory: 386 | agent_kwargs["trajectory_dir"] = "trajectories" 387 | 388 | if args.budget: 389 | agent_kwargs["max_trajectory_budget"] = { 390 | "max_budget": args.budget, 391 | "raise_error": True, 392 | "reset_after_each_run": False 393 | } 394 | 395 | if args.cache: 396 | agent_kwargs["use_prompt_caching"] = True 397 | 398 | agent = ComputerAgent(**agent_kwargs) 399 | 400 | # If predict-click mode is requested, run once and exit 401 | if args.predict_click: 402 | if not PIL_AVAILABLE: 403 | print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True) 404 | sys.exit(1) 405 | 406 | instruction = args.predict_click 407 | print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN) 408 | 409 | # Take a fresh screenshot FIRST 410 | try: 411 | img_bytes = await computer.interface.screenshot() 412 | except Exception as e: 413 | print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True) 414 | sys.exit(1) 415 | 416 | # Encode screenshot to base64 for predict_click 417 | try: 418 | image_b64 = base64.b64encode(img_bytes).decode("utf-8") 419 | except Exception as e: 420 | print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True) 421 | sys.exit(1) 422 | 423 | try: 424 | coords = await agent.predict_click(instruction, image_b64=image_b64) 425 | except Exception as e: 426 | print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True) 427 | sys.exit(1) 428 | 429 | if not coords: 430 | print_colored("⚠️ No coordinates returned.", Colors.YELLOW) 431 | sys.exit(2) 432 | 433 | x, y = coords 434 | print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN) 435 | 436 | try: 437 | from io import BytesIO 438 | with Image.open(BytesIO(img_bytes)) as img: 
439 | img = img.convert("RGB") 440 | draw = ImageDraw.Draw(img) 441 | # Draw crosshair 442 | size = 12 443 | color = (255, 0, 0) 444 | draw.line([(x - size, y), (x + size, y)], fill=color, width=3) 445 | draw.line([(x, y - size), (x, y + size)], fill=color, width=3) 446 | # Optional small circle 447 | r = 6 448 | draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2) 449 | 450 | out_path = Path.cwd() / f"predict_click_{int(time.time())}.png" 451 | img.save(out_path) 452 | print_colored(f"🖼️ Saved to {out_path}") 453 | 454 | # Open the image with default viewer 455 | try: 456 | system = platform.system().lower() 457 | if system == "windows": 458 | os.startfile(str(out_path)) # type: ignore[attr-defined] 459 | elif system == "darwin": 460 | os.system(f"open \"{out_path}\"") 461 | else: 462 | os.system(f"xdg-open \"{out_path}\"") 463 | except Exception: 464 | pass 465 | except Exception as e: 466 | print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True) 467 | sys.exit(1) 468 | 469 | # Done 470 | sys.exit(0) 471 | 472 | # Resolve initial prompt from --prompt-file or --prompt 473 | initial_prompt = args.prompt or "" 474 | if args.prompt_file: 475 | try: 476 | initial_prompt = args.prompt_file.read_text(encoding="utf-8") 477 | except Exception as e: 478 | print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True) 479 | sys.exit(1) 480 | 481 | # Start chat loop (default interactive mode) 482 | await chat_loop(agent, args.model, container_name, initial_prompt, args.usage) 483 | 484 | 485 | 486 | if __name__ == "__main__": 487 | try: 488 | asyncio.run(main()) 489 | except (KeyboardInterrupt, EOFError) as _: 490 | print_colored("\n\n👋 Goodbye!") ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/loops/moondream3.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Moondream3+ composed-grounded agent loop 
implementation. 3 | Grounding is handled by a local Moondream3 preview model via Transformers. 4 | Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>". 5 | 6 | Differences from composed_grounded: 7 | - Provides a singleton Moondream3 client outside the class. 8 | - predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates. 9 | - If the last image was a screenshot (or we take one), run model.detect(image, "all form ui") to get bboxes, then 10 | run model.caption on each cropped bbox to label it. Overlay labels on the screenshot and emit via _on_screenshot. 11 | - Add a user message listing all detected form UI names so the thinker can reference them. 12 | - If the thinking model doesn't support vision, filter out image content before calling litellm. 13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | import uuid 18 | import base64 19 | import io 20 | from typing import Dict, List, Any, Optional, Tuple, Any 21 | 22 | from PIL import Image, ImageDraw, ImageFont 23 | import torch 24 | from transformers import AutoModelForCausalLM 25 | import litellm 26 | 27 | from ..decorators import register_agent 28 | from ..types import AgentCapability 29 | from ..loops.base import AsyncAgentConfig 30 | from ..responses import ( 31 | convert_computer_calls_xy2desc, 32 | convert_responses_items_to_completion_messages, 33 | convert_completion_messages_to_responses_items, 34 | convert_computer_calls_desc2xy, 35 | get_all_element_descriptions, 36 | ) 37 | 38 | _MOONDREAM_SINGLETON = None 39 | 40 | def get_moondream_model() -> Any: 41 | """Get a singleton instance of the Moondream3 preview model.""" 42 | global _MOONDREAM_SINGLETON 43 | if _MOONDREAM_SINGLETON is None: 44 | _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained( 45 | "moondream/moondream3-preview", 46 | trust_remote_code=True, 47 | torch_dtype=torch.bfloat16, 48 | device_map="cuda", 49 | ) 50 | return 
_MOONDREAM_SINGLETON 51 | 52 | 53 | def _decode_image_b64(image_b64: str) -> Image.Image: 54 | data = base64.b64decode(image_b64) 55 | return Image.open(io.BytesIO(data)).convert("RGB") 56 | 57 | 58 | def _image_to_b64(img: Image.Image) -> str: 59 | buf = io.BytesIO() 60 | img.save(buf, format="PNG") 61 | return base64.b64encode(buf.getvalue()).decode("utf-8") 62 | 63 | 64 | def _supports_vision(model: str) -> bool: 65 | """Heuristic vision support detection for thinking model.""" 66 | m = model.lower() 67 | vision_markers = [ 68 | "gpt-4o", 69 | "gpt-4.1", 70 | "o1", 71 | "o3", 72 | "claude-3", 73 | "claude-3.5", 74 | "sonnet", 75 | "haiku", 76 | "opus", 77 | "gemini-1.5", 78 | "llava", 79 | ] 80 | return any(v in m for v in vision_markers) 81 | 82 | 83 | def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 84 | filtered: List[Dict[str, Any]] = [] 85 | for msg in messages: 86 | msg_copy = {**msg} 87 | content = msg_copy.get("content") 88 | if isinstance(content, list): 89 | msg_copy["content"] = [c for c in content if c.get("type") != "image_url"] 90 | filtered.append(msg_copy) 91 | return filtered 92 | 93 | def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]: 94 | """Detect UI elements with Moondream, caption each, draw labels with backgrounds. 95 | 96 | Args: 97 | base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally. 98 | model_md: Moondream model instance with .detect() and .query() methods. 
99 | 100 | Returns: 101 | A tuple of (annotated_image_base64_png, detected_names) 102 | """ 103 | # Ensure RGBA for semi-transparent fills 104 | if base_img.mode != "RGBA": 105 | base_img = base_img.convert("RGBA") 106 | W, H = base_img.width, base_img.height 107 | 108 | # Detect objects 109 | try: 110 | detect_result = model_md.detect(base_img, "all ui elements") 111 | objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else [] 112 | except Exception: 113 | objects = [] 114 | 115 | draw = ImageDraw.Draw(base_img) 116 | try: 117 | font = ImageFont.load_default() 118 | except Exception: 119 | font = None 120 | 121 | detected_names: List[str] = [] 122 | 123 | for i, obj in enumerate(objects): 124 | try: 125 | # Clamp normalized coords and crop 126 | x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0)))) 127 | y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0)))) 128 | x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0)))) 129 | y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0)))) 130 | left, top, right, bottom = int(x_min * W), int(y_min * H), int(x_max * W), int(y_max * H) 131 | left, top = max(0, left), max(0, top) 132 | right, bottom = min(W - 1, right), min(H - 1, bottom) 133 | crop = base_img.crop((left, top, right, bottom)) 134 | 135 | # Prompted short caption 136 | try: 137 | result = model_md.query(crop, "Caption this UI element in few words.") 138 | caption_text = (result or {}).get("answer", "") 139 | except Exception: 140 | caption_text = "" 141 | 142 | name = (caption_text or "").strip() or f"element_{i+1}" 143 | detected_names.append(name) 144 | 145 | # Draw bbox 146 | draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2) 147 | 148 | # Label background with padding and rounded corners 149 | label = f"{i+1}. 
{name}" 150 | padding = 3 151 | if font: 152 | text_bbox = draw.textbbox((0, 0), label, font=font) 153 | else: 154 | text_bbox = draw.textbbox((0, 0), label) 155 | text_w = text_bbox[2] - text_bbox[0] 156 | text_h = text_bbox[3] - text_bbox[1] 157 | 158 | tx = left + 3 159 | ty = top - (text_h + 2 * padding + 4) 160 | if ty < 0: 161 | ty = top + 3 162 | 163 | bg_left = tx - padding 164 | bg_top = ty - padding 165 | bg_right = tx + text_w + padding 166 | bg_bottom = ty + text_h + padding 167 | try: 168 | draw.rounded_rectangle( 169 | [bg_left, bg_top, bg_right, bg_bottom], 170 | radius=4, 171 | fill=(0, 0, 0, 160), 172 | outline=(255, 215, 0, 200), 173 | width=1, 174 | ) 175 | except Exception: 176 | draw.rectangle( 177 | [bg_left, bg_top, bg_right, bg_bottom], 178 | fill=(0, 0, 0, 160), 179 | outline=(255, 215, 0, 200), 180 | width=1, 181 | ) 182 | 183 | text_fill = (255, 255, 255, 255) 184 | if font: 185 | draw.text((tx, ty), label, fill=text_fill, font=font) 186 | else: 187 | draw.text((tx, ty), label, fill=text_fill) 188 | except Exception: 189 | continue 190 | 191 | # Encode PNG base64 192 | annotated = base_img 193 | if annotated.mode not in ("RGBA", "RGB"): 194 | annotated = annotated.convert("RGBA") 195 | annotated_b64 = _image_to_b64(annotated) 196 | return annotated_b64, detected_names 197 | 198 | GROUNDED_COMPUTER_TOOL_SCHEMA = { 199 | "type": "function", 200 | "function": { 201 | "name": "computer", 202 | "description": ( 203 | "Control a computer by taking screenshots and interacting with UI elements. " 204 | "The screenshot action will include a list of detected form UI element names when available. " 205 | "Use element descriptions to locate and interact with UI elements on the screen." 
206 | ), 207 | "parameters": { 208 | "type": "object", 209 | "properties": { 210 | "action": { 211 | "type": "string", 212 | "enum": [ 213 | "screenshot", 214 | "click", 215 | "double_click", 216 | "drag", 217 | "type", 218 | "keypress", 219 | "scroll", 220 | "move", 221 | "wait", 222 | "get_current_url", 223 | "get_dimensions", 224 | "get_environment", 225 | ], 226 | "description": "The action to perform (required for all actions)", 227 | }, 228 | "element_description": { 229 | "type": "string", 230 | "description": "Description of the element to interact with (required for click/double_click/move/scroll)", 231 | }, 232 | "start_element_description": { 233 | "type": "string", 234 | "description": "Description of the element to start dragging from (required for drag)", 235 | }, 236 | "end_element_description": { 237 | "type": "string", 238 | "description": "Description of the element to drag to (required for drag)", 239 | }, 240 | "text": { 241 | "type": "string", 242 | "description": "The text to type (required for type)", 243 | }, 244 | "keys": { 245 | "type": "array", 246 | "items": {"type": "string"}, 247 | "description": "Key(s) to press (required for keypress)", 248 | }, 249 | "button": { 250 | "type": "string", 251 | "enum": ["left", "right", "wheel", "back", "forward"], 252 | "description": "The mouse button to use for click/double_click", 253 | }, 254 | "scroll_x": { 255 | "type": "integer", 256 | "description": "Horizontal scroll amount (required for scroll)", 257 | }, 258 | "scroll_y": { 259 | "type": "integer", 260 | "description": "Vertical scroll amount (required for scroll)", 261 | }, 262 | }, 263 | "required": ["action"], 264 | }, 265 | }, 266 | } 267 | 268 | @register_agent(r"moondream3\+.*", priority=2) 269 | class Moondream3PlusConfig(AsyncAgentConfig): 270 | def __init__(self): 271 | self.desc2xy: Dict[str, Tuple[float, float]] = {} 272 | 273 | async def predict_step( 274 | self, 275 | messages: List[Dict[str, Any]], 276 | model: str, 277 | 
tools: Optional[List[Dict[str, Any]]] = None, 278 | max_retries: Optional[int] = None, 279 | stream: bool = False, 280 | computer_handler=None, 281 | use_prompt_caching: Optional[bool] = False, 282 | _on_api_start=None, 283 | _on_api_end=None, 284 | _on_usage=None, 285 | _on_screenshot=None, 286 | **kwargs, 287 | ) -> Dict[str, Any]: 288 | # Parse composed model: moondream3+<thinking_model> 289 | if "+" not in model: 290 | raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}") 291 | _, thinking_model = model.split("+", 1) 292 | 293 | pre_output_items: List[Dict[str, Any]] = [] 294 | 295 | # Acquire last screenshot; if missing, take one 296 | last_image_b64: Optional[str] = None 297 | for message in reversed(messages): 298 | if ( 299 | isinstance(message, dict) 300 | and message.get("type") == "computer_call_output" 301 | and isinstance(message.get("output"), dict) 302 | and message["output"].get("type") == "input_image" 303 | ): 304 | image_url = message["output"].get("image_url", "") 305 | if image_url.startswith("data:image/png;base64,"): 306 | last_image_b64 = image_url.split(",", 1)[1] 307 | break 308 | 309 | if last_image_b64 is None and computer_handler is not None: 310 | # Take a screenshot 311 | screenshot_b64 = await computer_handler.screenshot() # type: ignore 312 | if screenshot_b64: 313 | call_id = uuid.uuid4().hex 314 | pre_output_items += [ 315 | { 316 | "type": "message", 317 | "role": "assistant", 318 | "content": [ 319 | {"type": "output_text", "text": "Taking a screenshot to analyze the current screen."} 320 | ], 321 | }, 322 | {"type": "computer_call", "call_id": call_id, "status": "completed", "action": {"type": "screenshot"}}, 323 | { 324 | "type": "computer_call_output", 325 | "call_id": call_id, 326 | "output": {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"}, 327 | }, 328 | ] 329 | last_image_b64 = screenshot_b64 330 | if _on_screenshot: 331 | await 
_on_screenshot(screenshot_b64) 332 | 333 | # If we have a last screenshot, run Moondream detection and labeling 334 | detected_names: List[str] = [] 335 | if last_image_b64 is not None: 336 | base_img = _decode_image_b64(last_image_b64) 337 | model_md = get_moondream_model() 338 | annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md) 339 | if _on_screenshot: 340 | await _on_screenshot(annotated_b64, "annotated_form_ui") 341 | 342 | # Also push a user message listing all detected names 343 | if detected_names: 344 | names_text = "\n".join(f"- {n}" for n in detected_names) 345 | pre_output_items.append( 346 | { 347 | "type": "message", 348 | "role": "user", 349 | "content": [ 350 | {"type": "input_text", "text": "Detected form UI elements on screen:"}, 351 | {"type": "input_text", "text": names_text}, 352 | {"type": "input_text", "text": "Please continue with the next action needed to perform your task."} 353 | ], 354 | } 355 | ) 356 | 357 | tool_schemas = [] 358 | for schema in (tools or []): 359 | if schema.get("type") == "computer": 360 | tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA) 361 | else: 362 | tool_schemas.append(schema) 363 | 364 | # Step 1: Convert computer calls from xy to descriptions 365 | input_messages = messages + pre_output_items 366 | messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy) 367 | 368 | # Step 2: Convert responses items to completion messages 369 | completion_messages = convert_responses_items_to_completion_messages( 370 | messages_with_descriptions, 371 | allow_images_in_tool_results=False, 372 | ) 373 | 374 | # Optionally filter images if model lacks vision 375 | if not _supports_vision(thinking_model): 376 | completion_messages = _filter_images_from_completion_messages(completion_messages) 377 | 378 | # Step 3: Call thinking model with litellm.acompletion 379 | api_kwargs = { 380 | "model": thinking_model, 381 | "messages": completion_messages, 382 | "tools": 
tool_schemas, 383 | "max_retries": max_retries, 384 | "stream": stream, 385 | **kwargs, 386 | } 387 | if use_prompt_caching: 388 | api_kwargs["use_prompt_caching"] = use_prompt_caching 389 | 390 | if _on_api_start: 391 | await _on_api_start(api_kwargs) 392 | 393 | response = await litellm.acompletion(**api_kwargs) 394 | 395 | if _on_api_end: 396 | await _on_api_end(api_kwargs, response) 397 | 398 | usage = { 399 | **response.usage.model_dump(), # type: ignore 400 | "response_cost": response._hidden_params.get("response_cost", 0.0), 401 | } 402 | if _on_usage: 403 | await _on_usage(usage) 404 | 405 | # Step 4: Convert completion messages back to responses items format 406 | response_dict = response.model_dump() # type: ignore 407 | choice_messages = [choice["message"] for choice in response_dict["choices"]] 408 | thinking_output_items: List[Dict[str, Any]] = [] 409 | for choice_message in choice_messages: 410 | thinking_output_items.extend( 411 | convert_completion_messages_to_responses_items([choice_message]) 412 | ) 413 | 414 | # Step 5: Use Moondream to get coordinates for each description 415 | element_descriptions = get_all_element_descriptions(thinking_output_items) 416 | if element_descriptions and last_image_b64: 417 | for desc in element_descriptions: 418 | for _ in range(3): # try 3 times 419 | coords = await self.predict_click( 420 | model=model, 421 | image_b64=last_image_b64, 422 | instruction=desc, 423 | ) 424 | if coords: 425 | self.desc2xy[desc] = coords 426 | break 427 | 428 | # Step 6: Convert computer calls from descriptions back to xy coordinates 429 | final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy) 430 | 431 | # Step 7: Return output and usage 432 | return {"output": pre_output_items + final_output_items, "usage": usage} 433 | 434 | async def predict_click( 435 | self, 436 | model: str, 437 | image_b64: str, 438 | instruction: str, 439 | **kwargs, 440 | ) -> Optional[Tuple[float, float]]: 441 | """Predict 
click coordinates using Moondream3's point API. 442 | 443 | Returns pixel coordinates (x, y) as floats. 444 | """ 445 | img = _decode_image_b64(image_b64) 446 | W, H = img.width, img.height 447 | model_md = get_moondream_model() 448 | try: 449 | result = model_md.point(img, instruction, settings={"max_objects": 1}) 450 | except Exception: 451 | return None 452 | 453 | try: 454 | pt = (result or {}).get("points", [])[0] 455 | x_norm = float(pt.get("x", 0.0)) 456 | y_norm = float(pt.get("y", 0.0)) 457 | x_px = max(0.0, min(float(W - 1), x_norm * W)) 458 | y_px = max(0.0, min(float(H - 1), y_norm * H)) 459 | return (x_px, y_px) 460 | except Exception: 461 | return None 462 | 463 | def get_capabilities(self) -> List[AgentCapability]: 464 | return ["click", "step"] 465 | ``` -------------------------------------------------------------------------------- /libs/typescript/computer/src/interface/macos.ts: -------------------------------------------------------------------------------- ```typescript 1 | /** 2 | * macOS computer interface implementation. 3 | */ 4 | 5 | import type { ScreenSize } from '../types'; 6 | import { BaseComputerInterface } from './base'; 7 | import type { AccessibilityNode, CursorPosition, MouseButton } from './base'; 8 | 9 | export class MacOSComputerInterface extends BaseComputerInterface { 10 | // Mouse Actions 11 | /** 12 | * Press and hold a mouse button at the specified coordinates. 13 | * @param {number} [x] - X coordinate for the mouse action 14 | * @param {number} [y] - Y coordinate for the mouse action 15 | * @param {MouseButton} [button='left'] - Mouse button to press down 16 | * @returns {Promise<void>} 17 | */ 18 | async mouseDown( 19 | x?: number, 20 | y?: number, 21 | button: MouseButton = 'left' 22 | ): Promise<void> { 23 | await this.sendCommand('mouse_down', { x, y, button }); 24 | } 25 | 26 | /** 27 | * Release a mouse button at the specified coordinates. 
28 | * @param {number} [x] - X coordinate for the mouse action 29 | * @param {number} [y] - Y coordinate for the mouse action 30 | * @param {MouseButton} [button='left'] - Mouse button to release 31 | * @returns {Promise<void>} 32 | */ 33 | async mouseUp( 34 | x?: number, 35 | y?: number, 36 | button: MouseButton = 'left' 37 | ): Promise<void> { 38 | await this.sendCommand('mouse_up', { x, y, button }); 39 | } 40 | 41 | /** 42 | * Perform a left mouse click at the specified coordinates. 43 | * @param {number} [x] - X coordinate for the click 44 | * @param {number} [y] - Y coordinate for the click 45 | * @returns {Promise<void>} 46 | */ 47 | async leftClick(x?: number, y?: number): Promise<void> { 48 | await this.sendCommand('left_click', { x, y }); 49 | } 50 | 51 | /** 52 | * Perform a right mouse click at the specified coordinates. 53 | * @param {number} [x] - X coordinate for the click 54 | * @param {number} [y] - Y coordinate for the click 55 | * @returns {Promise<void>} 56 | */ 57 | async rightClick(x?: number, y?: number): Promise<void> { 58 | await this.sendCommand('right_click', { x, y }); 59 | } 60 | 61 | /** 62 | * Perform a double click at the specified coordinates. 63 | * @param {number} [x] - X coordinate for the double click 64 | * @param {number} [y] - Y coordinate for the double click 65 | * @returns {Promise<void>} 66 | */ 67 | async doubleClick(x?: number, y?: number): Promise<void> { 68 | await this.sendCommand('double_click', { x, y }); 69 | } 70 | 71 | /** 72 | * Move the cursor to the specified coordinates. 73 | * @param {number} x - X coordinate to move to 74 | * @param {number} y - Y coordinate to move to 75 | * @returns {Promise<void>} 76 | */ 77 | async moveCursor(x: number, y: number): Promise<void> { 78 | await this.sendCommand('move_cursor', { x, y }); 79 | } 80 | 81 | /** 82 | * Drag from current position to the specified coordinates. 
83 | * @param {number} x - X coordinate to drag to 84 | * @param {number} y - Y coordinate to drag to 85 | * @param {MouseButton} [button='left'] - Mouse button to use for dragging 86 | * @param {number} [duration=0.5] - Duration of the drag operation in seconds 87 | * @returns {Promise<void>} 88 | */ 89 | async dragTo( 90 | x: number, 91 | y: number, 92 | button: MouseButton = 'left', 93 | duration = 0.5 94 | ): Promise<void> { 95 | await this.sendCommand('drag_to', { x, y, button, duration }); 96 | } 97 | 98 | /** 99 | * Drag along a path of coordinates. 100 | * @param {Array<[number, number]>} path - Array of [x, y] coordinate pairs to drag through 101 | * @param {MouseButton} [button='left'] - Mouse button to use for dragging 102 | * @param {number} [duration=0.5] - Duration of the drag operation in seconds 103 | * @returns {Promise<void>} 104 | */ 105 | async drag( 106 | path: Array<[number, number]>, 107 | button: MouseButton = 'left', 108 | duration = 0.5 109 | ): Promise<void> { 110 | await this.sendCommand('drag', { path, button, duration }); 111 | } 112 | 113 | // Keyboard Actions 114 | /** 115 | * Press and hold a key. 116 | * @param {string} key - Key to press down 117 | * @returns {Promise<void>} 118 | */ 119 | async keyDown(key: string): Promise<void> { 120 | await this.sendCommand('key_down', { key }); 121 | } 122 | 123 | /** 124 | * Release a key. 125 | * @param {string} key - Key to release 126 | * @returns {Promise<void>} 127 | */ 128 | async keyUp(key: string): Promise<void> { 129 | await this.sendCommand('key_up', { key }); 130 | } 131 | 132 | /** 133 | * Type text as if entered from keyboard. 134 | * @param {string} text - Text to type 135 | * @returns {Promise<void>} 136 | */ 137 | async typeText(text: string): Promise<void> { 138 | await this.sendCommand('type_text', { text }); 139 | } 140 | 141 | /** 142 | * Press and release a key. 
143 | * @param {string} key - Key to press 144 | * @returns {Promise<void>} 145 | */ 146 | async pressKey(key: string): Promise<void> { 147 | await this.sendCommand('press_key', { key }); 148 | } 149 | 150 | /** 151 | * Press multiple keys simultaneously as a hotkey combination. 152 | * @param {...string} keys - Keys to press together 153 | * @returns {Promise<void>} 154 | */ 155 | async hotkey(...keys: string[]): Promise<void> { 156 | await this.sendCommand('hotkey', { keys }); 157 | } 158 | 159 | // Scrolling Actions 160 | /** 161 | * Scroll by the specified amount in x and y directions. 162 | * @param {number} x - Horizontal scroll amount 163 | * @param {number} y - Vertical scroll amount 164 | * @returns {Promise<void>} 165 | */ 166 | async scroll(x: number, y: number): Promise<void> { 167 | await this.sendCommand('scroll', { x, y }); 168 | } 169 | 170 | /** 171 | * Scroll down by the specified number of clicks. 172 | * @param {number} [clicks=1] - Number of scroll clicks 173 | * @returns {Promise<void>} 174 | */ 175 | async scrollDown(clicks = 1): Promise<void> { 176 | await this.sendCommand('scroll_down', { clicks }); 177 | } 178 | 179 | /** 180 | * Scroll up by the specified number of clicks. 181 | * @param {number} [clicks=1] - Number of scroll clicks 182 | * @returns {Promise<void>} 183 | */ 184 | async scrollUp(clicks = 1): Promise<void> { 185 | await this.sendCommand('scroll_up', { clicks }); 186 | } 187 | 188 | // Screen Actions 189 | /** 190 | * Take a screenshot of the screen. 191 | * @returns {Promise<Buffer>} Screenshot image data as a Buffer 192 | * @throws {Error} If screenshot fails 193 | */ 194 | async screenshot(): Promise<Buffer> { 195 | const response = await this.sendCommand('screenshot'); 196 | if (!response.image_data) { 197 | throw new Error('Failed to take screenshot'); 198 | } 199 | return Buffer.from(response.image_data as string, 'base64'); 200 | } 201 | 202 | /** 203 | * Get the current screen size. 
204 | * @returns {Promise<ScreenSize>} Screen dimensions 205 | * @throws {Error} If unable to get screen size 206 | */ 207 | async getScreenSize(): Promise<ScreenSize> { 208 | const response = await this.sendCommand('get_screen_size'); 209 | if (!response.success || !response.size) { 210 | throw new Error('Failed to get screen size'); 211 | } 212 | return response.size as ScreenSize; 213 | } 214 | 215 | /** 216 | * Get the current cursor position. 217 | * @returns {Promise<CursorPosition>} Current cursor coordinates 218 | * @throws {Error} If unable to get cursor position 219 | */ 220 | async getCursorPosition(): Promise<CursorPosition> { 221 | const response = await this.sendCommand('get_cursor_position'); 222 | if (!response.success || !response.position) { 223 | throw new Error('Failed to get cursor position'); 224 | } 225 | return response.position as CursorPosition; 226 | } 227 | 228 | // Clipboard Actions 229 | /** 230 | * Copy current selection to clipboard and return the content. 231 | * @returns {Promise<string>} Clipboard content 232 | * @throws {Error} If unable to get clipboard content 233 | */ 234 | async copyToClipboard(): Promise<string> { 235 | const response = await this.sendCommand('copy_to_clipboard'); 236 | if (!response.success || !response.content) { 237 | throw new Error('Failed to get clipboard content'); 238 | } 239 | return response.content as string; 240 | } 241 | 242 | /** 243 | * Set the clipboard content to the specified text. 244 | * @param {string} text - Text to set in clipboard 245 | * @returns {Promise<void>} 246 | */ 247 | async setClipboard(text: string): Promise<void> { 248 | await this.sendCommand('set_clipboard', { text }); 249 | } 250 | 251 | // File System Actions 252 | /** 253 | * Check if a file exists at the specified path. 
254 | * @param {string} path - Path to the file 255 | * @returns {Promise<boolean>} True if file exists, false otherwise 256 | */ 257 | async fileExists(path: string): Promise<boolean> { 258 | const response = await this.sendCommand('file_exists', { path }); 259 | return (response.exists as boolean) || false; 260 | } 261 | 262 | /** 263 | * Check if a directory exists at the specified path. 264 | * @param {string} path - Path to the directory 265 | * @returns {Promise<boolean>} True if directory exists, false otherwise 266 | */ 267 | async directoryExists(path: string): Promise<boolean> { 268 | const response = await this.sendCommand('directory_exists', { path }); 269 | return (response.exists as boolean) || false; 270 | } 271 | 272 | /** 273 | * List the contents of a directory. 274 | * @param {string} path - Path to the directory 275 | * @returns {Promise<string[]>} Array of file and directory names 276 | * @throws {Error} If unable to list directory 277 | */ 278 | async listDir(path: string): Promise<string[]> { 279 | const response = await this.sendCommand('list_dir', { path }); 280 | if (!response.success) { 281 | throw new Error((response.error as string) || 'Failed to list directory'); 282 | } 283 | return (response.files as string[]) || []; 284 | } 285 | 286 | /** 287 | * Get the size of a file in bytes. 288 | * @param {string} path - Path to the file 289 | * @returns {Promise<number>} File size in bytes 290 | * @throws {Error} If unable to get file size 291 | */ 292 | async getFileSize(path: string): Promise<number> { 293 | const response = await this.sendCommand('get_file_size', { path }); 294 | if (!response.success) { 295 | throw new Error((response.error as string) || 'Failed to get file size'); 296 | } 297 | return (response.size as number) || 0; 298 | } 299 | 300 | /** 301 | * Read file content in chunks for large files. 
302 | * @private 303 | * @param {string} path - Path to the file 304 | * @param {number} offset - Starting byte offset 305 | * @param {number} totalLength - Total number of bytes to read 306 | * @param {number} [chunkSize=1048576] - Size of each chunk in bytes 307 | * @returns {Promise<Buffer>} File content as Buffer 308 | * @throws {Error} If unable to read file chunk 309 | */ 310 | private async readBytesChunked( 311 | path: string, 312 | offset: number, 313 | totalLength: number, 314 | chunkSize: number = 1024 * 1024 315 | ): Promise<Buffer> { 316 | const chunks: Buffer[] = []; 317 | let currentOffset = offset; 318 | let remaining = totalLength; 319 | 320 | while (remaining > 0) { 321 | const readSize = Math.min(chunkSize, remaining); 322 | const response = await this.sendCommand('read_bytes', { 323 | path, 324 | offset: currentOffset, 325 | length: readSize, 326 | }); 327 | 328 | if (!response.success) { 329 | throw new Error( 330 | (response.error as string) || 'Failed to read file chunk' 331 | ); 332 | } 333 | 334 | const chunkData = Buffer.from(response.content_b64 as string, 'base64'); 335 | chunks.push(chunkData); 336 | 337 | currentOffset += readSize; 338 | remaining -= readSize; 339 | } 340 | 341 | return Buffer.concat(chunks); 342 | } 343 | 344 | /** 345 | * Write file content in chunks for large files. 
346 | * @private 347 | * @param {string} path - Path to the file 348 | * @param {Buffer} content - Content to write 349 | * @param {boolean} [append=false] - Whether to append to existing file 350 | * @param {number} [chunkSize=1048576] - Size of each chunk in bytes 351 | * @returns {Promise<void>} 352 | * @throws {Error} If unable to write file chunk 353 | */ 354 | private async writeBytesChunked( 355 | path: string, 356 | content: Buffer, 357 | append: boolean = false, 358 | chunkSize: number = 1024 * 1024 359 | ): Promise<void> { 360 | const totalSize = content.length; 361 | let currentOffset = 0; 362 | 363 | while (currentOffset < totalSize) { 364 | const chunkEnd = Math.min(currentOffset + chunkSize, totalSize); 365 | const chunkData = content.subarray(currentOffset, chunkEnd); 366 | 367 | // First chunk uses the original append flag, subsequent chunks always append 368 | const chunkAppend = currentOffset === 0 ? append : true; 369 | 370 | const response = await this.sendCommand('write_bytes', { 371 | path, 372 | content_b64: chunkData.toString('base64'), 373 | append: chunkAppend, 374 | }); 375 | 376 | if (!response.success) { 377 | throw new Error( 378 | (response.error as string) || 'Failed to write file chunk' 379 | ); 380 | } 381 | 382 | currentOffset = chunkEnd; 383 | } 384 | } 385 | 386 | /** 387 | * Read text from a file with specified encoding. 388 | * @param {string} path - Path to the file to read 389 | * @param {BufferEncoding} [encoding='utf8'] - Text encoding to use 390 | * @returns {Promise<string>} The decoded text content of the file 391 | */ 392 | async readText(path: string, encoding: BufferEncoding = 'utf8'): Promise<string> { 393 | const contentBytes = await this.readBytes(path); 394 | return contentBytes.toString(encoding); 395 | } 396 | 397 | /** 398 | * Write text to a file with specified encoding. 
399 | * @param {string} path - Path to the file to write 400 | * @param {string} content - Text content to write 401 | * @param {BufferEncoding} [encoding='utf8'] - Text encoding to use 402 | * @param {boolean} [append=false] - Whether to append to the file instead of overwriting 403 | * @returns {Promise<void>} 404 | */ 405 | async writeText( 406 | path: string, 407 | content: string, 408 | encoding: BufferEncoding = 'utf8', 409 | append: boolean = false 410 | ): Promise<void> { 411 | const contentBytes = Buffer.from(content, encoding); 412 | await this.writeBytes(path, contentBytes, append); 413 | } 414 | 415 | /** 416 | * Read bytes from a file, with optional offset and length. 417 | * @param {string} path - Path to the file 418 | * @param {number} [offset=0] - Starting byte offset 419 | * @param {number} [length] - Number of bytes to read (reads entire file if not specified) 420 | * @returns {Promise<Buffer>} File content as Buffer 421 | * @throws {Error} If unable to read file 422 | */ 423 | async readBytes(path: string, offset: number = 0, length?: number): Promise<Buffer> { 424 | // For large files, use chunked reading 425 | if (length === undefined) { 426 | // Get file size first to determine if we need chunking 427 | const fileSize = await this.getFileSize(path); 428 | // If file is larger than 5MB, read in chunks 429 | if (fileSize > 5 * 1024 * 1024) { 430 | const readLength = offset > 0 ? fileSize - offset : fileSize; 431 | return await this.readBytesChunked(path, offset, readLength); 432 | } 433 | } 434 | 435 | const response = await this.sendCommand('read_bytes', { 436 | path, 437 | offset, 438 | length, 439 | }); 440 | if (!response.success) { 441 | throw new Error((response.error as string) || 'Failed to read file'); 442 | } 443 | return Buffer.from(response.content_b64 as string, 'base64'); 444 | } 445 | 446 | /** 447 | * Write bytes to a file. 
448 | * @param {string} path - Path to the file 449 | * @param {Buffer} content - Content to write as Buffer 450 | * @param {boolean} [append=false] - Whether to append to existing file 451 | * @returns {Promise<void>} 452 | * @throws {Error} If unable to write file 453 | */ 454 | async writeBytes(path: string, content: Buffer, append: boolean = false): Promise<void> { 455 | // For large files, use chunked writing 456 | if (content.length > 5 * 1024 * 1024) { 457 | // 5MB threshold 458 | await this.writeBytesChunked(path, content, append); 459 | return; 460 | } 461 | 462 | const response = await this.sendCommand('write_bytes', { 463 | path, 464 | content_b64: content.toString('base64'), 465 | append, 466 | }); 467 | if (!response.success) { 468 | throw new Error((response.error as string) || 'Failed to write file'); 469 | } 470 | } 471 | 472 | /** 473 | * Delete a file at the specified path. 474 | * @param {string} path - Path to the file to delete 475 | * @returns {Promise<void>} 476 | * @throws {Error} If unable to delete file 477 | */ 478 | async deleteFile(path: string): Promise<void> { 479 | const response = await this.sendCommand('delete_file', { path }); 480 | if (!response.success) { 481 | throw new Error((response.error as string) || 'Failed to delete file'); 482 | } 483 | } 484 | 485 | /** 486 | * Create a directory at the specified path. 487 | * @param {string} path - Path where to create the directory 488 | * @returns {Promise<void>} 489 | * @throws {Error} If unable to create directory 490 | */ 491 | async createDir(path: string): Promise<void> { 492 | const response = await this.sendCommand('create_dir', { path }); 493 | if (!response.success) { 494 | throw new Error( 495 | (response.error as string) || 'Failed to create directory' 496 | ); 497 | } 498 | } 499 | 500 | /** 501 | * Delete a directory at the specified path. 
502 | * @param {string} path - Path to the directory to delete 503 | * @returns {Promise<void>} 504 | * @throws {Error} If unable to delete directory 505 | */ 506 | async deleteDir(path: string): Promise<void> { 507 | const response = await this.sendCommand('delete_dir', { path }); 508 | if (!response.success) { 509 | throw new Error( 510 | (response.error as string) || 'Failed to delete directory' 511 | ); 512 | } 513 | } 514 | 515 | /** 516 | * Execute a shell command and return stdout and stderr. 517 | * @param {string} command - Command to execute 518 | * @returns {Promise<[string, string]>} Tuple of [stdout, stderr] 519 | * @throws {Error} If command execution fails 520 | */ 521 | async runCommand(command: string): Promise<[string, string]> { 522 | const response = await this.sendCommand('run_command', { command }); 523 | if (!response.success) { 524 | throw new Error((response.error as string) || 'Failed to run command'); 525 | } 526 | return [ 527 | (response.stdout as string) || '', 528 | (response.stderr as string) || '', 529 | ]; 530 | } 531 | 532 | // Accessibility Actions 533 | /** 534 | * Get the accessibility tree of the current screen. 535 | * @returns {Promise<AccessibilityNode>} Root accessibility node 536 | * @throws {Error} If unable to get accessibility tree 537 | */ 538 | async getAccessibilityTree(): Promise<AccessibilityNode> { 539 | const response = await this.sendCommand('get_accessibility_tree'); 540 | if (!response.success) { 541 | throw new Error( 542 | (response.error as string) || 'Failed to get accessibility tree' 543 | ); 544 | } 545 | return response as unknown as AccessibilityNode; 546 | } 547 | 548 | /** 549 | * Convert coordinates to screen coordinates. 
550 | * @param {number} x - X coordinate to convert 551 | * @param {number} y - Y coordinate to convert 552 | * @returns {Promise<[number, number]>} Converted screen coordinates as [x, y] 553 | * @throws {Error} If coordinate conversion fails 554 | */ 555 | async toScreenCoordinates(x: number, y: number): Promise<[number, number]> { 556 | const response = await this.sendCommand('to_screen_coordinates', { x, y }); 557 | if (!response.success || !response.coordinates) { 558 | throw new Error('Failed to convert to screen coordinates'); 559 | } 560 | return response.coordinates as [number, number]; 561 | } 562 | 563 | /** 564 | * Convert coordinates to screenshot coordinates. 565 | * @param {number} x - X coordinate to convert 566 | * @param {number} y - Y coordinate to convert 567 | * @returns {Promise<[number, number]>} Converted screenshot coordinates as [x, y] 568 | * @throws {Error} If coordinate conversion fails 569 | */ 570 | async toScreenshotCoordinates( 571 | x: number, 572 | y: number 573 | ): Promise<[number, number]> { 574 | const response = await this.sendCommand('to_screenshot_coordinates', { 575 | x, 576 | y, 577 | }); 578 | if (!response.success || !response.coordinates) { 579 | throw new Error('Failed to convert to screenshot coordinates'); 580 | } 581 | return response.coordinates as [number, number]; 582 | } 583 | } 584 | ``` -------------------------------------------------------------------------------- /libs/lume/src/Virtualization/VMVirtualizationService.swift: -------------------------------------------------------------------------------- ```swift 1 | import Foundation 2 | import Virtualization 3 | 4 | /// Framework-agnostic VM configuration 5 | struct VMVirtualizationServiceContext { 6 | let cpuCount: Int 7 | let memorySize: UInt64 8 | let display: String 9 | let sharedDirectories: [SharedDirectory]? 10 | let mount: Path? 11 | let hardwareModel: Data? 12 | let machineIdentifier: Data? 
13 | let macAddress: String 14 | let diskPath: Path 15 | let nvramPath: Path 16 | let recoveryMode: Bool 17 | let usbMassStoragePaths: [Path]? 18 | } 19 | 20 | /// Protocol defining the interface for virtualization operations 21 | @MainActor 22 | protocol VMVirtualizationService { 23 | var state: VZVirtualMachine.State { get } 24 | func start() async throws 25 | func stop() async throws 26 | func pause() async throws 27 | func resume() async throws 28 | func getVirtualMachine() -> Any 29 | } 30 | 31 | /// Base implementation of VMVirtualizationService using VZVirtualMachine 32 | @MainActor 33 | class BaseVirtualizationService: VMVirtualizationService { 34 | let virtualMachine: VZVirtualMachine 35 | let recoveryMode: Bool // Store whether we should start in recovery mode 36 | 37 | var state: VZVirtualMachine.State { 38 | virtualMachine.state 39 | } 40 | 41 | init(virtualMachine: VZVirtualMachine, recoveryMode: Bool = false) { 42 | self.virtualMachine = virtualMachine 43 | self.recoveryMode = recoveryMode 44 | } 45 | 46 | func start() async throws { 47 | try await withCheckedThrowingContinuation { 48 | (continuation: CheckedContinuation<Void, Error>) in 49 | Task { @MainActor in 50 | if #available(macOS 13, *) { 51 | let startOptions = VZMacOSVirtualMachineStartOptions() 52 | startOptions.startUpFromMacOSRecovery = recoveryMode 53 | if recoveryMode { 54 | Logger.info("Starting VM in recovery mode") 55 | } 56 | virtualMachine.start(options: startOptions) { error in 57 | if let error = error { 58 | continuation.resume(throwing: error) 59 | } else { 60 | continuation.resume() 61 | } 62 | } 63 | } else { 64 | Logger.info("Starting VM in normal mode") 65 | virtualMachine.start { result in 66 | switch result { 67 | case .success: 68 | continuation.resume() 69 | case .failure(let error): 70 | continuation.resume(throwing: error) 71 | } 72 | } 73 | } 74 | } 75 | } 76 | } 77 | 78 | func stop() async throws { 79 | try await withCheckedThrowingContinuation { 80 | (continuation: 
CheckedContinuation<Void, Error>) in 81 | virtualMachine.stop { error in 82 | if let error = error { 83 | continuation.resume(throwing: error) 84 | } else { 85 | continuation.resume() 86 | } 87 | } 88 | } 89 | } 90 | 91 | func pause() async throws { 92 | try await withCheckedThrowingContinuation { 93 | (continuation: CheckedContinuation<Void, Error>) in 94 | virtualMachine.pause { result in /* fix: previously called start */ 95 | switch result { 96 | case .success: 97 | continuation.resume() 98 | case .failure(let error): 99 | continuation.resume(throwing: error) 100 | } 101 | } 102 | } 103 | } 104 | 105 | func resume() async throws { 106 | try await withCheckedThrowingContinuation { 107 | (continuation: CheckedContinuation<Void, Error>) in 108 | virtualMachine.resume { result in /* fix: previously called start */ 109 | switch result { 110 | case .success: 111 | continuation.resume() 112 | case .failure(let error): 113 | continuation.resume(throwing: error) 114 | } 115 | } 116 | } 117 | } 118 | 119 | func getVirtualMachine() -> Any { 120 | return virtualMachine 121 | } 122 | 123 | // Helper methods for creating common configurations 124 | static func createStorageDeviceConfiguration(diskPath: Path, readOnly: Bool = false) throws 125 | -> VZStorageDeviceConfiguration 126 | { 127 | return VZVirtioBlockDeviceConfiguration( 128 | attachment: try VZDiskImageStorageDeviceAttachment( 129 | url: diskPath.url, 130 | readOnly: readOnly, 131 | cachingMode: VZDiskImageCachingMode.automatic, 132 | synchronizationMode: VZDiskImageSynchronizationMode.fsync 133 | ) 134 | ) 135 | } 136 | 137 | static func createUSBMassStorageDeviceConfiguration(diskPath: Path, readOnly: Bool = false) 138 | throws 139 | -> VZStorageDeviceConfiguration 140 | { 141 | if #available(macOS 15.0, *) { 142 | return VZUSBMassStorageDeviceConfiguration( 143 | attachment: try VZDiskImageStorageDeviceAttachment( 144 | url: diskPath.url, 145 | readOnly: readOnly, 146 | cachingMode: VZDiskImageCachingMode.automatic, 147 | synchronizationMode: VZDiskImageSynchronizationMode.fsync 
148 | ) 149 | ) 150 | } else { 151 | // Fallback to normal storage device if USB mass storage not available 152 | return try createStorageDeviceConfiguration(diskPath: diskPath, readOnly: readOnly) 153 | } 154 | } 155 | 156 | static func createNetworkDeviceConfiguration(macAddress: String) throws 157 | -> VZNetworkDeviceConfiguration 158 | { 159 | let network = VZVirtioNetworkDeviceConfiguration() 160 | guard let vzMacAddress = VZMACAddress(string: macAddress) else { 161 | throw VMConfigError.invalidMachineIdentifier 162 | } 163 | network.attachment = VZNATNetworkDeviceAttachment() 164 | network.macAddress = vzMacAddress 165 | return network 166 | } 167 | 168 | static func createDirectorySharingDevices(sharedDirectories: [SharedDirectory]?) 169 | -> [VZDirectorySharingDeviceConfiguration] 170 | { 171 | return sharedDirectories?.map { sharedDir in 172 | let device = VZVirtioFileSystemDeviceConfiguration(tag: sharedDir.tag) 173 | let url = URL(fileURLWithPath: sharedDir.hostPath) 174 | device.share = VZSingleDirectoryShare( 175 | directory: VZSharedDirectory(url: url, readOnly: sharedDir.readOnly)) 176 | return device 177 | } ?? 
[] 178 | } 179 | } 180 | 181 | /// macOS-specific virtualization service 182 | @MainActor 183 | final class DarwinVirtualizationService: BaseVirtualizationService { 184 | static func createConfiguration(_ config: VMVirtualizationServiceContext) throws 185 | -> VZVirtualMachineConfiguration 186 | { 187 | let vzConfig = VZVirtualMachineConfiguration() 188 | vzConfig.cpuCount = config.cpuCount 189 | vzConfig.memorySize = config.memorySize 190 | 191 | // Platform configuration 192 | guard let machineIdentifier = config.machineIdentifier else { 193 | throw VMConfigError.emptyMachineIdentifier 194 | } 195 | 196 | guard let hardwareModel = config.hardwareModel else { 197 | throw VMConfigError.emptyHardwareModel 198 | } 199 | 200 | let platform = VZMacPlatformConfiguration() 201 | platform.auxiliaryStorage = VZMacAuxiliaryStorage(url: config.nvramPath.url) 202 | Logger.info("Pre-VZMacHardwareModel: hardwareModel=\(hardwareModel)") 203 | guard let vzHardwareModel = VZMacHardwareModel(dataRepresentation: hardwareModel) else { 204 | throw VMConfigError.invalidHardwareModel 205 | } 206 | platform.hardwareModel = vzHardwareModel 207 | guard 208 | let vzMachineIdentifier = VZMacMachineIdentifier(dataRepresentation: machineIdentifier) 209 | else { 210 | throw VMConfigError.invalidMachineIdentifier 211 | } 212 | platform.machineIdentifier = vzMachineIdentifier 213 | vzConfig.platform = platform 214 | vzConfig.bootLoader = VZMacOSBootLoader() 215 | 216 | // Graphics configuration 217 | let display = VMDisplayResolution(string: config.display)! 
218 | let graphics = VZMacGraphicsDeviceConfiguration() 219 | graphics.displays = [ 220 | VZMacGraphicsDisplayConfiguration( 221 | widthInPixels: display.width, 222 | heightInPixels: display.height, 223 | pixelsPerInch: 220 // Retina display density 224 | ) 225 | ] 226 | vzConfig.graphicsDevices = [graphics] 227 | 228 | // Common configurations 229 | vzConfig.keyboards = [VZUSBKeyboardConfiguration()] 230 | vzConfig.pointingDevices = [VZUSBScreenCoordinatePointingDeviceConfiguration()] 231 | var storageDevices = [try createStorageDeviceConfiguration(diskPath: config.diskPath)] 232 | if let mount = config.mount { 233 | storageDevices.append( 234 | try createStorageDeviceConfiguration(diskPath: mount, readOnly: true)) 235 | } 236 | // Add USB mass storage devices if specified 237 | if #available(macOS 15.0, *), let usbPaths = config.usbMassStoragePaths, !usbPaths.isEmpty { 238 | for usbPath in usbPaths { 239 | storageDevices.append( 240 | try createUSBMassStorageDeviceConfiguration(diskPath: usbPath, readOnly: true)) 241 | } 242 | } 243 | vzConfig.storageDevices = storageDevices 244 | vzConfig.networkDevices = [ 245 | try createNetworkDeviceConfiguration(macAddress: config.macAddress) 246 | ] 247 | vzConfig.memoryBalloonDevices = [VZVirtioTraditionalMemoryBalloonDeviceConfiguration()] 248 | vzConfig.entropyDevices = [VZVirtioEntropyDeviceConfiguration()] 249 | 250 | // Audio configuration 251 | let soundDeviceConfiguration = VZVirtioSoundDeviceConfiguration() 252 | let inputAudioStreamConfiguration = VZVirtioSoundDeviceInputStreamConfiguration() 253 | let outputAudioStreamConfiguration = VZVirtioSoundDeviceOutputStreamConfiguration() 254 | 255 | inputAudioStreamConfiguration.source = VZHostAudioInputStreamSource() 256 | outputAudioStreamConfiguration.sink = VZHostAudioOutputStreamSink() 257 | 258 | soundDeviceConfiguration.streams = [inputAudioStreamConfiguration, outputAudioStreamConfiguration] 259 | vzConfig.audioDevices = [soundDeviceConfiguration] 260 | 261 | // 
Clipboard sharing via Spice agent 262 | let spiceAgentConsoleDevice = VZVirtioConsoleDeviceConfiguration() 263 | let spiceAgentPort = VZVirtioConsolePortConfiguration() 264 | spiceAgentPort.name = VZSpiceAgentPortAttachment.spiceAgentPortName 265 | let spiceAgentPortAttachment = VZSpiceAgentPortAttachment() 266 | spiceAgentPortAttachment.sharesClipboard = true 267 | spiceAgentPort.attachment = spiceAgentPortAttachment 268 | spiceAgentConsoleDevice.ports[0] = spiceAgentPort 269 | vzConfig.consoleDevices.append(spiceAgentConsoleDevice) 270 | 271 | // Directory sharing 272 | let directorySharingDevices = createDirectorySharingDevices( 273 | sharedDirectories: config.sharedDirectories) 274 | if !directorySharingDevices.isEmpty { 275 | vzConfig.directorySharingDevices = directorySharingDevices 276 | } 277 | 278 | // USB Controller configuration 279 | if #available(macOS 15.0, *) { 280 | let usbControllerConfiguration = VZXHCIControllerConfiguration() 281 | vzConfig.usbControllers = [usbControllerConfiguration] 282 | } 283 | 284 | try vzConfig.validate() 285 | return vzConfig 286 | } 287 | 288 | static func generateMacAddress() -> String { 289 | VZMACAddress.randomLocallyAdministered().string 290 | } 291 | 292 | static func generateMachineIdentifier() -> Data { 293 | VZMacMachineIdentifier().dataRepresentation 294 | } 295 | 296 | func createAuxiliaryStorage(at path: Path, hardwareModel: Data) throws { 297 | guard let vzHardwareModel = VZMacHardwareModel(dataRepresentation: hardwareModel) else { 298 | throw VMConfigError.invalidHardwareModel 299 | } 300 | _ = try VZMacAuxiliaryStorage(creatingStorageAt: path.url, hardwareModel: vzHardwareModel) 301 | } 302 | 303 | init(configuration: VMVirtualizationServiceContext) throws { 304 | let vzConfig = try Self.createConfiguration(configuration) 305 | super.init( 306 | virtualMachine: VZVirtualMachine(configuration: vzConfig), 307 | recoveryMode: configuration.recoveryMode) 308 | } 309 | 310 | func installMacOS(imagePath: Path, 
progressHandler: (@Sendable (Double) -> Void)?) async throws 311 | { 312 | var observers: [NSKeyValueObservation] = [] // must hold observer references during installation to print process 313 | try await withCheckedThrowingContinuation { 314 | (continuation: CheckedContinuation<Void, Error>) in 315 | Task { 316 | let installer = VZMacOSInstaller( 317 | virtualMachine: virtualMachine, restoringFromImageAt: imagePath.url) 318 | Logger.info("Starting macOS installation") 319 | 320 | if let progressHandler = progressHandler { 321 | let observer = installer.progress.observe( 322 | \.fractionCompleted, options: [.initial, .new] 323 | ) { (progress, change) in 324 | if let newValue = change.newValue { 325 | progressHandler(newValue) 326 | } 327 | } 328 | observers.append(observer) 329 | } 330 | 331 | installer.install { result in 332 | switch result { 333 | case .success: 334 | continuation.resume() 335 | case .failure(let error): 336 | Logger.error("Failed to install, error=\(error))") 337 | continuation.resume(throwing: error) 338 | } 339 | } 340 | } 341 | } 342 | Logger.info("macOS installation finished") 343 | } 344 | } 345 | 346 | /// Linux-specific virtualization service 347 | @MainActor 348 | final class LinuxVirtualizationService: BaseVirtualizationService { 349 | static func createConfiguration(_ config: VMVirtualizationServiceContext) throws 350 | -> VZVirtualMachineConfiguration 351 | { 352 | let vzConfig = VZVirtualMachineConfiguration() 353 | vzConfig.cpuCount = config.cpuCount 354 | vzConfig.memorySize = config.memorySize 355 | 356 | // Platform configuration 357 | let platform = VZGenericPlatformConfiguration() 358 | if #available(macOS 15, *) { 359 | platform.isNestedVirtualizationEnabled = 360 | VZGenericPlatformConfiguration.isNestedVirtualizationSupported 361 | } 362 | vzConfig.platform = platform 363 | 364 | let bootLoader = VZEFIBootLoader() 365 | bootLoader.variableStore = VZEFIVariableStore(url: config.nvramPath.url) 366 | vzConfig.bootLoader = 
bootLoader 367 | 368 | // Graphics configuration 369 | let display = VMDisplayResolution(string: config.display)! 370 | let graphics = VZVirtioGraphicsDeviceConfiguration() 371 | graphics.scanouts = [ 372 | VZVirtioGraphicsScanoutConfiguration( 373 | widthInPixels: display.width, 374 | heightInPixels: display.height 375 | ) 376 | ] 377 | vzConfig.graphicsDevices = [graphics] 378 | 379 | // Common configurations 380 | vzConfig.keyboards = [VZUSBKeyboardConfiguration()] 381 | vzConfig.pointingDevices = [VZUSBScreenCoordinatePointingDeviceConfiguration()] 382 | var storageDevices = [try createStorageDeviceConfiguration(diskPath: config.diskPath)] 383 | if let mount = config.mount { 384 | storageDevices.append( 385 | try createStorageDeviceConfiguration(diskPath: mount, readOnly: true)) 386 | } 387 | // Add USB mass storage devices if specified 388 | if #available(macOS 15.0, *), let usbPaths = config.usbMassStoragePaths, !usbPaths.isEmpty { 389 | for usbPath in usbPaths { 390 | storageDevices.append( 391 | try createUSBMassStorageDeviceConfiguration(diskPath: usbPath, readOnly: true)) 392 | } 393 | } 394 | vzConfig.storageDevices = storageDevices 395 | vzConfig.networkDevices = [ 396 | try createNetworkDeviceConfiguration(macAddress: config.macAddress) 397 | ] 398 | vzConfig.memoryBalloonDevices = [VZVirtioTraditionalMemoryBalloonDeviceConfiguration()] 399 | vzConfig.entropyDevices = [VZVirtioEntropyDeviceConfiguration()] 400 | 401 | // Audio configuration 402 | let soundDeviceConfiguration = VZVirtioSoundDeviceConfiguration() 403 | let inputAudioStreamConfiguration = VZVirtioSoundDeviceInputStreamConfiguration() 404 | let outputAudioStreamConfiguration = VZVirtioSoundDeviceOutputStreamConfiguration() 405 | 406 | inputAudioStreamConfiguration.source = VZHostAudioInputStreamSource() 407 | outputAudioStreamConfiguration.sink = VZHostAudioOutputStreamSink() 408 | 409 | soundDeviceConfiguration.streams = [inputAudioStreamConfiguration, outputAudioStreamConfiguration] 410 
| vzConfig.audioDevices = [soundDeviceConfiguration] 411 | 412 | // Clipboard sharing via Spice agent 413 | let spiceAgentConsoleDevice = VZVirtioConsoleDeviceConfiguration() 414 | let spiceAgentPort = VZVirtioConsolePortConfiguration() 415 | spiceAgentPort.name = VZSpiceAgentPortAttachment.spiceAgentPortName 416 | let spiceAgentPortAttachment = VZSpiceAgentPortAttachment() 417 | spiceAgentPortAttachment.sharesClipboard = true 418 | spiceAgentPort.attachment = spiceAgentPortAttachment 419 | spiceAgentConsoleDevice.ports[0] = spiceAgentPort 420 | vzConfig.consoleDevices.append(spiceAgentConsoleDevice) 421 | 422 | // Directory sharing 423 | var directorySharingDevices = createDirectorySharingDevices( 424 | sharedDirectories: config.sharedDirectories) 425 | 426 | // Add Rosetta support if available 427 | if #available(macOS 13.0, *) { 428 | if VZLinuxRosettaDirectoryShare.availability == .installed { 429 | do { 430 | let rosettaShare = try VZLinuxRosettaDirectoryShare() 431 | let rosettaDevice = VZVirtioFileSystemDeviceConfiguration(tag: "rosetta") 432 | rosettaDevice.share = rosettaShare 433 | directorySharingDevices.append(rosettaDevice) 434 | Logger.info("Added Rosetta support to Linux VM") 435 | } catch { 436 | Logger.info("Failed to add Rosetta support: \(error.localizedDescription)") 437 | } 438 | } else { 439 | Logger.info("Rosetta not installed, skipping Rosetta support") 440 | } 441 | } 442 | 443 | if !directorySharingDevices.isEmpty { 444 | vzConfig.directorySharingDevices = directorySharingDevices 445 | } 446 | 447 | // USB Controller configuration 448 | if #available(macOS 15.0, *) { 449 | let usbControllerConfiguration = VZXHCIControllerConfiguration() 450 | vzConfig.usbControllers = [usbControllerConfiguration] 451 | } 452 | 453 | try vzConfig.validate() 454 | return vzConfig 455 | } 456 | 457 | func generateMacAddress() -> String { 458 | VZMACAddress.randomLocallyAdministered().string 459 | } 460 | 461 | func createNVRAM(at path: Path) throws { 462 | _ 
= try VZEFIVariableStore(creatingVariableStoreAt: path.url) 463 | } 464 | 465 | init(configuration: VMVirtualizationServiceContext) throws { 466 | let vzConfig = try Self.createConfiguration(configuration) 467 | super.init(virtualMachine: VZVirtualMachine(configuration: vzConfig)) 468 | } 469 | } 470 | ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/lume_api.py: -------------------------------------------------------------------------------- ```python 1 | """Shared API utilities for Lume and Lumier providers. 2 | 3 | This module contains shared functions for interacting with the Lume API, 4 | used by both the LumeProvider and LumierProvider classes. 5 | """ 6 | 7 | import logging 8 | import json 9 | import subprocess 10 | import urllib.parse 11 | from typing import Dict, List, Optional, Any 12 | 13 | # Setup logging 14 | logger = logging.getLogger(__name__) 15 | 16 | # Check if curl is available 17 | try: 18 | subprocess.run(["curl", "--version"], capture_output=True, check=True) 19 | HAS_CURL = True 20 | except (subprocess.SubprocessError, FileNotFoundError): 21 | HAS_CURL = False 22 | 23 | 24 | def lume_api_get( 25 | vm_name: str, 26 | host: str, 27 | port: int, 28 | storage: Optional[str] = None, 29 | debug: bool = False, 30 | verbose: bool = False 31 | ) -> Dict[str, Any]: 32 | """Use curl to get VM information from Lume API. 
33 | 34 | Args: 35 | vm_name: Name of the VM to get info for 36 | host: API host 37 | port: API port 38 | storage: Storage path for the VM 39 | debug: Whether to show debug output 40 | verbose: Enable verbose logging 41 | 42 | Returns: 43 | Dictionary with VM status information parsed from JSON response 44 | """ 45 | # URL encode the storage parameter for the query 46 | encoded_storage = "" 47 | storage_param = "" 48 | 49 | if storage: 50 | # First encode the storage path properly 51 | encoded_storage = urllib.parse.quote(storage, safe='') 52 | storage_param = f"?storage={encoded_storage}" 53 | 54 | # Construct API URL with encoded storage parameter if needed 55 | api_url = f"http://{host}:{port}/lume/vms/{vm_name}{storage_param}" 56 | 57 | # Construct the curl command with increased timeouts for more reliability 58 | # --connect-timeout: Time to establish connection (15 seconds) 59 | # --max-time: Maximum time for the whole operation (20 seconds) 60 | # -f: Fail silently (no output at all) on server errors 61 | # Add single quotes around URL to ensure special characters are handled correctly 62 | cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", f"'{api_url}'"] 63 | 64 | # For logging and display, show the properly escaped URL 65 | display_cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", api_url] 66 | 67 | # Only print the curl command when debug is enabled 68 | display_curl_string = ' '.join(display_cmd) 69 | logger.debug(f"Executing API request: {display_curl_string}") 70 | 71 | # Execute the command - for execution we need to use shell=True to handle URLs with special characters 72 | try: 73 | # Use a single string with shell=True for proper URL handling 74 | shell_cmd = ' '.join(cmd) 75 | result = subprocess.run(shell_cmd, shell=True, capture_output=True, text=True) 76 | 77 | # Handle curl exit codes 78 | if result.returncode != 0: 79 | curl_error = "Unknown error" 80 | 81 | # Map common curl error codes to 
helpful messages 82 | if result.returncode == 7: 83 | curl_error = "Failed to connect to the API server - it might still be starting up" 84 | elif result.returncode == 22: 85 | curl_error = "HTTP error returned from API server" 86 | elif result.returncode == 28: 87 | curl_error = "Operation timeout - the API server is taking too long to respond" 88 | elif result.returncode == 52: 89 | curl_error = "Empty reply from server - the API server is starting but not fully ready yet" 90 | elif result.returncode == 56: 91 | curl_error = "Network problem during data transfer - check container networking" 92 | 93 | # Only log at debug level to reduce noise during retries 94 | logger.debug(f"API request failed with code {result.returncode}: {curl_error}") 95 | 96 | # Return a more useful error message 97 | return { 98 | "error": f"API request failed: {curl_error}", 99 | "curl_code": result.returncode, 100 | "vm_name": vm_name, 101 | "status": "unknown" # We don't know the actual status due to API error 102 | } 103 | 104 | # Try to parse the response as JSON 105 | if result.stdout and result.stdout.strip(): 106 | try: 107 | vm_status = json.loads(result.stdout) 108 | if debug or verbose: 109 | logger.info(f"Successfully parsed VM status: {vm_status.get('status', 'unknown')}") 110 | return vm_status 111 | except json.JSONDecodeError as e: 112 | # Return the raw response if it's not valid JSON 113 | logger.warning(f"Invalid JSON response: {e}") 114 | if "Virtual machine not found" in result.stdout: 115 | return {"status": "not_found", "message": "VM not found in Lume API"} 116 | 117 | return {"error": f"Invalid JSON response: {result.stdout[:100]}...", "status": "unknown"} 118 | else: 119 | return {"error": "Empty response from API", "status": "unknown"} 120 | except subprocess.SubprocessError as e: 121 | logger.error(f"Failed to execute API request: {e}") 122 | return {"error": f"Failed to execute API request: {str(e)}", "status": "unknown"} 123 | 124 | 125 | def lume_api_run( 
126 | vm_name: str, 127 | host: str, 128 | port: int, 129 | run_opts: Dict[str, Any], 130 | storage: Optional[str] = None, 131 | debug: bool = False, 132 | verbose: bool = False 133 | ) -> Dict[str, Any]: 134 | """Run a VM using curl. 135 | 136 | Args: 137 | vm_name: Name of the VM to run 138 | host: API host 139 | port: API port 140 | run_opts: Dictionary of run options 141 | storage: Storage path for the VM 142 | debug: Whether to show debug output 143 | verbose: Enable verbose logging 144 | 145 | Returns: 146 | Dictionary with API response or error information 147 | """ 148 | # Construct API URL 149 | api_url = f"http://{host}:{port}/lume/vms/{vm_name}/run" 150 | 151 | # Prepare JSON payload with required parameters 152 | payload = {} 153 | 154 | # Add CPU cores if specified 155 | if "cpu" in run_opts: 156 | payload["cpu"] = run_opts["cpu"] 157 | 158 | # Add memory if specified 159 | if "memory" in run_opts: 160 | payload["memory"] = run_opts["memory"] 161 | 162 | # Add storage parameter if specified 163 | if storage: 164 | payload["storage"] = storage 165 | elif "storage" in run_opts: 166 | payload["storage"] = run_opts["storage"] 167 | 168 | # Add shared directories if specified 169 | if "shared_directories" in run_opts and run_opts["shared_directories"]: 170 | payload["sharedDirectories"] = run_opts["shared_directories"] 171 | 172 | # Log the payload for debugging 173 | logger.debug(f"API payload: {json.dumps(payload, indent=2)}") 174 | 175 | # Construct the curl command 176 | cmd = [ 177 | "curl", "--connect-timeout", "30", "--max-time", "30", 178 | "-s", "-X", "POST", "-H", "Content-Type: application/json", 179 | "-d", json.dumps(payload), 180 | api_url 181 | ] 182 | 183 | # Execute the command 184 | try: 185 | result = subprocess.run(cmd, capture_output=True, text=True) 186 | 187 | if result.returncode != 0: 188 | logger.warning(f"API request failed with code {result.returncode}: {result.stderr}") 189 | return {"error": f"API request failed: 
{result.stderr}"} 190 | 191 | # Try to parse the response as JSON 192 | if result.stdout and result.stdout.strip(): 193 | try: 194 | response = json.loads(result.stdout) 195 | return response 196 | except json.JSONDecodeError: 197 | # Return the raw response if it's not valid JSON 198 | return {"success": True, "message": "VM started successfully", "raw_response": result.stdout} 199 | else: 200 | return {"success": True, "message": "VM started successfully"} 201 | except subprocess.SubprocessError as e: 202 | logger.error(f"Failed to execute run request: {e}") 203 | return {"error": f"Failed to execute run request: {str(e)}"} 204 | 205 | 206 | def lume_api_stop( 207 | vm_name: str, 208 | host: str, 209 | port: int, 210 | storage: Optional[str] = None, 211 | debug: bool = False, 212 | verbose: bool = False 213 | ) -> Dict[str, Any]: 214 | """Stop a VM using curl. 215 | 216 | Args: 217 | vm_name: Name of the VM to stop 218 | host: API host 219 | port: API port 220 | storage: Storage path for the VM 221 | debug: Whether to show debug output 222 | verbose: Enable verbose logging 223 | 224 | Returns: 225 | Dictionary with API response or error information 226 | """ 227 | # Construct API URL 228 | api_url = f"http://{host}:{port}/lume/vms/{vm_name}/stop" 229 | 230 | # Prepare JSON payload with required parameters 231 | payload = {} 232 | 233 | # Add storage path if specified 234 | if storage: 235 | payload["storage"] = storage 236 | 237 | # Construct the curl command 238 | cmd = [ 239 | "curl", "--connect-timeout", "15", "--max-time", "20", 240 | "-s", "-X", "POST", "-H", "Content-Type: application/json", 241 | "-d", json.dumps(payload), 242 | api_url 243 | ] 244 | 245 | # Execute the command 246 | try: 247 | if debug or verbose: 248 | logger.info(f"Executing: {' '.join(cmd)}") 249 | 250 | result = subprocess.run(cmd, capture_output=True, text=True) 251 | 252 | if result.returncode != 0: 253 | logger.warning(f"API request failed with code {result.returncode}: 
{result.stderr}") 254 | return {"error": f"API request failed: {result.stderr}"} 255 | 256 | # Try to parse the response as JSON 257 | if result.stdout and result.stdout.strip(): 258 | try: 259 | response = json.loads(result.stdout) 260 | return response 261 | except json.JSONDecodeError: 262 | # Return the raw response if it's not valid JSON 263 | return {"success": True, "message": "VM stopped successfully", "raw_response": result.stdout} 264 | else: 265 | return {"success": True, "message": "VM stopped successfully"} 266 | except subprocess.SubprocessError as e: 267 | logger.error(f"Failed to execute stop request: {e}") 268 | return {"error": f"Failed to execute stop request: {str(e)}"} 269 | 270 | 271 | def lume_api_update( 272 | vm_name: str, 273 | host: str, 274 | port: int, 275 | update_opts: Dict[str, Any], 276 | storage: Optional[str] = None, 277 | debug: bool = False, 278 | verbose: bool = False 279 | ) -> Dict[str, Any]: 280 | """Update VM settings using curl. 281 | 282 | Args: 283 | vm_name: Name of the VM to update 284 | host: API host 285 | port: API port 286 | update_opts: Dictionary of update options 287 | storage: Storage path for the VM 288 | debug: Whether to show debug output 289 | verbose: Enable verbose logging 290 | 291 | Returns: 292 | Dictionary with API response or error information 293 | """ 294 | # Construct API URL 295 | api_url = f"http://{host}:{port}/lume/vms/{vm_name}/update" 296 | 297 | # Prepare JSON payload with required parameters 298 | payload = {} 299 | 300 | # Add CPU cores if specified 301 | if "cpu" in update_opts: 302 | payload["cpu"] = update_opts["cpu"] 303 | 304 | # Add memory if specified 305 | if "memory" in update_opts: 306 | payload["memory"] = update_opts["memory"] 307 | 308 | # Add storage path if specified 309 | if storage: 310 | payload["storage"] = storage 311 | 312 | # Construct the curl command 313 | cmd = [ 314 | "curl", "--connect-timeout", "15", "--max-time", "20", 315 | "-s", "-X", "POST", "-H", 
"Content-Type: application/json", 316 | "-d", json.dumps(payload), 317 | api_url 318 | ] 319 | 320 | # Execute the command 321 | try: 322 | if debug: 323 | logger.info(f"Executing: {' '.join(cmd)}") 324 | 325 | result = subprocess.run(cmd, capture_output=True, text=True) 326 | 327 | if result.returncode != 0: 328 | logger.warning(f"API request failed with code {result.returncode}: {result.stderr}") 329 | return {"error": f"API request failed: {result.stderr}"} 330 | 331 | # Try to parse the response as JSON 332 | if result.stdout and result.stdout.strip(): 333 | try: 334 | response = json.loads(result.stdout) 335 | return response 336 | except json.JSONDecodeError: 337 | # Return the raw response if it's not valid JSON 338 | return {"success": True, "message": "VM updated successfully", "raw_response": result.stdout} 339 | else: 340 | return {"success": True, "message": "VM updated successfully"} 341 | except subprocess.SubprocessError as e: 342 | logger.error(f"Failed to execute update request: {e}") 343 | return {"error": f"Failed to execute update request: {str(e)}"} 344 | 345 | 346 | def lume_api_pull( 347 | image: str, 348 | name: str, 349 | host: str, 350 | port: int, 351 | storage: Optional[str] = None, 352 | registry: str = "ghcr.io", 353 | organization: str = "trycua", 354 | debug: bool = False, 355 | verbose: bool = False 356 | ) -> Dict[str, Any]: 357 | """Pull a VM image from a registry using curl. 
358 | 359 | Args: 360 | image: Name/tag of the image to pull 361 | name: Name to give the VM after pulling 362 | host: API host 363 | port: API port 364 | storage: Storage path for the VM 365 | registry: Registry to pull from (default: ghcr.io) 366 | organization: Organization in registry (default: trycua) 367 | debug: Whether to show debug output 368 | verbose: Enable verbose logging 369 | 370 | Returns: 371 | Dictionary with pull status and information 372 | """ 373 | # Prepare pull request payload 374 | pull_payload = { 375 | "image": image, # Use provided image name 376 | "name": name, # Always use name as the target VM name 377 | "registry": registry, 378 | "organization": organization 379 | } 380 | 381 | if storage: 382 | pull_payload["storage"] = storage 383 | 384 | # Construct pull command with proper JSON payload 385 | pull_cmd = [ 386 | "curl" 387 | ] 388 | 389 | if not verbose: 390 | pull_cmd.append("-s") 391 | 392 | pull_cmd.extend([ 393 | "-X", "POST", 394 | "-H", "Content-Type: application/json", 395 | "-d", json.dumps(pull_payload), 396 | f"http://{host}:{port}/lume/pull" 397 | ]) 398 | 399 | logger.debug(f"Executing API request: {' '.join(pull_cmd)}") 400 | 401 | try: 402 | # Execute pull command 403 | result = subprocess.run(pull_cmd, capture_output=True, text=True) 404 | 405 | if result.returncode != 0: 406 | error_msg = f"Failed to pull VM {name}: {result.stderr}" 407 | logger.error(error_msg) 408 | return {"error": error_msg} 409 | 410 | try: 411 | response = json.loads(result.stdout) 412 | logger.info(f"Successfully initiated pull for VM {name}") 413 | return response 414 | except json.JSONDecodeError: 415 | if result.stdout: 416 | logger.info(f"Pull response: {result.stdout}") 417 | return {"success": True, "message": f"Successfully initiated pull for VM {name}"} 418 | 419 | except subprocess.SubprocessError as e: 420 | error_msg = f"Failed to execute pull command: {str(e)}" 421 | logger.error(error_msg) 422 | return {"error": error_msg} 423 | 
def lume_api_delete(
    vm_name: str,
    host: str,
    port: int,
    storage: Optional[str] = None,
    debug: bool = False,
    verbose: bool = False
) -> Dict[str, Any]:
    """Delete a VM using curl.

    Args:
        vm_name: Name of the VM to delete
        host: API host
        port: API port
        storage: Storage path for the VM
        debug: Whether to show debug output
        verbose: Enable verbose logging

    Returns:
        Dictionary with API response or error information
    """
    # URL encode the storage parameter for the query string if one was given.
    storage_param = ""
    if storage:
        encoded_storage = urllib.parse.quote(storage, safe='')
        storage_param = f"?storage={encoded_storage}"

    api_url = f"http://{host}:{port}/lume/vms/{vm_name}{storage_param}"

    # Much longer timeouts, matching the shell implementation.
    # NOTE: we pass the argument list directly to subprocess.run (shell=False).
    # The previous implementation joined the command into a string with
    # hand-placed single quotes and ran it through the shell, which is both
    # unnecessary (list form already handles special characters in the URL)
    # and a shell-injection hazard if vm_name/storage ever contain metacharacters.
    cmd = ["curl", "--connect-timeout", "6000", "--max-time", "5000",
           "-s", "-X", "DELETE", api_url]

    # Only surfaced when debug logging is enabled.
    logger.debug(f"Executing API request: {' '.join(cmd)}")

    # Map common curl exit codes to helpful messages.
    _CURL_ERRORS = {
        7: "Failed to connect to the API server - it might still be starting up",
        22: "HTTP error returned from API server",
        28: "Operation timeout - the API server is taking too long to respond",
        52: "Empty reply from server - the API server is starting but not fully ready yet",
        56: "Network problem during data transfer - check container networking",
    }

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            curl_error = _CURL_ERRORS.get(result.returncode, "Unknown error")

            # Only log at debug level to reduce noise during retries.
            logger.debug(f"API request failed with code {result.returncode}: {curl_error}")

            return {
                "error": f"API request failed: {curl_error}",
                "curl_code": result.returncode,
                "vm_name": vm_name,
                "storage": storage
            }

        # Try to parse the response as JSON; fall back to a synthesized
        # success payload when the server returns non-JSON (or empty) output.
        if result.stdout and result.stdout.strip():
            try:
                return json.loads(result.stdout)
            except json.JSONDecodeError:
                return {"success": True, "message": "VM deleted successfully",
                        "raw_response": result.stdout}
        return {"success": True, "message": "VM deleted successfully"}
    except subprocess.SubprocessError as e:
        logger.error(f"Failed to execute delete request: {e}")
        return {"error": f"Failed to execute delete request: {str(e)}"}


def parse_memory(memory_str: str) -> int:
    """Parse a memory size string (or int) into an MB integer.

    Examples:
        "8GB" -> 8192
        "1024MB" -> 1024
        "512" -> 512
        "1TB" -> 1048576

    Args:
        memory_str: Memory size as a string with an optional unit suffix
            (GB/G, MB/M, TB/T, or none meaning MB), or already an int of MB.

    Returns:
        Memory value in MB. Unparseable input falls back to 8192 (8GB).
    """
    # Already a plain MB integer — pass through unchanged.
    if isinstance(memory_str, int):
        return memory_str

    if isinstance(memory_str, str):
        import re
        # Allow surrounding/interior whitespace such as "8 GB".
        match = re.match(r"(\d+)\s*([A-Za-z]*)", memory_str.strip())
        if match:
            value_text, unit = match.groups()
            value = int(value_text)
            unit = unit.upper()

            if unit in ("GB", "G"):
                return value * 1024
            elif unit in ("MB", "M", ""):
                return value
            elif unit in ("TB", "T"):
                return value * 1024 * 1024

    # Default fallback for anything we could not interpret.
    logger.warning(f"Could not parse memory string '{memory_str}', using 8GB default")
    return 8192  # Default to 8GB
518 | 519 | Examples: 520 | "8GB" -> 8192 521 | "1024MB" -> 1024 522 | "512" -> 512 523 | 524 | Returns: 525 | Memory value in MB 526 | """ 527 | if isinstance(memory_str, int): 528 | return memory_str 529 | 530 | if isinstance(memory_str, str): 531 | # Extract number and unit 532 | import re 533 | match = re.match(r"(\d+)([A-Za-z]*)", memory_str) 534 | if match: 535 | value, unit = match.groups() 536 | value = int(value) 537 | unit = unit.upper() 538 | 539 | if unit == "GB" or unit == "G": 540 | return value * 1024 541 | elif unit == "MB" or unit == "M" or unit == "": 542 | return value 543 | 544 | # Default fallback 545 | logger.warning(f"Could not parse memory string '{memory_str}', using 8GB default") 546 | return 8192 # Default to 8GB 547 | ``` -------------------------------------------------------------------------------- /libs/python/pylume/pylume/server.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import time 3 | import asyncio 4 | import subprocess 5 | import tempfile 6 | import logging 7 | import socket 8 | from typing import Optional 9 | import sys 10 | from .exceptions import LumeConnectionError 11 | import signal 12 | import json 13 | import shlex 14 | import random 15 | from logging import getLogger 16 | 17 | 18 | class LumeServer: 19 | def __init__( 20 | self, 21 | debug: bool = False, 22 | server_start_timeout: int = 60, 23 | port: Optional[int] = None, 24 | use_existing_server: bool = False, 25 | host: str = "localhost", 26 | ): 27 | """Initialize the LumeServer. 
28 | 29 | Args: 30 | debug: Enable debug logging 31 | server_start_timeout: Timeout in seconds to wait for server to start 32 | port: Specific port to use for the server 33 | use_existing_server: If True, will try to connect to an existing server 34 | instead of starting a new one 35 | host: Host to use for connections (e.g., "localhost", "127.0.0.1", "host.docker.internal") 36 | """ 37 | self.debug = debug 38 | self.server_start_timeout = server_start_timeout 39 | self.server_process = None 40 | self.output_file = None 41 | self.requested_port = port 42 | self.port = None 43 | self.base_url = None 44 | self.use_existing_server = use_existing_server 45 | self.host = host 46 | 47 | # Configure logging 48 | self.logger = getLogger("pylume.server") 49 | if not self.logger.handlers: 50 | handler = logging.StreamHandler() 51 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 52 | handler.setFormatter(formatter) 53 | self.logger.addHandler(handler) 54 | self.logger.setLevel(logging.DEBUG if debug else logging.INFO) 55 | 56 | self.logger.debug(f"Server initialized with host: {self.host}") 57 | 58 | def _check_port_available(self, port: int) -> bool: 59 | """Check if a port is available.""" 60 | try: 61 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 62 | s.settimeout(0.5) 63 | result = s.connect_ex(("127.0.0.1", port)) 64 | if result == 0: # Port is in use on localhost 65 | return False 66 | except: 67 | pass 68 | 69 | # Check the specified host (e.g., "host.docker.internal") if it's not a localhost alias 70 | if self.host not in ["localhost", "127.0.0.1"]: 71 | try: 72 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 73 | s.settimeout(0.5) 74 | result = s.connect_ex((self.host, port)) 75 | if result == 0: # Port is in use on host 76 | return False 77 | except: 78 | pass 79 | 80 | return True 81 | 82 | def _get_server_port(self) -> int: 83 | """Get an available port for the server.""" 84 | # Use requested 
port if specified 85 | if self.requested_port is not None: 86 | if not self._check_port_available(self.requested_port): 87 | raise RuntimeError(f"Requested port {self.requested_port} is not available") 88 | return self.requested_port 89 | 90 | # Find a free port 91 | for _ in range(10): # Try up to 10 times 92 | port = random.randint(49152, 65535) 93 | if self._check_port_available(port): 94 | return port 95 | 96 | raise RuntimeError("Could not find an available port") 97 | 98 | async def _ensure_server_running(self) -> None: 99 | """Ensure the lume server is running, start it if it's not.""" 100 | try: 101 | self.logger.debug("Checking if lume server is running...") 102 | # Try to connect to the server with a short timeout 103 | cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "5", f"{self.base_url}/vms"] 104 | process = await asyncio.create_subprocess_exec( 105 | *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE 106 | ) 107 | stdout, stderr = await process.communicate() 108 | 109 | if process.returncode == 0: 110 | response = stdout.decode() 111 | status_code = int(response[-3:]) 112 | if status_code == 200: 113 | self.logger.debug("PyLume server is running") 114 | return 115 | 116 | self.logger.debug("PyLume server not running, attempting to start it") 117 | # Server not running, try to start it 118 | lume_path = os.path.join(os.path.dirname(__file__), "lume") 119 | if not os.path.exists(lume_path): 120 | raise RuntimeError(f"Could not find lume binary at {lume_path}") 121 | 122 | # Make sure the file is executable 123 | os.chmod(lume_path, 0o755) 124 | 125 | # Create a temporary file for server output 126 | self.output_file = tempfile.NamedTemporaryFile(mode="w+", delete=False) 127 | self.logger.debug(f"Using temporary file for server output: {self.output_file.name}") 128 | 129 | # Start the server 130 | self.logger.debug(f"Starting lume server with: {lume_path} serve --port {self.port}") 131 | 132 | # Start server in background using subprocess.Popen 133 | 
try: 134 | self.server_process = subprocess.Popen( 135 | [lume_path, "serve", "--port", str(self.port)], 136 | stdout=self.output_file, 137 | stderr=self.output_file, 138 | cwd=os.path.dirname(lume_path), 139 | start_new_session=True, # Run in new session to avoid blocking 140 | ) 141 | except Exception as e: 142 | self.output_file.close() 143 | os.unlink(self.output_file.name) 144 | raise RuntimeError(f"Failed to start lume server process: {str(e)}") 145 | 146 | # Wait for server to start 147 | self.logger.debug( 148 | f"Waiting up to {self.server_start_timeout} seconds for server to start..." 149 | ) 150 | start_time = time.time() 151 | server_ready = False 152 | last_size = 0 153 | 154 | while time.time() - start_time < self.server_start_timeout: 155 | if self.server_process.poll() is not None: 156 | # Process has terminated 157 | self.output_file.seek(0) 158 | output = self.output_file.read() 159 | self.output_file.close() 160 | os.unlink(self.output_file.name) 161 | error_msg = ( 162 | f"Server process terminated unexpectedly.\n" 163 | f"Exit code: {self.server_process.returncode}\n" 164 | f"Output: {output}" 165 | ) 166 | raise RuntimeError(error_msg) 167 | 168 | # Check output file for server ready message 169 | self.output_file.seek(0, os.SEEK_END) 170 | size = self.output_file.tell() 171 | if size > last_size: # Only read if there's new content 172 | self.output_file.seek(last_size) 173 | new_output = self.output_file.read() 174 | if new_output.strip(): # Only log non-empty output 175 | self.logger.debug(f"Server output: {new_output.strip()}") 176 | last_size = size 177 | 178 | if "Server started" in new_output: 179 | server_ready = True 180 | self.logger.debug("Server startup detected") 181 | break 182 | 183 | # Try to connect to the server periodically 184 | try: 185 | cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "5", f"{self.base_url}/vms"] 186 | process = await asyncio.create_subprocess_exec( 187 | *cmd, stdout=subprocess.PIPE, 
stderr=subprocess.PIPE 188 | ) 189 | stdout, stderr = await process.communicate() 190 | 191 | if process.returncode == 0: 192 | response = stdout.decode() 193 | status_code = int(response[-3:]) 194 | if status_code == 200: 195 | server_ready = True 196 | self.logger.debug("Server is responding to requests") 197 | break 198 | except: 199 | pass # Server not ready yet 200 | 201 | await asyncio.sleep(1.0) 202 | 203 | if not server_ready: 204 | # Cleanup if server didn't start 205 | if self.server_process: 206 | self.server_process.terminate() 207 | try: 208 | self.server_process.wait(timeout=5) 209 | except subprocess.TimeoutExpired: 210 | self.server_process.kill() 211 | self.output_file.close() 212 | os.unlink(self.output_file.name) 213 | raise RuntimeError( 214 | f"Failed to start lume server after {self.server_start_timeout} seconds. " 215 | "Check the debug output for more details." 216 | ) 217 | 218 | # Give the server a moment to fully initialize 219 | await asyncio.sleep(2.0) 220 | 221 | # Verify server is responding 222 | try: 223 | cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "10", f"{self.base_url}/vms"] 224 | process = await asyncio.create_subprocess_exec( 225 | *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE 226 | ) 227 | stdout, stderr = await process.communicate() 228 | 229 | if process.returncode != 0: 230 | raise RuntimeError(f"Curl command failed: {stderr.decode()}") 231 | 232 | response = stdout.decode() 233 | status_code = int(response[-3:]) 234 | 235 | if status_code != 200: 236 | raise RuntimeError(f"Server returned status code {status_code}") 237 | 238 | self.logger.debug("PyLume server started successfully") 239 | except Exception as e: 240 | self.logger.debug(f"Server verification failed: {str(e)}") 241 | if self.server_process: 242 | self.server_process.terminate() 243 | try: 244 | self.server_process.wait(timeout=5) 245 | except subprocess.TimeoutExpired: 246 | self.server_process.kill() 247 | self.output_file.close() 248 | 
os.unlink(self.output_file.name) 249 | raise RuntimeError(f"Server started but is not responding: {str(e)}") 250 | 251 | self.logger.debug("Server startup completed successfully") 252 | 253 | except Exception as e: 254 | raise RuntimeError(f"Failed to start lume server: {str(e)}") 255 | 256 | async def _start_server(self) -> None: 257 | """Start the lume server using the lume executable.""" 258 | self.logger.debug("Starting PyLume server") 259 | 260 | # Get absolute path to lume executable in the same directory as this file 261 | lume_path = os.path.join(os.path.dirname(__file__), "lume") 262 | if not os.path.exists(lume_path): 263 | raise RuntimeError(f"Could not find lume binary at {lume_path}") 264 | 265 | try: 266 | # Make executable 267 | os.chmod(lume_path, 0o755) 268 | 269 | # Get and validate port 270 | self.port = self._get_server_port() 271 | self.base_url = f"http://{self.host}:{self.port}/lume" 272 | 273 | # Set up output handling 274 | self.output_file = tempfile.NamedTemporaryFile(mode="w+", delete=False) 275 | 276 | # Start the server process with the lume executable 277 | env = os.environ.copy() 278 | env["RUST_BACKTRACE"] = "1" # Enable backtrace for better error reporting 279 | 280 | # Specify the host to bind to (0.0.0.0 to allow external connections) 281 | self.server_process = subprocess.Popen( 282 | [lume_path, "serve", "--port", str(self.port)], 283 | stdout=self.output_file, 284 | stderr=subprocess.STDOUT, 285 | cwd=os.path.dirname(lume_path), # Run from same directory as executable 286 | env=env, 287 | ) 288 | 289 | # Wait for server to initialize 290 | await asyncio.sleep(2) 291 | await self._wait_for_server() 292 | 293 | except Exception as e: 294 | await self._cleanup() 295 | raise RuntimeError(f"Failed to start lume server process: {str(e)}") 296 | 297 | async def _tail_log(self) -> None: 298 | """Read and display server log output in debug mode.""" 299 | while True: 300 | try: 301 | self.output_file.seek(0, os.SEEK_END) # type: 
ignore[attr-defined] 302 | line = self.output_file.readline() # type: ignore[attr-defined] 303 | if line: 304 | line = line.strip() 305 | if line: 306 | print(f"SERVER: {line}") 307 | if self.server_process.poll() is not None: # type: ignore[attr-defined] 308 | print("Server process ended") 309 | break 310 | await asyncio.sleep(0.1) 311 | except Exception as e: 312 | print(f"Error reading log: {e}") 313 | await asyncio.sleep(0.1) 314 | 315 | async def _wait_for_server(self) -> None: 316 | """Wait for server to start and become responsive with increased timeout.""" 317 | start_time = time.time() 318 | while time.time() - start_time < self.server_start_timeout: 319 | if self.server_process.poll() is not None: # type: ignore[attr-defined] 320 | error_msg = await self._get_error_output() 321 | await self._cleanup() 322 | raise RuntimeError(error_msg) 323 | 324 | try: 325 | await self._verify_server() 326 | self.logger.debug("Server is now responsive") 327 | return 328 | except Exception as e: 329 | self.logger.debug(f"Server not ready yet: {str(e)}") 330 | await asyncio.sleep(1.0) 331 | 332 | await self._cleanup() 333 | raise RuntimeError(f"Server failed to start after {self.server_start_timeout} seconds") 334 | 335 | async def _verify_server(self) -> None: 336 | """Verify server is responding to requests.""" 337 | try: 338 | cmd = [ 339 | "curl", 340 | "-s", 341 | "-w", 342 | "%{http_code}", 343 | "-m", 344 | "10", 345 | f"http://{self.host}:{self.port}/lume/vms", 346 | ] 347 | process = await asyncio.create_subprocess_exec( 348 | *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE 349 | ) 350 | stdout, stderr = await process.communicate() 351 | 352 | if process.returncode != 0: 353 | raise RuntimeError(f"Curl command failed: {stderr.decode()}") 354 | 355 | response = stdout.decode() 356 | status_code = int(response[-3:]) 357 | 358 | if status_code != 200: 359 | raise RuntimeError(f"Server returned status code {status_code}") 360 | 361 | self.logger.debug("PyLume 
server started successfully") 362 | except Exception as e: 363 | raise RuntimeError(f"Server not responding: {str(e)}") 364 | 365 | async def _get_error_output(self) -> str: 366 | """Get error output from the server process.""" 367 | if not self.output_file: 368 | return "No output available" 369 | self.output_file.seek(0) 370 | output = self.output_file.read() 371 | return ( 372 | f"Server process terminated unexpectedly.\n" 373 | f"Exit code: {self.server_process.returncode}\n" # type: ignore[attr-defined] 374 | f"Output: {output}" 375 | ) 376 | 377 | async def _cleanup(self) -> None: 378 | """Clean up all server resources.""" 379 | if self.server_process: 380 | try: 381 | self.server_process.terminate() 382 | try: 383 | self.server_process.wait(timeout=5) 384 | except subprocess.TimeoutExpired: 385 | self.server_process.kill() 386 | except: 387 | pass 388 | self.server_process = None 389 | 390 | # Clean up output file 391 | if self.output_file: 392 | try: 393 | self.output_file.close() 394 | os.unlink(self.output_file.name) 395 | except Exception as e: 396 | self.logger.debug(f"Error cleaning up output file: {e}") 397 | self.output_file = None 398 | 399 | async def ensure_running(self) -> None: 400 | """Ensure the server is running. 401 | 402 | If use_existing_server is True, will only try to connect to an existing server. 403 | Otherwise will: 404 | 1. Try to connect to an existing server on the specified port 405 | 2. If that fails and not in Docker, start a new server 406 | 3. 
If in Docker and no existing server is found, raise an error 407 | """ 408 | # First check if we're in Docker 409 | in_docker = os.path.exists("/.dockerenv") or ( 410 | os.path.exists("/proc/1/cgroup") and "docker" in open("/proc/1/cgroup", "r").read() 411 | ) 412 | 413 | # If using a non-localhost host like host.docker.internal, set up the connection details 414 | if self.host not in ["localhost", "127.0.0.1"]: 415 | if self.requested_port is None: 416 | raise RuntimeError("Port must be specified when using a remote host") 417 | 418 | self.port = self.requested_port 419 | self.base_url = f"http://{self.host}:{self.port}/lume" 420 | self.logger.debug(f"Using remote host server at {self.base_url}") 421 | 422 | # Try to verify the server is accessible 423 | try: 424 | await self._verify_server() 425 | self.logger.debug("Successfully connected to remote server") 426 | return 427 | except Exception as e: 428 | if self.use_existing_server or in_docker: 429 | # If explicitly requesting an existing server or in Docker, we can't start a new one 430 | raise RuntimeError( 431 | f"Failed to connect to remote server at {self.base_url}: {str(e)}" 432 | ) 433 | else: 434 | self.logger.debug(f"Remote server not available at {self.base_url}: {str(e)}") 435 | # Fall back to localhost for starting a new server 436 | self.host = "localhost" 437 | 438 | # If explicitly using an existing server, verify it's running 439 | if self.use_existing_server: 440 | if self.requested_port is None: 441 | raise RuntimeError("Port must be specified when using an existing server") 442 | 443 | self.port = self.requested_port 444 | self.base_url = f"http://{self.host}:{self.port}/lume" 445 | 446 | try: 447 | await self._verify_server() 448 | self.logger.debug("Successfully connected to existing server") 449 | except Exception as e: 450 | raise RuntimeError( 451 | f"Failed to connect to existing server at {self.base_url}: {str(e)}" 452 | ) 453 | else: 454 | # Try to connect to an existing server first 
455 | if self.requested_port is not None: 456 | self.port = self.requested_port 457 | self.base_url = f"http://{self.host}:{self.port}/lume" 458 | 459 | try: 460 | await self._verify_server() 461 | self.logger.debug("Successfully connected to existing server") 462 | return 463 | except Exception: 464 | self.logger.debug(f"No existing server found at {self.base_url}") 465 | 466 | # If in Docker and can't connect to existing server, raise an error 467 | if in_docker: 468 | raise RuntimeError( 469 | f"Failed to connect to server at {self.base_url} and cannot start a new server in Docker" 470 | ) 471 | 472 | # Start a new server 473 | self.logger.debug("Starting a new server instance") 474 | await self._start_server() 475 | 476 | async def stop(self) -> None: 477 | """Stop the server if we're managing it.""" 478 | if not self.use_existing_server: 479 | self.logger.debug("Stopping lume server...") 480 | await self._cleanup() 481 | ```