This is page 15 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.

# Files

--------------------------------------------------------------------------------
/docs/content/docs/libraries/lume/http-api.mdx:
--------------------------------------------------------------------------------

```markdown
1 | ---
2 | title: HTTP Server API
3 | description: Lume exposes a local HTTP API server that listens at localhost for programmatic management of VMs.
4 | ---
5 |
6 | import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
7 | import { Callout } from 'fumadocs-ui/components/callout';
8 |
9 | ## Default URL
10 |
11 | ```
12 | http://localhost:7777
13 | ```
14 |
15 | <Callout type="info">
16 | The HTTP API service runs on port `7777` by default.
If you'd like to use a 17 | different port, pass the `--port` option during installation or when running 18 | `lume serve`. 19 | </Callout> 20 | 21 | ## Endpoints 22 | 23 | --- 24 | 25 | ### Create VM 26 | 27 | Create a new virtual machine. 28 | 29 | `POST: /lume/vms` 30 | 31 | #### Parameters 32 | 33 | | Name | Type | Required | Description | 34 | | -------- | ------- | -------- | ------------------------------------ | 35 | | name | string | Yes | Name of the VM | 36 | | os | string | Yes | Guest OS (`macOS`, `linux`, etc.) | 37 | | cpu | integer | Yes | Number of CPU cores | 38 | | memory | string | Yes | Memory size (e.g. `4GB`) | 39 | | diskSize | string | Yes | Disk size (e.g. `64GB`) | 40 | | display | string | No | Display resolution (e.g. `1024x768`) | 41 | | ipsw | string | No | IPSW version (e.g. `latest`) | 42 | | storage | string | No | Storage type (`ssd`, etc.) | 43 | 44 | #### Example Request 45 | 46 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 47 | <Tab value="Curl"> 48 | 49 | ```bash 50 | curl --connect-timeout 6000 \ 51 | --max-time 5000 \ 52 | -X POST \ 53 | -H "Content-Type: application/json" \ 54 | -d '{ 55 | "name": "lume_vm", 56 | "os": "macOS", 57 | "cpu": 2, 58 | "memory": "4GB", 59 | "diskSize": "64GB", 60 | "display": "1024x768", 61 | "ipsw": "latest", 62 | "storage": "ssd" 63 | }' \ 64 | http://localhost:7777/lume/vms 65 | ``` 66 | 67 | </Tab> 68 | <Tab value="Python"> 69 | 70 | ```python 71 | import requests 72 | 73 | payload = { 74 | "name": "lume_vm", 75 | "os": "macOS", 76 | "cpu": 2, 77 | "memory": "4GB", 78 | "diskSize": "64GB", 79 | "display": "1024x768", 80 | "ipsw": "latest", 81 | "storage": "ssd" 82 | } 83 | r = requests.post("http://localhost:7777/lume/vms", json=payload, timeout=50) 84 | print(r.json()) 85 | ``` 86 | 87 | </Tab> 88 | <Tab value="TypeScript"> 89 | 90 | ```typescript 91 | const payload = { 92 | name: 'lume_vm', 93 | os: 'macOS', 94 | cpu: 2, 95 | memory: '4GB', 96 | diskSize: '64GB', 97 | display: '1024x768', 98 | ipsw: 'latest', 99 | storage: 'ssd', 100 | }; 101 | 102 | const res = await fetch('http://localhost:7777/lume/vms', { 103 | method: 'POST', 104 | headers: { 'Content-Type': 'application/json' }, 105 | body: JSON.stringify(payload), 106 | }); 107 | console.log(await res.json()); 108 | ``` 109 | 110 | </Tab> 111 | </Tabs> 112 | 113 | --- 114 | 115 | ### Run VM 116 | 117 | Run a virtual machine instance. 118 | 119 | `POST: /lume/vms/:name/run` 120 | 121 | #### Parameters 122 | 123 | | Name | Type | Required | Description | 124 | | ----------------- | --------------- | -------- | --------------------------------------------------- | 125 | | noDisplay | boolean | No | If true, do not start VNC client | 126 | | sharedDirectories | array of object | No | List of shared directories (`hostPath`, `readOnly`) | 127 | | recoveryMode | boolean | No | Start in recovery mode | 128 | | storage | string | No | Storage type (`ssd`, etc.) 
| 129 | 130 | #### Example Request 131 | 132 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 133 | <Tab value="Curl"> 134 | 135 | ```bash 136 | # Basic run 137 | curl --connect-timeout 6000 \ 138 | --max-time 5000 \ 139 | -X POST \ 140 | http://localhost:7777/lume/vms/my-vm-name/run 141 | 142 | # Run with VNC client started and shared directory 143 | curl --connect-timeout 6000 \ 144 | --max-time 5000 \ 145 | -X POST \ 146 | -H "Content-Type: application/json" \ 147 | -d '{ 148 | "noDisplay": false, 149 | "sharedDirectories": [ 150 | { 151 | "hostPath": "~/Projects", 152 | "readOnly": false 153 | } 154 | ], 155 | "recoveryMode": false, 156 | "storage": "ssd" 157 | }' \ 158 | http://localhost:7777/lume/vms/lume_vm/run 159 | ``` 160 | 161 | </Tab> 162 | <Tab value="Python"> 163 | 164 | ```python 165 | import requests 166 | 167 | # Basic run 168 | r = requests.post("http://localhost:7777/lume/vms/my-vm-name/run", timeout=50) 169 | print(r.json()) 170 | 171 | # With VNC and shared directory 172 | payload = { 173 | "noDisplay": False, 174 | "sharedDirectories": [ 175 | {"hostPath": "~/Projects", "readOnly": False} 176 | ], 177 | "recoveryMode": False, 178 | "storage": "ssd" 179 | } 180 | r = requests.post("http://localhost:7777/lume/vms/lume_vm/run", json=payload, timeout=50) 181 | print(r.json()) 182 | ``` 183 | 184 | </Tab> 185 | <Tab value="TypeScript"> 186 | 187 | ```typescript 188 | // Basic run 189 | let res = await fetch('http://localhost:7777/lume/vms/my-vm-name/run', { 190 | method: 'POST', 191 | }); 192 | console.log(await res.json()); 193 | 194 | // With VNC and shared directory 195 | const payload = { 196 | noDisplay: false, 197 | sharedDirectories: [{ hostPath: '~/Projects', readOnly: false }], 198 | recoveryMode: false, 199 | storage: 'ssd', 200 | }; 201 | res = await fetch('http://localhost:7777/lume/vms/lume_vm/run', { 202 | method: 'POST', 203 | headers: { 'Content-Type': 'application/json' }, 204 | body: JSON.stringify(payload), 205 | }); 206 | console.log(await res.json()); 207 | ``` 208 | 209 | </Tab> 210 | </Tabs> 211 | 212 | --- 213 | 214 | ### List VMs 215 | 216 | List all virtual machines. 217 | 218 | `GET: /lume/vms` 219 | 220 | #### Example Request 221 | 222 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 223 | <Tab value="Curl"> 224 | 225 | ```bash 226 | curl --connect-timeout 6000 \ 227 | --max-time 5000 \ 228 | http://localhost:7777/lume/vms 229 | ``` 230 | 231 | </Tab> 232 | <Tab value="Python"> 233 | 234 | ```python 235 | import requests 236 | 237 | r = requests.get("http://localhost:7777/lume/vms", timeout=50) 238 | print(r.json()) 239 | ``` 240 | 241 | </Tab> 242 | <Tab value="TypeScript"> 243 | 244 | ```typescript 245 | const res = await fetch('http://localhost:7777/lume/vms'); 246 | console.log(await res.json()); 247 | ``` 248 | 249 | </Tab> 250 | </Tabs> 251 | 252 | ```json 253 | [ 254 | { 255 | "name": "my-vm", 256 | "state": "stopped", 257 | "os": "macOS", 258 | "cpu": 2, 259 | "memory": "4GB", 260 | "diskSize": "64GB" 261 | }, 262 | { 263 | "name": "my-vm-2", 264 | "state": "stopped", 265 | "os": "linux", 266 | "cpu": 2, 267 | "memory": "4GB", 268 | "diskSize": "64GB" 269 | } 270 | ] 271 | ``` 272 | 273 | --- 274 | 275 | ### Get VM Details 276 | 277 | Get details for a specific virtual machine. 
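If you need to wait for a VM to become ready after calling Run VM, one option is to poll this endpoint (documented just below) until the reported `state` changes. A minimal sketch, assuming a started VM eventually reports `state: "running"` (the sample responses on this page only show `stopped`):

```python
import time

import requests

def wait_for_state(name: str, target: str = "running", timeout_s: float = 300) -> dict:
    """Poll GET /lume/vms/:name until the VM reports the target state."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        r = requests.get(f"http://localhost:7777/lume/vms/{name}", timeout=50)
        r.raise_for_status()
        details = r.json()
        if details.get("state") == target:
            return details
        time.sleep(2)
    raise TimeoutError(f"{name} did not reach state {target!r} within {timeout_s}s")
```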
278 | 279 | `GET: /lume/vms/:name` 280 | 281 | #### Parameters 282 | 283 | | Name | Type | Required | Description | 284 | | ------- | ------ | -------- | -------------------------- | 285 | | storage | string | No | Storage type (`ssd`, etc.) | 286 | 287 | #### Example Request 288 | 289 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 290 | <Tab value="Curl"> 291 | 292 | ```bash 293 | # Basic get 294 | curl --connect-timeout 6000 \ 295 | --max-time 5000 \ 296 | http://localhost:7777/lume/vms/lume_vm 297 | 298 | # Get with specific storage 299 | curl --connect-timeout 6000 \ 300 | --max-time 5000 \ 301 | http://localhost:7777/lume/vms/lume_vm?storage=ssd 302 | ``` 303 | 304 | </Tab> 305 | <Tab value="Python"> 306 | 307 | ```python 308 | import requests 309 | 310 | # Basic get 311 | details = requests.get("http://localhost:7777/lume/vms/lume_vm", timeout=50) 312 | print(details.json()) 313 | 314 | # Get with specific storage 315 | details = requests.get("http://localhost:7777/lume/vms/lume_vm", params={"storage": "ssd"}, timeout=50) 316 | print(details.json()) 317 | ``` 318 | 319 | </Tab> 320 | <Tab value="TypeScript"> 321 | 322 | ```typescript 323 | // Basic get 324 | let res = await fetch('http://localhost:7777/lume/vms/lume_vm'); 325 | console.log(await res.json()); 326 | 327 | // Get with specific storage 328 | res = await fetch('http://localhost:7777/lume/vms/lume_vm?storage=ssd'); 329 | console.log(await res.json()); 330 | ``` 331 | 332 | </Tab> 333 | </Tabs> 334 | 335 | ```json 336 | { 337 | "name": "lume_vm", 338 | "state": "stopped", 339 | "os": "macOS", 340 | "cpu": 2, 341 | "memory": "4GB", 342 | "diskSize": "64GB", 343 | "display": "1024x768", 344 | "ipAddress": "192.168.65.2", 345 | "vncPort": 5900, 346 | "sharedDirectories": [ 347 | { 348 | "hostPath": "~/Projects", 349 | "readOnly": false, 350 | "tag": "com.apple.virtio-fs.automount" 351 | } 352 | ] 353 | } 354 | ``` 355 | 356 | --- 357 | 358 | ### Update VM Configuration 359 | 360 | Update the configuration of a virtual machine. 361 | 362 | `PATCH: /lume/vms/:name` 363 | 364 | #### Parameters 365 | 366 | | Name | Type | Required | Description | 367 | | -------- | ------- | -------- | ------------------------------------- | 368 | | cpu | integer | No | Number of CPU cores | 369 | | memory | string | No | Memory size (e.g. `8GB`) | 370 | | diskSize | string | No | Disk size (e.g. `100GB`) | 371 | | display | string | No | Display resolution (e.g. `1920x1080`) | 372 | | storage | string | No | Storage type (`ssd`, etc.) 
| 373 | 374 | #### Example Request 375 | 376 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 377 | <Tab value="Curl"> 378 | 379 | ```bash 380 | curl --connect-timeout 6000 \ 381 | --max-time 5000 \ 382 | -X PATCH \ 383 | -H "Content-Type: application/json" \ 384 | -d '{ 385 | "cpu": 4, 386 | "memory": "8GB", 387 | "diskSize": "100GB", 388 | "display": "1920x1080", 389 | "storage": "ssd" 390 | }' \ 391 | http://localhost:7777/lume/vms/lume_vm 392 | ``` 393 | 394 | </Tab> 395 | <Tab value="Python"> 396 | 397 | ```python 398 | import requests 399 | 400 | payload = { 401 | "cpu": 4, 402 | "memory": "8GB", 403 | "diskSize": "100GB", 404 | "display": "1920x1080", 405 | "storage": "ssd" 406 | } 407 | r = requests.patch("http://localhost:7777/lume/vms/lume_vm", json=payload, timeout=50) 408 | print(r.json()) 409 | ``` 410 | 411 | </Tab> 412 | <Tab value="TypeScript"> 413 | 414 | ```typescript 415 | const payload = { 416 | cpu: 4, 417 | memory: '8GB', 418 | diskSize: '100GB', 419 | display: '1920x1080', 420 | storage: 'ssd', 421 | }; 422 | const res = await fetch('http://localhost:7777/lume/vms/lume_vm', { 423 | method: 'PATCH', 424 | headers: { 'Content-Type': 'application/json' }, 425 | body: JSON.stringify(payload), 426 | }); 427 | console.log(await res.json()); 428 | ``` 429 | 430 | </Tab> 431 | </Tabs> 432 | 433 | --- 434 | 435 | ### Stop VM 436 | 437 | Stop a running virtual machine. 438 | 439 | `POST: /lume/vms/:name/stop` 440 | 441 | #### Parameters 442 | 443 | | Name | Type | Required | Description | 444 | | ------- | ------ | -------- | -------------------------- | 445 | | storage | string | No | Storage type (`ssd`, etc.) | 446 | 447 | #### Example Request 448 | 449 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 450 | <Tab value="Curl"> 451 | 452 | ```bash 453 | # Basic stop 454 | curl --connect-timeout 6000 \ 455 | --max-time 5000 \ 456 | -X POST \ 457 | http://localhost:7777/lume/vms/lume_vm/stop 458 | 459 | # Stop with storage location specified 460 | curl --connect-timeout 6000 \ 461 | --max-time 5000 \ 462 | -X POST \ 463 | http://localhost:7777/lume/vms/lume_vm/stop?storage=ssd 464 | ``` 465 | 466 | </Tab> 467 | <Tab value="Python"> 468 | 469 | ```python 470 | import requests 471 | 472 | # Basic stop 473 | r = requests.post("http://localhost:7777/lume/vms/lume_vm/stop", timeout=50) 474 | print(r.json()) 475 | 476 | # Stop with storage location specified 477 | r = requests.post("http://localhost:7777/lume/vms/lume_vm/stop", params={"storage": "ssd"}, timeout=50) 478 | print(r.json()) 479 | ``` 480 | 481 | </Tab> 482 | <Tab value="TypeScript"> 483 | 484 | ```typescript 485 | // Basic stop 486 | let res = await fetch('http://localhost:7777/lume/vms/lume_vm/stop', { 487 | method: 'POST', 488 | }); 489 | console.log(await res.json()); 490 | 491 | // Stop with storage location specified 492 | res = await fetch('http://localhost:7777/lume/vms/lume_vm/stop?storage=ssd', { 493 | method: 'POST', 494 | }); 495 | console.log(await res.json()); 496 | ``` 497 | 498 | </Tab> 499 | </Tabs> 500 | 501 | --- 502 | 503 | ### Delete VM 504 | 505 | Delete a virtual machine instance. 506 | 507 | `DELETE: /lume/vms/:name` 508 | 509 | #### Parameters 510 | 511 | | Name | Type | Required | Description | 512 | | ------- | ------ | -------- | -------------------------- | 513 | | storage | string | No | Storage type (`ssd`, etc.) 
| 514 | 515 | #### Example Request 516 | 517 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 518 | <Tab value="Curl"> 519 | 520 | ```bash 521 | # Basic delete 522 | curl --connect-timeout 6000 \ 523 | --max-time 5000 \ 524 | -X DELETE \ 525 | http://localhost:7777/lume/vms/lume_vm 526 | 527 | # Delete with specific storage 528 | curl --connect-timeout 6000 \ 529 | --max-time 5000 \ 530 | -X DELETE \ 531 | http://localhost:7777/lume/vms/lume_vm?storage=ssd 532 | ``` 533 | 534 | </Tab> 535 | <Tab value="Python"> 536 | 537 | ```python 538 | import requests 539 | 540 | # Basic delete 541 | r = requests.delete("http://localhost:7777/lume/vms/lume_vm", timeout=50) 542 | print(r.status_code) 543 | 544 | # Delete with specific storage 545 | r = requests.delete("http://localhost:7777/lume/vms/lume_vm", params={"storage": "ssd"}, timeout=50) 546 | print(r.status_code) 547 | ``` 548 | 549 | </Tab> 550 | <Tab value="TypeScript"> 551 | 552 | ```typescript 553 | // Basic delete 554 | let res = await fetch('http://localhost:7777/lume/vms/lume_vm', { 555 | method: 'DELETE', 556 | }); 557 | console.log(res.status); 558 | 559 | // Delete with specific storage 560 | res = await fetch('http://localhost:7777/lume/vms/lume_vm?storage=ssd', { 561 | method: 'DELETE', 562 | }); 563 | console.log(res.status); 564 | ``` 565 | 566 | </Tab> 567 | </Tabs> 568 | 569 | --- 570 | 571 | ### Clone VM 572 | 573 | Clone an existing virtual machine. 574 | 575 | `POST: /lume/vms/clone` 576 | 577 | #### Parameters 578 | 579 | | Name | Type | Required | Description | 580 | | -------------- | ------ | -------- | ----------------------------------- | 581 | | name | string | Yes | Source VM name | 582 | | newName | string | Yes | New VM name | 583 | | sourceLocation | string | No | Source storage location (`default`) | 584 | | destLocation | string | No | Destination storage location | 585 | 586 | #### Example Request 587 | 588 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 589 | <Tab value="Curl"> 590 | 591 | ```bash 592 | curl --connect-timeout 6000 \ 593 | --max-time 5000 \ 594 | -X POST \ 595 | -H "Content-Type: application/json" \ 596 | -d '{ 597 | "name": "source-vm", 598 | "newName": "cloned-vm", 599 | "sourceLocation": "default", 600 | "destLocation": "ssd" 601 | }' \ 602 | http://localhost:7777/lume/vms/clone 603 | ``` 604 | 605 | </Tab> 606 | <Tab value="Python"> 607 | 608 | ```python 609 | import requests 610 | 611 | payload = { 612 | "name": "source-vm", 613 | "newName": "cloned-vm", 614 | "sourceLocation": "default", 615 | "destLocation": "ssd" 616 | } 617 | r = requests.post("http://localhost:7777/lume/vms/clone", json=payload, timeout=50) 618 | print(r.json()) 619 | ``` 620 | 621 | </Tab> 622 | <Tab value="TypeScript"> 623 | 624 | ```typescript 625 | const payload = { 626 | name: 'source-vm', 627 | newName: 'cloned-vm', 628 | sourceLocation: 'default', 629 | destLocation: 'ssd', 630 | }; 631 | const res = await fetch('http://localhost:7777/lume/vms/clone', { 632 | method: 'POST', 633 | headers: { 'Content-Type': 'application/json' }, 634 | body: JSON.stringify(payload), 635 | }); 636 | console.log(await res.json()); 637 | ``` 638 | 639 | </Tab> 640 | </Tabs> 641 | 642 | --- 643 | 644 | ### Pull VM Image 645 | 646 | Pull a VM image from a registry. 
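A typical first-run workflow chains this endpoint (documented below) with ones covered earlier: pull an image, confirm it shows up locally, then run it. The sketch assumes `POST /lume/pull` blocks until the download completes (push, by contrast, is documented as asynchronous); the image and VM names are the ones used in the examples:

```python
import requests

BASE = "http://localhost:7777/lume"

# Pull a published image into a local VM; large images can take a while
r = requests.post(f"{BASE}/pull", json={
    "image": "macos-sequoia-vanilla:latest",
    "name": "my-vm-name",
}, timeout=3600)
r.raise_for_status()

# Confirm the image is now listed locally
print(requests.get(f"{BASE}/images", timeout=50).json())

# Start the freshly pulled VM without attaching a VNC client
r = requests.post(f"{BASE}/vms/my-vm-name/run", json={"noDisplay": True}, timeout=50)
r.raise_for_status()
```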
647 | 648 | `POST: /lume/pull` 649 | 650 | #### Parameters 651 | 652 | | Name | Type | Required | Description | 653 | | ------------ | ------ | -------- | ------------------------------------- | 654 | | image | string | Yes | Image name (e.g. `macos-sequoia-...`) | 655 | | name | string | No | VM name for the pulled image | 656 | | registry | string | No | Registry host (e.g. `ghcr.io`) | 657 | | organization | string | No | Organization name | 658 | | storage | string | No | Storage type (`ssd`, etc.) | 659 | 660 | #### Example Request 661 | 662 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 663 | <Tab value="Curl"> 664 | 665 | ```bash 666 | curl --connect-timeout 6000 \ 667 | --max-time 5000 \ 668 | -X POST \ 669 | -H "Content-Type: application/json" \ 670 | -d '{ 671 | "image": "macos-sequoia-vanilla:latest", 672 | "name": "my-vm-name", 673 | "registry": "ghcr.io", 674 | "organization": "trycua", 675 | "storage": "ssd" 676 | }' \ 677 | http://localhost:7777/lume/pull 678 | ``` 679 | 680 | </Tab> 681 | <Tab value="Python"> 682 | 683 | ```python 684 | import requests 685 | 686 | payload = { 687 | "image": "macos-sequoia-vanilla:latest", 688 | "name": "my-vm-name", 689 | "registry": "ghcr.io", 690 | "organization": "trycua", 691 | "storage": "ssd" 692 | } 693 | r = requests.post("http://localhost:7777/lume/pull", json=payload, timeout=50) 694 | print(r.json()) 695 | ``` 696 | 697 | </Tab> 698 | <Tab value="TypeScript"> 699 | 700 | ```typescript 701 | const payload = { 702 | image: 'macos-sequoia-vanilla:latest', 703 | name: 'my-vm-name', 704 | registry: 'ghcr.io', 705 | organization: 'trycua', 706 | storage: 'ssd', 707 | }; 708 | const res = await fetch('http://localhost:7777/lume/pull', { 709 | method: 'POST', 710 | headers: { 'Content-Type': 'application/json' }, 711 | body: JSON.stringify(payload), 712 | }); 713 | console.log(await res.json()); 714 | ``` 715 | 716 | </Tab> 717 | </Tabs> 718 | 719 | --- 720 | 721 | ### Push VM Image 722 | 723 | Push a VM to a registry as an image (asynchronous operation). 724 | 725 | `POST: /lume/vms/push` 726 | 727 | #### Parameters 728 | 729 | | Name | Type | Required | Description | 730 | | ------------ | ------------ | -------- | ----------------------------------------------- | 731 | | name | string | Yes | Local VM name to push | 732 | | imageName | string | Yes | Image name in registry | 733 | | tags | array | Yes | Image tags (e.g. `["latest", "v1"]`) | 734 | | organization | string | Yes | Organization name | 735 | | registry | string | No | Registry host (e.g. `ghcr.io`) | 736 | | chunkSizeMb | integer | No | Chunk size in MB for upload | 737 | | storage | string/null | No | Storage type (`ssd`, etc.) 
| 738 | 739 | #### Example Request 740 | 741 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 742 | <Tab value="Curl"> 743 | 744 | ```bash 745 | curl --connect-timeout 6000 \ 746 | --max-time 5000 \ 747 | -X POST \ 748 | -H "Content-Type: application/json" \ 749 | -d '{ 750 | "name": "my-local-vm", 751 | "imageName": "my-image", 752 | "tags": ["latest", "v1"], 753 | "organization": "my-org", 754 | "registry": "ghcr.io", 755 | "chunkSizeMb": 512, 756 | "storage": null 757 | }' \ 758 | http://localhost:7777/lume/vms/push 759 | ``` 760 | 761 | </Tab> 762 | <Tab value="Python"> 763 | 764 | ```python 765 | import requests 766 | 767 | payload = { 768 | "name": "my-local-vm", 769 | "imageName": "my-image", 770 | "tags": ["latest", "v1"], 771 | "organization": "my-org", 772 | "registry": "ghcr.io", 773 | "chunkSizeMb": 512, 774 | "storage": None 775 | } 776 | r = requests.post("http://localhost:7777/lume/vms/push", json=payload, timeout=50) 777 | print(r.json()) 778 | ``` 779 | 780 | </Tab> 781 | <Tab value="TypeScript"> 782 | 783 | ```typescript 784 | const payload = { 785 | name: 'my-local-vm', 786 | imageName: 'my-image', 787 | tags: ['latest', 'v1'], 788 | organization: 'my-org', 789 | registry: 'ghcr.io', 790 | chunkSizeMb: 512, 791 | storage: null, 792 | }; 793 | const res = await fetch('http://localhost:7777/lume/vms/push', { 794 | method: 'POST', 795 | headers: { 'Content-Type': 'application/json' }, 796 | body: JSON.stringify(payload), 797 | }); 798 | console.log(await res.json()); 799 | ``` 800 | 801 | </Tab> 802 | </Tabs> 803 | 804 | **Response (202 Accepted):** 805 | 806 | ```json 807 | { 808 | "message": "Push initiated in background", 809 | "name": "my-local-vm", 810 | "imageName": "my-image", 811 | "tags": [ 812 | "latest", 813 | "v1" 814 | ] 815 | } 816 | ``` 817 | 818 | --- 819 | 820 | ### List Images 821 | 822 | List available VM images. 823 | 824 | `GET: /lume/images` 825 | 826 | #### Example Request 827 | 828 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 829 | <Tab value="Curl"> 830 | 831 | ```bash 832 | curl --connect-timeout 6000 \ 833 | --max-time 5000 \ 834 | http://localhost:7777/lume/images 835 | ``` 836 | 837 | </Tab> 838 | <Tab value="Python"> 839 | 840 | ```python 841 | import requests 842 | 843 | r = requests.get("http://localhost:7777/lume/images", timeout=50) 844 | print(r.json()) 845 | ``` 846 | 847 | </Tab> 848 | <Tab value="TypeScript"> 849 | 850 | ```typescript 851 | const res = await fetch('http://localhost:7777/lume/images'); 852 | console.log(await res.json()); 853 | ``` 854 | 855 | </Tab> 856 | </Tabs> 857 | 858 | ```json 859 | { 860 | "local": [ 861 | "macos-sequoia-xcode:latest", 862 | "macos-sequoia-vanilla:latest" 863 | ] 864 | } 865 | ``` 866 | 867 | --- 868 | 869 | ### Prune Images 870 | 871 | Remove unused VM images to free up disk space. 
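The exact selection criteria for pruning are not spelled out here, so it can be worth listing local images before and after the call. A small sketch combining this endpoint (documented below) with `GET /lume/images`:

```python
import requests

BASE = "http://localhost:7777/lume"

before = requests.get(f"{BASE}/images", timeout=50).json().get("local", [])
requests.post(f"{BASE}/prune", timeout=50).raise_for_status()
after = requests.get(f"{BASE}/images", timeout=50).json().get("local", [])

print("removed:", sorted(set(before) - set(after)))
```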
872 | 873 | `POST: /lume/prune` 874 | 875 | #### Example Request 876 | 877 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 878 | <Tab value="Curl"> 879 | 880 | ```bash 881 | curl --connect-timeout 6000 \ 882 | --max-time 5000 \ 883 | -X POST \ 884 | http://localhost:7777/lume/prune 885 | ``` 886 | 887 | </Tab> 888 | <Tab value="Python"> 889 | 890 | ```python 891 | import requests 892 | 893 | r = requests.post("http://localhost:7777/lume/prune", timeout=50) 894 | print(r.json()) 895 | ``` 896 | 897 | </Tab> 898 | <Tab value="TypeScript"> 899 | 900 | ```typescript 901 | const res = await fetch('http://localhost:7777/lume/prune', { 902 | method: 'POST', 903 | }); 904 | console.log(await res.json()); 905 | ``` 906 | 907 | </Tab> 908 | </Tabs> 909 | 910 | --- 911 | 912 | ### Get Latest IPSW URL 913 | 914 | Get the URL for the latest macOS IPSW file. 915 | 916 | `GET: /lume/ipsw` 917 | 918 | #### Example Request 919 | 920 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 921 | <Tab value="Curl"> 922 | 923 | ```bash 924 | curl --connect-timeout 6000 \ 925 | --max-time 5000 \ 926 | http://localhost:7777/lume/ipsw 927 | ``` 928 | 929 | </Tab> 930 | <Tab value="Python"> 931 | 932 | ```python 933 | import requests 934 | 935 | r = requests.get("http://localhost:7777/lume/ipsw", timeout=50) 936 | print(r.json()) 937 | ``` 938 | 939 | </Tab> 940 | <Tab value="TypeScript"> 941 | 942 | ```typescript 943 | const res = await fetch('http://localhost:7777/lume/ipsw'); 944 | console.log(await res.json()); 945 | ``` 946 | 947 | </Tab> 948 | </Tabs> 949 | 950 | --- 951 | 952 | ## Configuration Management 953 | 954 | ### Get Configuration 955 | 956 | Get current Lume configuration settings. 957 | 958 | `GET: /lume/config` 959 | 960 | #### Example Request 961 | 962 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 963 | <Tab value="Curl"> 964 | 965 | ```bash 966 | curl --connect-timeout 6000 \ 967 | --max-time 5000 \ 968 | http://localhost:7777/lume/config 969 | ``` 970 | 971 | </Tab> 972 | <Tab value="Python"> 973 | 974 | ```python 975 | import requests 976 | 977 | r = requests.get("http://localhost:7777/lume/config", timeout=50) 978 | print(r.json()) 979 | ``` 980 | 981 | </Tab> 982 | <Tab value="TypeScript"> 983 | 984 | ```typescript 985 | const res = await fetch('http://localhost:7777/lume/config'); 986 | console.log(await res.json()); 987 | ``` 988 | 989 | </Tab> 990 | </Tabs> 991 | 992 | ```json 993 | { 994 | "homeDirectory": "~/.lume", 995 | "cacheDirectory": "~/.lume/cache", 996 | "cachingEnabled": true 997 | } 998 | ``` 999 | 1000 | ### Update Configuration 1001 | 1002 | Update Lume configuration settings. 
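All fields are optional, so a safe pattern is read-modify-write: fetch the current settings with `GET /lume/config`, change only what you need, and post the result back (endpoint documented below). A minimal sketch:

```python
import requests

BASE = "http://localhost:7777/lume"

config = requests.get(f"{BASE}/config", timeout=50).json()
config["cachingEnabled"] = False  # flip one setting, keep the rest untouched
r = requests.post(f"{BASE}/config", json=config, timeout=50)
print(r.json())
```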
1003 | 1004 | `POST: /lume/config` 1005 | 1006 | #### Parameters 1007 | 1008 | | Name | Type | Required | Description | 1009 | | --------------- | ------- | -------- | -------------------------------- | 1010 | | homeDirectory | string | No | Lume home directory path | 1011 | | cacheDirectory | string | No | Cache directory path | 1012 | | cachingEnabled | boolean | No | Enable or disable caching | 1013 | 1014 | #### Example Request 1015 | 1016 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 1017 | <Tab value="Curl"> 1018 | 1019 | ```bash 1020 | curl --connect-timeout 6000 \ 1021 | --max-time 5000 \ 1022 | -X POST \ 1023 | -H "Content-Type: application/json" \ 1024 | -d '{ 1025 | "homeDirectory": "~/custom/lume", 1026 | "cacheDirectory": "~/custom/lume/cache", 1027 | "cachingEnabled": true 1028 | }' \ 1029 | http://localhost:7777/lume/config 1030 | ``` 1031 | 1032 | </Tab> 1033 | <Tab value="Python"> 1034 | 1035 | ```python 1036 | import requests 1037 | 1038 | payload = { 1039 | "homeDirectory": "~/custom/lume", 1040 | "cacheDirectory": "~/custom/lume/cache", 1041 | "cachingEnabled": True 1042 | } 1043 | r = requests.post("http://localhost:7777/lume/config", json=payload, timeout=50) 1044 | print(r.json()) 1045 | ``` 1046 | 1047 | </Tab> 1048 | <Tab value="TypeScript"> 1049 | 1050 | ```typescript 1051 | const payload = { 1052 | homeDirectory: '~/custom/lume', 1053 | cacheDirectory: '~/custom/lume/cache', 1054 | cachingEnabled: true, 1055 | }; 1056 | const res = await fetch('http://localhost:7777/lume/config', { 1057 | method: 'POST', 1058 | headers: { 'Content-Type': 'application/json' }, 1059 | body: JSON.stringify(payload), 1060 | }); 1061 | console.log(await res.json()); 1062 | ``` 1063 | 1064 | </Tab> 1065 | </Tabs> 1066 | 1067 | --- 1068 | 1069 | ## Storage Location Management 1070 | 1071 | ### Get VM Storage Locations 1072 | 1073 | List all configured VM storage locations. 1074 | 1075 | `GET: /lume/config/locations` 1076 | 1077 | #### Example Request 1078 | 1079 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 1080 | <Tab value="Curl"> 1081 | 1082 | ```bash 1083 | curl --connect-timeout 6000 \ 1084 | --max-time 5000 \ 1085 | http://localhost:7777/lume/config/locations 1086 | ``` 1087 | 1088 | </Tab> 1089 | <Tab value="Python"> 1090 | 1091 | ```python 1092 | import requests 1093 | 1094 | r = requests.get("http://localhost:7777/lume/config/locations", timeout=50) 1095 | print(r.json()) 1096 | ``` 1097 | 1098 | </Tab> 1099 | <Tab value="TypeScript"> 1100 | 1101 | ```typescript 1102 | const res = await fetch('http://localhost:7777/lume/config/locations'); 1103 | console.log(await res.json()); 1104 | ``` 1105 | 1106 | </Tab> 1107 | </Tabs> 1108 | 1109 | ```json 1110 | [ 1111 | { 1112 | "name": "default", 1113 | "path": "~/.lume/vms", 1114 | "isDefault": true 1115 | }, 1116 | { 1117 | "name": "ssd", 1118 | "path": "/Volumes/SSD/lume/vms", 1119 | "isDefault": false 1120 | } 1121 | ] 1122 | ``` 1123 | 1124 | ### Add VM Storage Location 1125 | 1126 | Add a new VM storage location. 
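Named locations appear to be what the `storage` parameter on the VM endpoints above refers to, judging by the examples on this page (location name `ssd`, `storage: "ssd"`). A sketch that registers a location with the endpoint documented below and then creates a VM on it; the path and VM name are illustrative:

```python
import requests

BASE = "http://localhost:7777/lume"

# Register an external volume as a named storage location
requests.post(f"{BASE}/config/locations", json={
    "name": "ssd",
    "path": "/Volumes/SSD/lume/vms",
}, timeout=50).raise_for_status()

# Create a VM whose disk lives on that location
requests.post(f"{BASE}/vms", json={
    "name": "vm-on-ssd",
    "os": "macOS",
    "cpu": 2,
    "memory": "4GB",
    "diskSize": "64GB",
    "storage": "ssd",
}, timeout=50).raise_for_status()
```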
1127 | 1128 | `POST: /lume/config/locations` 1129 | 1130 | #### Parameters 1131 | 1132 | | Name | Type | Required | Description | 1133 | | ---- | ------ | -------- | ---------------------------- | 1134 | | name | string | Yes | Storage location name | 1135 | | path | string | Yes | File system path for storage | 1136 | 1137 | #### Example Request 1138 | 1139 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 1140 | <Tab value="Curl"> 1141 | 1142 | ```bash 1143 | curl --connect-timeout 6000 \ 1144 | --max-time 5000 \ 1145 | -X POST \ 1146 | -H "Content-Type: application/json" \ 1147 | -d '{ 1148 | "name": "ssd", 1149 | "path": "/Volumes/SSD/lume/vms" 1150 | }' \ 1151 | http://localhost:7777/lume/config/locations 1152 | ``` 1153 | 1154 | </Tab> 1155 | <Tab value="Python"> 1156 | 1157 | ```python 1158 | import requests 1159 | 1160 | payload = { 1161 | "name": "ssd", 1162 | "path": "/Volumes/SSD/lume/vms" 1163 | } 1164 | r = requests.post("http://localhost:7777/lume/config/locations", json=payload, timeout=50) 1165 | print(r.json()) 1166 | ``` 1167 | 1168 | </Tab> 1169 | <Tab value="TypeScript"> 1170 | 1171 | ```typescript 1172 | const payload = { 1173 | name: 'ssd', 1174 | path: '/Volumes/SSD/lume/vms', 1175 | }; 1176 | const res = await fetch('http://localhost:7777/lume/config/locations', { 1177 | method: 'POST', 1178 | headers: { 'Content-Type': 'application/json' }, 1179 | body: JSON.stringify(payload), 1180 | }); 1181 | console.log(await res.json()); 1182 | ``` 1183 | 1184 | </Tab> 1185 | </Tabs> 1186 | 1187 | ### Remove VM Storage Location 1188 | 1189 | Remove a VM storage location. 1190 | 1191 | `DELETE: /lume/config/locations/:name` 1192 | 1193 | #### Example Request 1194 | 1195 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 1196 | <Tab value="Curl"> 1197 | 1198 | ```bash 1199 | curl --connect-timeout 6000 \ 1200 | --max-time 5000 \ 1201 | -X DELETE \ 1202 | http://localhost:7777/lume/config/locations/ssd 1203 | ``` 1204 | 1205 | </Tab> 1206 | <Tab value="Python"> 1207 | 1208 | ```python 1209 | import requests 1210 | 1211 | r = requests.delete("http://localhost:7777/lume/config/locations/ssd", timeout=50) 1212 | print(r.status_code) 1213 | ``` 1214 | 1215 | </Tab> 1216 | <Tab value="TypeScript"> 1217 | 1218 | ```typescript 1219 | const res = await fetch('http://localhost:7777/lume/config/locations/ssd', { 1220 | method: 'DELETE', 1221 | }); 1222 | console.log(res.status); 1223 | ``` 1224 | 1225 | </Tab> 1226 | </Tabs> 1227 | 1228 | ### Set Default VM Storage Location 1229 | 1230 | Set a storage location as the default. 
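After changing the default with the endpoint documented below, `GET /lume/config/locations` should reflect it through the `isDefault` flag. A quick verification sketch:

```python
import requests

BASE = "http://localhost:7777/lume"

requests.post(f"{BASE}/config/locations/default/ssd", timeout=50).raise_for_status()

locations = requests.get(f"{BASE}/config/locations", timeout=50).json()
default = next(loc["name"] for loc in locations if loc.get("isDefault"))
print("default storage location:", default)
```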
1231 | 1232 | `POST: /lume/config/locations/default/:name` 1233 | 1234 | #### Example Request 1235 | 1236 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}> 1237 | <Tab value="Curl"> 1238 | 1239 | ```bash 1240 | curl --connect-timeout 6000 \ 1241 | --max-time 5000 \ 1242 | -X POST \ 1243 | http://localhost:7777/lume/config/locations/default/ssd 1244 | ``` 1245 | 1246 | </Tab> 1247 | <Tab value="Python"> 1248 | 1249 | ```python 1250 | import requests 1251 | 1252 | r = requests.post("http://localhost:7777/lume/config/locations/default/ssd", timeout=50) 1253 | print(r.json()) 1254 | ``` 1255 | 1256 | </Tab> 1257 | <Tab value="TypeScript"> 1258 | 1259 | ```typescript 1260 | const res = await fetch('http://localhost:7777/lume/config/locations/default/ssd', { 1261 | method: 'POST', 1262 | }); 1263 | console.log(await res.json()); 1264 | ``` 1265 | 1266 | </Tab> 1267 | </Tabs> 1268 | ``` -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/main.py: -------------------------------------------------------------------------------- ```python 1 | from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request, HTTPException, Header 2 | from fastapi.responses import StreamingResponse, JSONResponse 3 | from typing import List, Dict, Any, Optional, Union, Literal, cast 4 | import uvicorn 5 | import logging 6 | import asyncio 7 | import json 8 | import traceback 9 | import inspect 10 | from contextlib import redirect_stdout, redirect_stderr 11 | from io import StringIO 12 | from .handlers.factory import HandlerFactory 13 | import os 14 | import aiohttp 15 | import hashlib 16 | import time 17 | import platform 18 | from fastapi.middleware.cors import CORSMiddleware 19 | 20 | # Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. 
Default: 60s 21 | AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60")) 22 | 23 | try: 24 | from agent import ComputerAgent 25 | HAS_AGENT = True 26 | except ImportError: 27 | HAS_AGENT = False 28 | 29 | # Set up logging with more detail 30 | logger = logging.getLogger(__name__) 31 | logger.setLevel(logging.INFO) 32 | 33 | # Configure WebSocket with larger message size 34 | WEBSOCKET_MAX_SIZE = 1024 * 1024 * 10 # 10MB limit 35 | 36 | # Configure application with WebSocket settings 37 | app = FastAPI( 38 | title="Computer API", 39 | description="API for the Computer project", 40 | version="0.1.0", 41 | websocket_max_size=WEBSOCKET_MAX_SIZE, 42 | ) 43 | 44 | # CORS configuration 45 | origins = ["*"] 46 | app.add_middleware( 47 | CORSMiddleware, 48 | allow_origins=origins, 49 | allow_credentials=True, 50 | allow_methods=["*"], 51 | allow_headers=["*"], 52 | ) 53 | 54 | protocol_version = 1 55 | try: 56 | from importlib.metadata import version 57 | package_version = version("cua-computer-server") 58 | except Exception: 59 | # Fallback for cases where package is not installed or importlib.metadata is not available 60 | try: 61 | import pkg_resources 62 | package_version = pkg_resources.get_distribution("cua-computer-server").version 63 | except Exception: 64 | package_version = "unknown" 65 | 66 | accessibility_handler, automation_handler, diorama_handler, file_handler = HandlerFactory.create_handlers() 67 | handlers = { 68 | "version": lambda: {"protocol": protocol_version, "package": package_version}, 69 | # App-Use commands 70 | "diorama_cmd": diorama_handler.diorama_cmd, 71 | # Accessibility commands 72 | "get_accessibility_tree": accessibility_handler.get_accessibility_tree, 73 | "find_element": accessibility_handler.find_element, 74 | # Shell commands 75 | "run_command": automation_handler.run_command, 76 | # File system commands 77 | "file_exists": file_handler.file_exists, 78 | "directory_exists": file_handler.directory_exists, 79 | "list_dir": file_handler.list_dir, 80 | "read_text": file_handler.read_text, 81 | "write_text": file_handler.write_text, 82 | "read_bytes": file_handler.read_bytes, 83 | "write_bytes": file_handler.write_bytes, 84 | "get_file_size": file_handler.get_file_size, 85 | "delete_file": file_handler.delete_file, 86 | "create_dir": file_handler.create_dir, 87 | "delete_dir": file_handler.delete_dir, 88 | # Mouse commands 89 | "mouse_down": automation_handler.mouse_down, 90 | "mouse_up": automation_handler.mouse_up, 91 | "left_click": automation_handler.left_click, 92 | "right_click": automation_handler.right_click, 93 | "double_click": automation_handler.double_click, 94 | "move_cursor": automation_handler.move_cursor, 95 | "drag_to": automation_handler.drag_to, 96 | "drag": automation_handler.drag, 97 | # Keyboard commands 98 | "key_down": automation_handler.key_down, 99 | "key_up": automation_handler.key_up, 100 | "type_text": automation_handler.type_text, 101 | "press_key": automation_handler.press_key, 102 | "hotkey": automation_handler.hotkey, 103 | # Scrolling actions 104 | "scroll": automation_handler.scroll, 105 | "scroll_down": automation_handler.scroll_down, 106 | "scroll_up": automation_handler.scroll_up, 107 | # Screen actions 108 | "screenshot": automation_handler.screenshot, 109 | "get_cursor_position": automation_handler.get_cursor_position, 110 | "get_screen_size": automation_handler.get_screen_size, 111 | # Clipboard actions 112 | "copy_to_clipboard": automation_handler.copy_to_clipboard, 113 | "set_clipboard": 
automation_handler.set_clipboard, 114 | } 115 | 116 | 117 | class AuthenticationManager: 118 | def __init__(self): 119 | self.sessions: Dict[str, Dict[str, Any]] = {} 120 | self.container_name = os.environ.get("CONTAINER_NAME") 121 | 122 | def _hash_credentials(self, container_name: str, api_key: str) -> str: 123 | """Create a hash of container name and API key for session identification""" 124 | combined = f"{container_name}:{api_key}" 125 | return hashlib.sha256(combined.encode()).hexdigest() 126 | 127 | def _is_session_valid(self, session_data: Dict[str, Any]) -> bool: 128 | """Check if a session is still valid based on expiration time""" 129 | if not session_data.get('valid', False): 130 | return False 131 | 132 | expires_at = session_data.get('expires_at', 0) 133 | return time.time() < expires_at 134 | 135 | async def auth(self, container_name: str, api_key: str) -> bool: 136 | """Authenticate container name and API key, using cached sessions when possible""" 137 | # If no CONTAINER_NAME is set, always allow access (local development) 138 | if not self.container_name: 139 | logger.info("No CONTAINER_NAME set in environment. Allowing access (local development mode)") 140 | return True 141 | 142 | # Layer 1: VM Identity Verification 143 | if container_name != self.container_name: 144 | logger.warning(f"VM name mismatch. Expected: {self.container_name}, Got: {container_name}") 145 | return False 146 | 147 | # Create hash for session lookup 148 | session_hash = self._hash_credentials(container_name, api_key) 149 | 150 | # Check if we have a valid cached session 151 | if session_hash in self.sessions: 152 | session_data = self.sessions[session_hash] 153 | if self._is_session_valid(session_data): 154 | logger.info(f"Using cached authentication for container: {container_name}") 155 | return session_data['valid'] 156 | else: 157 | # Remove expired session 158 | del self.sessions[session_hash] 159 | 160 | # No valid cached session, authenticate with API 161 | logger.info(f"Authenticating with TryCUA API for container: {container_name}") 162 | 163 | try: 164 | async with aiohttp.ClientSession() as session: 165 | headers = { 166 | "Authorization": f"Bearer {api_key}" 167 | } 168 | 169 | async with session.get( 170 | f"https://www.trycua.com/api/vm/auth?container_name={container_name}", 171 | headers=headers, 172 | ) as resp: 173 | is_valid = resp.status == 200 and bool((await resp.text()).strip()) 174 | 175 | # Cache the result with configurable expiration 176 | self.sessions[session_hash] = { 177 | 'valid': is_valid, 178 | 'expires_at': time.time() + AUTH_SESSION_TTL_SECONDS 179 | } 180 | 181 | if is_valid: 182 | logger.info(f"Authentication successful for container: {container_name}") 183 | else: 184 | logger.warning(f"Authentication failed for container: {container_name}. 
Status: {resp.status}") 185 | 186 | return is_valid 187 | 188 | except aiohttp.ClientError as e: 189 | logger.error(f"Failed to validate API key with TryCUA API: {str(e)}") 190 | # Cache failed result to avoid repeated requests 191 | self.sessions[session_hash] = { 192 | 'valid': False, 193 | 'expires_at': time.time() + AUTH_SESSION_TTL_SECONDS 194 | } 195 | return False 196 | except Exception as e: 197 | logger.error(f"Unexpected error during authentication: {str(e)}") 198 | # Cache failed result to avoid repeated requests 199 | self.sessions[session_hash] = { 200 | 'valid': False, 201 | 'expires_at': time.time() + AUTH_SESSION_TTL_SECONDS 202 | } 203 | return False 204 | 205 | 206 | class ConnectionManager: 207 | def __init__(self): 208 | self.active_connections: List[WebSocket] = [] 209 | 210 | async def connect(self, websocket: WebSocket): 211 | await websocket.accept() 212 | self.active_connections.append(websocket) 213 | 214 | def disconnect(self, websocket: WebSocket): 215 | self.active_connections.remove(websocket) 216 | 217 | 218 | manager = ConnectionManager() 219 | auth_manager = AuthenticationManager() 220 | 221 | @app.get("/status") 222 | async def status(): 223 | sys = platform.system().lower() 224 | # get os type 225 | if "darwin" in sys or sys == "macos" or sys == "mac": 226 | os_type = "macos" 227 | elif "windows" in sys: 228 | os_type = "windows" 229 | else: 230 | os_type = "linux" 231 | # get computer-server features 232 | features = [] 233 | if HAS_AGENT: 234 | features.append("agent") 235 | return {"status": "ok", "os_type": os_type, "features": features} 236 | 237 | @app.websocket("/ws", name="websocket_endpoint") 238 | async def websocket_endpoint(websocket: WebSocket): 239 | global handlers 240 | 241 | # WebSocket message size is configured at the app or endpoint level, not on the instance 242 | await manager.connect(websocket) 243 | 244 | # Check if CONTAINER_NAME is set (indicating cloud provider) 245 | server_container_name = os.environ.get("CONTAINER_NAME") 246 | 247 | # If cloud provider, perform authentication handshake 248 | if server_container_name: 249 | try: 250 | logger.info(f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. 
Waiting for authentication...") 251 | 252 | # Wait for authentication message 253 | auth_data = await websocket.receive_json() 254 | 255 | # Validate auth message format 256 | if auth_data.get("command") != "authenticate": 257 | await websocket.send_json({ 258 | "success": False, 259 | "error": "First message must be authentication" 260 | }) 261 | await websocket.close() 262 | manager.disconnect(websocket) 263 | return 264 | 265 | # Extract credentials 266 | client_api_key = auth_data.get("params", {}).get("api_key") 267 | client_container_name = auth_data.get("params", {}).get("container_name") 268 | 269 | # Validate credentials using AuthenticationManager 270 | if not client_api_key: 271 | await websocket.send_json({ 272 | "success": False, 273 | "error": "API key required" 274 | }) 275 | await websocket.close() 276 | manager.disconnect(websocket) 277 | return 278 | 279 | if not client_container_name: 280 | await websocket.send_json({ 281 | "success": False, 282 | "error": "Container name required" 283 | }) 284 | await websocket.close() 285 | manager.disconnect(websocket) 286 | return 287 | 288 | # Use AuthenticationManager for validation 289 | is_authenticated = await auth_manager.auth(client_container_name, client_api_key) 290 | if not is_authenticated: 291 | await websocket.send_json({ 292 | "success": False, 293 | "error": "Authentication failed" 294 | }) 295 | await websocket.close() 296 | manager.disconnect(websocket) 297 | return 298 | 299 | logger.info(f"Authentication successful for VM: {client_container_name}") 300 | await websocket.send_json({ 301 | "success": True, 302 | "message": "Authentication successful" 303 | }) 304 | 305 | except Exception as e: 306 | logger.error(f"Error during authentication handshake: {str(e)}") 307 | await websocket.send_json({ 308 | "success": False, 309 | "error": "Authentication failed" 310 | }) 311 | await websocket.close() 312 | manager.disconnect(websocket) 313 | return 314 | 315 | try: 316 | while True: 317 | try: 318 | data = await websocket.receive_json() 319 | command = data.get("command") 320 | params = data.get("params", {}) 321 | 322 | if command not in handlers: 323 | await websocket.send_json( 324 | {"success": False, "error": f"Unknown command: {command}"} 325 | ) 326 | continue 327 | 328 | try: 329 | # Filter params to only include those accepted by the handler function 330 | handler_func = handlers[command] 331 | sig = inspect.signature(handler_func) 332 | filtered_params = {k: v for k, v in params.items() if k in sig.parameters} 333 | 334 | # Handle both sync and async functions 335 | if asyncio.iscoroutinefunction(handler_func): 336 | result = await handler_func(**filtered_params) 337 | else: 338 | # Run sync functions in thread pool to avoid blocking event loop 339 | result = await asyncio.to_thread(handler_func, **filtered_params) 340 | await websocket.send_json({"success": True, **result}) 341 | except Exception as cmd_error: 342 | logger.error(f"Error executing command {command}: {str(cmd_error)}") 343 | logger.error(traceback.format_exc()) 344 | await websocket.send_json({"success": False, "error": str(cmd_error)}) 345 | 346 | except WebSocketDisconnect: 347 | raise 348 | except json.JSONDecodeError as json_err: 349 | logger.error(f"JSON decode error: {str(json_err)}") 350 | await websocket.send_json( 351 | {"success": False, "error": f"Invalid JSON: {str(json_err)}"} 352 | ) 353 | except Exception as loop_error: 354 | logger.error(f"Error in message loop: {str(loop_error)}") 355 | logger.error(traceback.format_exc()) 356 | 
await websocket.send_json({"success": False, "error": str(loop_error)}) 357 | 358 | except WebSocketDisconnect: 359 | logger.info("Client disconnected") 360 | manager.disconnect(websocket) 361 | except Exception as e: 362 | logger.error(f"Fatal error in websocket connection: {str(e)}") 363 | logger.error(traceback.format_exc()) 364 | try: 365 | await websocket.close() 366 | except: 367 | pass 368 | manager.disconnect(websocket) 369 | 370 | @app.post("/cmd") 371 | async def cmd_endpoint( 372 | request: Request, 373 | container_name: Optional[str] = Header(None, alias="X-Container-Name"), 374 | api_key: Optional[str] = Header(None, alias="X-API-Key") 375 | ): 376 | """ 377 | Backup endpoint for when WebSocket connections fail. 378 | Accepts commands via HTTP POST with streaming response. 379 | 380 | Headers: 381 | - X-Container-Name: Container name for cloud authentication 382 | - X-API-Key: API key for cloud authentication 383 | 384 | Body: 385 | { 386 | "command": "command_name", 387 | "params": {...} 388 | } 389 | """ 390 | global handlers 391 | 392 | # Parse request body 393 | try: 394 | body = await request.json() 395 | command = body.get("command") 396 | params = body.get("params", {}) 397 | except Exception as e: 398 | raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") 399 | 400 | if not command: 401 | raise HTTPException(status_code=400, detail="Command is required") 402 | 403 | # Check if CONTAINER_NAME is set (indicating cloud provider) 404 | server_container_name = os.environ.get("CONTAINER_NAME") 405 | 406 | # If cloud provider, perform authentication 407 | if server_container_name: 408 | logger.info(f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication...") 409 | 410 | # Validate required headers 411 | if not container_name: 412 | raise HTTPException(status_code=401, detail="Container name required") 413 | 414 | if not api_key: 415 | raise HTTPException(status_code=401, detail="API key required") 416 | 417 | # Validate with AuthenticationManager 418 | is_authenticated = await auth_manager.auth(container_name, api_key) 419 | if not is_authenticated: 420 | raise HTTPException(status_code=401, detail="Authentication failed") 421 | 422 | if command not in handlers: 423 | raise HTTPException(status_code=400, detail=f"Unknown command: {command}") 424 | 425 | async def generate_response(): 426 | """Generate streaming response for the command execution""" 427 | try: 428 | # Filter params to only include those accepted by the handler function 429 | handler_func = handlers[command] 430 | sig = inspect.signature(handler_func) 431 | filtered_params = {k: v for k, v in params.items() if k in sig.parameters} 432 | 433 | # Handle both sync and async functions 434 | if asyncio.iscoroutinefunction(handler_func): 435 | result = await handler_func(**filtered_params) 436 | else: 437 | # Run sync functions in thread pool to avoid blocking event loop 438 | result = await asyncio.to_thread(handler_func, **filtered_params) 439 | 440 | # Stream the successful result 441 | response_data = {"success": True, **result} 442 | yield f"data: {json.dumps(response_data)}\n\n" 443 | 444 | except Exception as cmd_error: 445 | logger.error(f"Error executing command {command}: {str(cmd_error)}") 446 | logger.error(traceback.format_exc()) 447 | 448 | # Stream the error result 449 | error_data = {"success": False, "error": str(cmd_error)} 450 | yield f"data: {json.dumps(error_data)}\n\n" 451 | 452 | return StreamingResponse( 453 | generate_response(), 
454 |         media_type="text/plain",
455 |         headers={
456 |             "Cache-Control": "no-cache",
457 |             "Connection": "keep-alive",
458 |         }
459 |     )
460 | 
461 | @app.post("/responses")
462 | async def agent_response_endpoint(
463 |     request: Request,
464 |     api_key: Optional[str] = Header(None, alias="X-API-Key"),
465 | ):
466 |     """
467 |     Minimal proxy to run ComputerAgent until no computer calls are left pending.
468 | 
469 |     Security:
470 |     - If CONTAINER_NAME is set on the server, require X-API-Key
471 |       and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true.
472 | 
473 |     Body JSON:
474 |     {
475 |         "model": "...",                 # required
476 |         "input": "... or messages[]",   # required
477 |         "agent_kwargs": { ... },        # optional, passed directly to ComputerAgent
478 |         "env": { ... }                  # optional env overrides for agent
479 |     }
480 |     """
481 |     if not HAS_AGENT:
482 |         raise HTTPException(status_code=501, detail="ComputerAgent not available")
483 | 
484 |     # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set)
485 |     container_name = os.environ.get("CONTAINER_NAME")
486 |     if container_name:
487 |         is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in ["1", "true", "yes", "y", "on"]
488 |         if not is_public:
489 |             if not api_key:
490 |                 raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers")
491 |             ok = await auth_manager.auth(container_name, api_key)
492 |             if not ok:
493 |                 raise HTTPException(status_code=401, detail="Unauthorized")
494 | 
495 |     # Parse request body
496 |     try:
497 |         body = await request.json()
498 |     except Exception as e:
499 |         raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
500 | 
501 |     model = body.get("model")
502 |     input_data = body.get("input")
503 |     if not model or input_data is None:
504 |         raise HTTPException(status_code=400, detail="'model' and 'input' are required")
505 | 
506 |     agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
507 |     env_overrides: Dict[str, str] = body.get("env") or {}
508 | 
509 |     # Simple env override context
510 |     class _EnvOverride:
511 |         def __init__(self, overrides: Dict[str, str]):
512 |             self.overrides = overrides
513 |             self._original: Dict[str, Optional[str]] = {}
514 |         def __enter__(self):
515 |             for k, v in (self.overrides or {}).items():
516 |                 self._original[k] = os.environ.get(k)
517 |                 os.environ[k] = str(v)
518 |         def __exit__(self, exc_type, exc, tb):
519 |             for k, old in self._original.items():
520 |                 if old is None:
521 |                     os.environ.pop(k, None)
522 |                 else:
523 |                     os.environ[k] = old
524 | 
525 |     # Convert input to messages
526 |     def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
527 |         if isinstance(data, str):
528 |             return [{"role": "user", "content": data}]
529 |         if isinstance(data, list):
530 |             return data
531 |         raise HTTPException(status_code=400, detail="'input' must be a string or a list of messages")
532 |     messages = _to_messages(input_data)
533 | 
534 |     # Define a direct computer tool that implements the AsyncComputerHandler protocol
535 |     # and delegates to our existing automation/file/accessibility handlers.
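    # (AsyncComputerHandler is a runtime-checkable Protocol, so any object that
    # exposes these async methods -- screenshot, click, type, keypress, etc. --
    # can be passed to ComputerAgent as a computer tool. DirectComputer below
    # satisfies it by calling the local handler singletons directly, with no
    # remote Computer connection involved.)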
536 | from agent.computers import AsyncComputerHandler # runtime-checkable Protocol 537 | 538 | class DirectComputer(AsyncComputerHandler): 539 | def __init__(self): 540 | # use module-scope handler singletons created by HandlerFactory 541 | self._auto = automation_handler 542 | self._file = file_handler 543 | self._access = accessibility_handler 544 | 545 | async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: 546 | sys = platform.system().lower() 547 | if "darwin" in sys or sys in ("macos", "mac"): 548 | return "mac" 549 | if "windows" in sys: 550 | return "windows" 551 | return "linux" 552 | 553 | async def get_dimensions(self) -> tuple[int, int]: 554 | size = await self._auto.get_screen_size() 555 | return size["width"], size["height"] 556 | 557 | async def screenshot(self) -> str: 558 | img_b64 = await self._auto.screenshot() 559 | return img_b64["image_data"] 560 | 561 | async def click(self, x: int, y: int, button: str = "left") -> None: 562 | if button == "left": 563 | await self._auto.left_click(x, y) 564 | elif button == "right": 565 | await self._auto.right_click(x, y) 566 | else: 567 | await self._auto.left_click(x, y) 568 | 569 | async def double_click(self, x: int, y: int) -> None: 570 | await self._auto.double_click(x, y) 571 | 572 | async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: 573 | await self._auto.move_cursor(x, y) 574 | await self._auto.scroll(scroll_x, scroll_y) 575 | 576 | async def type(self, text: str) -> None: 577 | await self._auto.type_text(text) 578 | 579 | async def wait(self, ms: int = 1000) -> None: 580 | await asyncio.sleep(ms / 1000.0) 581 | 582 | async def move(self, x: int, y: int) -> None: 583 | await self._auto.move_cursor(x, y) 584 | 585 | async def keypress(self, keys: Union[List[str], str]) -> None: 586 | if isinstance(keys, str): 587 | parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys] 588 | else: 589 | parts = keys 590 | if len(parts) == 1: 591 | await self._auto.press_key(parts[0]) 592 | else: 593 | await self._auto.hotkey(parts) 594 | 595 | async def drag(self, path: List[Dict[str, int]]) -> None: 596 | if not path: 597 | return 598 | start = path[0] 599 | await self._auto.mouse_down(start["x"], start["y"]) 600 | for pt in path[1:]: 601 | await self._auto.move_cursor(pt["x"], pt["y"]) 602 | end = path[-1] 603 | await self._auto.mouse_up(end["x"], end["y"]) 604 | 605 | async def get_current_url(self) -> str: 606 | # Not available in this server context 607 | return "" 608 | 609 | async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: 610 | await self._auto.mouse_down(x, y, button="left") 611 | 612 | async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: 613 | await self._auto.mouse_up(x, y, button="left") 614 | 615 | # # Inline image URLs to base64 616 | # import base64, mimetypes, requests 617 | # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia) 618 | # HEADERS = { 619 | # "User-Agent": ( 620 | # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " 621 | # "AppleWebKit/537.36 (KHTML, like Gecko) " 622 | # "Chrome/124.0.0.0 Safari/537.36" 623 | # ) 624 | # } 625 | # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str: 626 | # ctype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream" 627 | # b64 = base64.b64encode(content_bytes).decode("utf-8") 628 | # return f"data:{ctype};base64,{b64}" 629 | # def 
inline_image_urls(messages):
630 |     # #     messages: List[{"role": "...","content":[...]}]
631 |     #     out = []
632 |     #     for m in messages:
633 |     #         if not isinstance(m.get("content"), list):
634 |     #             out.append(m)
635 |     #             continue
636 |     #         new_content = []
637 |     #         for part in (m.get("content") or []):
638 |     #             if part.get("type") == "input_image" and (url := part.get("image_url")):
639 |     #                 resp = requests.get(url, headers=HEADERS, timeout=30)
640 |     #                 resp.raise_for_status()
641 |     #                 new_content.append({
642 |     #                     "type": "input_image",
643 |     #                     "image_url": _to_data_url(resp.content, url, resp)
644 |     #                 })
645 |     #             else:
646 |     #                 new_content.append(part)
647 |     #         out.append({**m, "content": new_content})
648 |     #     return out
649 |     # messages = inline_image_urls(messages)
650 | 
651 |     error = None
652 | 
653 |     with _EnvOverride(env_overrides):
654 |         # Prepare tools: if caller did not pass tools, inject our DirectComputer
655 |         tools = agent_kwargs.get("tools")
656 |         if not tools:
657 |             tools = [DirectComputer()]
658 |             agent_kwargs = {**agent_kwargs, "tools": tools}
659 |         # Instantiate agent with our tools
660 |         agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]
661 | 
662 |         total_output: List[Any] = []
663 |         total_usage: Dict[str, Any] = {}
664 | 
665 |         pending_computer_call_ids = set()
666 |         try:
667 |             async for result in agent.run(messages):
668 |                 total_output += result["output"]
669 |                 # Try to collect usage if present
670 |                 if isinstance(result, dict) and "usage" in result and isinstance(result["usage"], dict):
671 |                     # Merge usage counters
672 |                     for k, v in result["usage"].items():
673 |                         if isinstance(v, (int, float)):
674 |                             total_usage[k] = total_usage.get(k, 0) + v
675 |                         else:
676 |                             total_usage[k] = v
677 |                 for msg in result.get("output", []):
678 |                     if msg.get("type") == "computer_call":
679 |                         pending_computer_call_ids.add(msg["call_id"])
680 |                     elif msg.get("type") == "computer_call_output":
681 |                         pending_computer_call_ids.discard(msg["call_id"])
682 |                 # exit if no pending computer calls
683 |                 if not pending_computer_call_ids:
684 |                     break
685 |         except Exception as e:
686 |             logger.error(f"Error running agent: {str(e)}")
687 |             logger.error(traceback.format_exc())
688 |             error = str(e)
689 | 
690 |     # Build response payload
691 |     payload = {
692 |         "model": model,
693 |         "error": error,
694 |         "output": total_output,
695 |         "usage": total_usage,
696 |         "status": "completed" if not error else "failed"
697 |     }
698 | 
699 |     # CORS: allow any origin
700 |     headers = {
701 |         "Access-Control-Allow-Origin": "*",
702 |         "Cache-Control": "no-cache",
703 |         "Connection": "keep-alive",
704 |     }
705 |     return JSONResponse(content=payload, headers=headers)
706 | 
707 | 
708 | if __name__ == "__main__":
709 |     uvicorn.run(app, host="0.0.0.0", port=8000)
710 | ```
--------------------------------------------------------------------------------
/libs/lume/src/Server/Handlers.swift:
--------------------------------------------------------------------------------
```swift
1 | import ArgumentParser
2 | import Foundation
3 | import Virtualization
4 | 
5 | @MainActor
6 | extension Server {
7 |     // MARK: - VM Management Handlers
8 | 
9 |     func handleListVMs(storage: String?
= nil) async throws -> HTTPResponse { 10 | do { 11 | let vmController = LumeController() 12 | let vms = try vmController.list(storage: storage) 13 | return try .json(vms) 14 | } catch { 15 | print( 16 | "ERROR: Failed to list VMs: \(error.localizedDescription), storage=\(String(describing: storage))" 17 | ) 18 | return .badRequest(message: error.localizedDescription) 19 | } 20 | } 21 | 22 | func handleGetVM(name: String, storage: String? = nil) async throws -> HTTPResponse { 23 | print("Getting VM details: name=\(name), storage=\(String(describing: storage))") 24 | 25 | do { 26 | let vmController = LumeController() 27 | print("Created VM controller, attempting to get VM") 28 | let vm = try vmController.get(name: name, storage: storage) 29 | print("Successfully retrieved VM") 30 | 31 | // Check for nil values that might cause crashes 32 | if vm.vmDirContext.config.macAddress == nil { 33 | print("ERROR: VM has nil macAddress") 34 | return .badRequest(message: "VM configuration is invalid (nil macAddress)") 35 | } 36 | print("MacAddress check passed") 37 | 38 | // Log that we're about to access details 39 | print("Preparing VM details response") 40 | 41 | // Print the full details object for debugging 42 | let details = vm.details 43 | print("VM DETAILS: \(details)") 44 | print(" name: \(details.name)") 45 | print(" os: \(details.os)") 46 | print(" cpuCount: \(details.cpuCount)") 47 | print(" memorySize: \(details.memorySize)") 48 | print(" diskSize: \(details.diskSize)") 49 | print(" display: \(details.display)") 50 | print(" status: \(details.status)") 51 | print(" vncUrl: \(String(describing: details.vncUrl))") 52 | print(" ipAddress: \(String(describing: details.ipAddress))") 53 | print(" locationName: \(details.locationName)") 54 | 55 | // Serialize the VM details 56 | print("About to serialize VM details") 57 | let response = try HTTPResponse.json(vm.details) 58 | print("Successfully serialized VM details") 59 | return response 60 | 61 | } catch { 62 | // This will catch errors from both vmController.get and the json serialization 63 | print("ERROR: Failed to get VM details: \(error.localizedDescription)") 64 | return .badRequest(message: error.localizedDescription) 65 | } 66 | } 67 | 68 | func handleCreateVM(_ body: Data?) async throws -> HTTPResponse { 69 | guard let body = body, 70 | let request = try? JSONDecoder().decode(CreateVMRequest.self, from: body) 71 | else { 72 | return HTTPResponse( 73 | statusCode: .badRequest, 74 | headers: ["Content-Type": "application/json"], 75 | body: try JSONEncoder().encode(APIError(message: "Invalid request body")) 76 | ) 77 | } 78 | 79 | do { 80 | let sizes = try request.parse() 81 | let vmController = LumeController() 82 | try await vmController.create( 83 | name: request.name, 84 | os: request.os, 85 | diskSize: sizes.diskSize, 86 | cpuCount: request.cpu, 87 | memorySize: sizes.memory, 88 | display: request.display, 89 | ipsw: request.ipsw, 90 | storage: request.storage 91 | ) 92 | 93 | return HTTPResponse( 94 | statusCode: .ok, 95 | headers: ["Content-Type": "application/json"], 96 | body: try JSONEncoder().encode([ 97 | "message": "VM created successfully", "name": request.name, 98 | ]) 99 | ) 100 | } catch { 101 | return HTTPResponse( 102 | statusCode: .badRequest, 103 | headers: ["Content-Type": "application/json"], 104 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 105 | ) 106 | } 107 | } 108 | 109 | func handleDeleteVM(name: String, storage: String? 
= nil) async throws -> HTTPResponse { 110 | do { 111 | let vmController = LumeController() 112 | try await vmController.delete(name: name, storage: storage) 113 | return HTTPResponse( 114 | statusCode: .ok, headers: ["Content-Type": "application/json"], body: Data()) 115 | } catch { 116 | return HTTPResponse( 117 | statusCode: .badRequest, headers: ["Content-Type": "application/json"], 118 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription))) 119 | } 120 | } 121 | 122 | func handleCloneVM(_ body: Data?) async throws -> HTTPResponse { 123 | guard let body = body, 124 | let request = try? JSONDecoder().decode(CloneRequest.self, from: body) 125 | else { 126 | return HTTPResponse( 127 | statusCode: .badRequest, 128 | headers: ["Content-Type": "application/json"], 129 | body: try JSONEncoder().encode(APIError(message: "Invalid request body")) 130 | ) 131 | } 132 | 133 | do { 134 | let vmController = LumeController() 135 | try vmController.clone( 136 | name: request.name, 137 | newName: request.newName, 138 | sourceLocation: request.sourceLocation, 139 | destLocation: request.destLocation 140 | ) 141 | 142 | return HTTPResponse( 143 | statusCode: .ok, 144 | headers: ["Content-Type": "application/json"], 145 | body: try JSONEncoder().encode([ 146 | "message": "VM cloned successfully", 147 | "source": request.name, 148 | "destination": request.newName, 149 | ]) 150 | ) 151 | } catch { 152 | return HTTPResponse( 153 | statusCode: .badRequest, 154 | headers: ["Content-Type": "application/json"], 155 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 156 | ) 157 | } 158 | } 159 | 160 | // MARK: - VM Operation Handlers 161 | 162 | func handleSetVM(name: String, body: Data?) async throws -> HTTPResponse { 163 | guard let body = body, 164 | let request = try? JSONDecoder().decode(SetVMRequest.self, from: body) 165 | else { 166 | return HTTPResponse( 167 | statusCode: .badRequest, 168 | headers: ["Content-Type": "application/json"], 169 | body: try JSONEncoder().encode(APIError(message: "Invalid request body")) 170 | ) 171 | } 172 | 173 | do { 174 | let vmController = LumeController() 175 | let sizes = try request.parse() 176 | try vmController.updateSettings( 177 | name: name, 178 | cpu: request.cpu, 179 | memory: sizes.memory, 180 | diskSize: sizes.diskSize, 181 | display: sizes.display?.string, 182 | storage: request.storage 183 | ) 184 | 185 | return HTTPResponse( 186 | statusCode: .ok, 187 | headers: ["Content-Type": "application/json"], 188 | body: try JSONEncoder().encode(["message": "VM settings updated successfully"]) 189 | ) 190 | } catch { 191 | return HTTPResponse( 192 | statusCode: .badRequest, 193 | headers: ["Content-Type": "application/json"], 194 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 195 | ) 196 | } 197 | } 198 | 199 | func handleStopVM(name: String, storage: String? 
= nil) async throws -> HTTPResponse { 200 | Logger.info( 201 | "Stopping VM", metadata: ["name": name, "storage": String(describing: storage)]) 202 | 203 | do { 204 | Logger.info("Creating VM controller", metadata: ["name": name]) 205 | let vmController = LumeController() 206 | 207 | Logger.info("Calling stopVM on controller", metadata: ["name": name]) 208 | try await vmController.stopVM(name: name, storage: storage) 209 | 210 | Logger.info( 211 | "VM stopped, waiting 5 seconds for locks to clear", metadata: ["name": name]) 212 | 213 | // Add a delay to ensure locks are fully released before returning 214 | for i in 1...5 { 215 | try? await Task.sleep(nanoseconds: 1_000_000_000) 216 | Logger.info("Lock clearing delay", metadata: ["name": name, "seconds": "\(i)/5"]) 217 | } 218 | 219 | // Verify the VM is really in a stopped state 220 | Logger.info("Verifying VM is stopped", metadata: ["name": name]) 221 | let vm = try? vmController.get(name: name, storage: storage) 222 | if let vm = vm, vm.details.status == "running" { 223 | Logger.info( 224 | "VM still reports as running despite stop operation", 225 | metadata: ["name": name, "severity": "warning"]) 226 | } else { 227 | Logger.info( 228 | "Verification complete: VM is in stopped state", metadata: ["name": name]) 229 | } 230 | 231 | Logger.info("Returning successful response", metadata: ["name": name]) 232 | return HTTPResponse( 233 | statusCode: .ok, 234 | headers: ["Content-Type": "application/json"], 235 | body: try JSONEncoder().encode(["message": "VM stopped successfully"]) 236 | ) 237 | } catch { 238 | Logger.error( 239 | "Failed to stop VM", 240 | metadata: [ 241 | "name": name, 242 | "error": error.localizedDescription, 243 | "storage": String(describing: storage), 244 | ]) 245 | return HTTPResponse( 246 | statusCode: .badRequest, 247 | headers: ["Content-Type": "application/json"], 248 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 249 | ) 250 | } 251 | } 252 | 253 | func handleRunVM(name: String, body: Data?) async throws -> HTTPResponse { 254 | Logger.info("Running VM", metadata: ["name": name]) 255 | 256 | // Log the raw body data if available 257 | if let body = body, let bodyString = String(data: body, encoding: .utf8) { 258 | Logger.info("Run VM raw request body", metadata: ["name": name, "body": bodyString]) 259 | } else { 260 | Logger.info("No request body or could not decode as string", metadata: ["name": name]) 261 | } 262 | 263 | do { 264 | Logger.info("Creating VM controller and parsing request", metadata: ["name": name]) 265 | let request = 266 | body.flatMap { try? JSONDecoder().decode(RunVMRequest.self, from: $0) } 267 | ?? RunVMRequest( 268 | noDisplay: nil, sharedDirectories: nil, recoveryMode: nil, storage: nil) 269 | 270 | Logger.info( 271 | "Parsed request", 272 | metadata: [ 273 | "name": name, 274 | "noDisplay": String(describing: request.noDisplay), 275 | "sharedDirectories": "\(request.sharedDirectories?.count ?? 0)", 276 | "storage": String(describing: request.storage), 277 | ]) 278 | 279 | Logger.info("Parsing shared directories", metadata: ["name": name]) 280 | let dirs = try request.parse() 281 | Logger.info( 282 | "Successfully parsed shared directories", 283 | metadata: ["name": name, "count": "\(dirs.count)"]) 284 | 285 | // Start VM in background 286 | Logger.info("Starting VM in background", metadata: ["name": name]) 287 | startVM( 288 | name: name, 289 | noDisplay: request.noDisplay ?? 
false, 290 | sharedDirectories: dirs, 291 | recoveryMode: request.recoveryMode ?? false, 292 | storage: request.storage 293 | ) 294 | Logger.info("VM start initiated in background", metadata: ["name": name]) 295 | 296 | // Return response immediately 297 | return HTTPResponse( 298 | statusCode: .accepted, 299 | headers: ["Content-Type": "application/json"], 300 | body: try JSONEncoder().encode([ 301 | "message": "VM start initiated", 302 | "name": name, 303 | "status": "pending", 304 | ]) 305 | ) 306 | } catch { 307 | Logger.error( 308 | "Failed to run VM", 309 | metadata: [ 310 | "name": name, 311 | "error": error.localizedDescription, 312 | ]) 313 | return HTTPResponse( 314 | statusCode: .badRequest, 315 | headers: ["Content-Type": "application/json"], 316 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 317 | ) 318 | } 319 | } 320 | 321 | // MARK: - Image Management Handlers 322 | 323 | func handleIPSW() async throws -> HTTPResponse { 324 | do { 325 | let vmController = LumeController() 326 | let url = try await vmController.getLatestIPSWURL() 327 | return HTTPResponse( 328 | statusCode: .ok, 329 | headers: ["Content-Type": "application/json"], 330 | body: try JSONEncoder().encode(["url": url.absoluteString]) 331 | ) 332 | } catch { 333 | return HTTPResponse( 334 | statusCode: .badRequest, 335 | headers: ["Content-Type": "application/json"], 336 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 337 | ) 338 | } 339 | } 340 | 341 | func handlePull(_ body: Data?) async throws -> HTTPResponse { 342 | guard let body = body, 343 | let request = try? JSONDecoder().decode(PullRequest.self, from: body) 344 | else { 345 | return HTTPResponse( 346 | statusCode: .badRequest, 347 | headers: ["Content-Type": "application/json"], 348 | body: try JSONEncoder().encode(APIError(message: "Invalid request body")) 349 | ) 350 | } 351 | 352 | do { 353 | let vmController = LumeController() 354 | try await vmController.pullImage( 355 | image: request.image, 356 | name: request.name, 357 | registry: request.registry, 358 | organization: request.organization, 359 | storage: request.storage 360 | ) 361 | 362 | return HTTPResponse( 363 | statusCode: .ok, 364 | headers: ["Content-Type": "application/json"], 365 | body: try JSONEncoder().encode([ 366 | "message": "Image pulled successfully", 367 | "image": request.image, 368 | "name": request.name ?? "default", 369 | ]) 370 | ) 371 | } catch { 372 | return HTTPResponse( 373 | statusCode: .badRequest, 374 | headers: ["Content-Type": "application/json"], 375 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 376 | ) 377 | } 378 | } 379 | 380 | func handlePruneImages() async throws -> HTTPResponse { 381 | do { 382 | let vmController = LumeController() 383 | try await vmController.pruneImages() 384 | return HTTPResponse( 385 | statusCode: .ok, 386 | headers: ["Content-Type": "application/json"], 387 | body: try JSONEncoder().encode(["message": "Successfully removed cached images"]) 388 | ) 389 | } catch { 390 | return HTTPResponse( 391 | statusCode: .badRequest, 392 | headers: ["Content-Type": "application/json"], 393 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 394 | ) 395 | } 396 | } 397 | 398 | func handlePush(_ body: Data?) async throws -> HTTPResponse { 399 | guard let body = body, 400 | let request = try? 
JSONDecoder().decode(PushRequest.self, from: body) 401 | else { 402 | return HTTPResponse( 403 | statusCode: .badRequest, 404 | headers: ["Content-Type": "application/json"], 405 | body: try JSONEncoder().encode(APIError(message: "Invalid request body")) 406 | ) 407 | } 408 | 409 | // Trigger push asynchronously, return Accepted immediately 410 | Task.detached { @MainActor @Sendable in 411 | do { 412 | let vmController = LumeController() 413 | try await vmController.pushImage( 414 | name: request.name, 415 | imageName: request.imageName, 416 | tags: request.tags, 417 | registry: request.registry, 418 | organization: request.organization, 419 | storage: request.storage, 420 | chunkSizeMb: request.chunkSizeMb, 421 | verbose: false, // Verbose typically handled by server logs 422 | dryRun: false, // Default API behavior is likely non-dry-run 423 | reassemble: false // Default API behavior is likely non-reassemble 424 | ) 425 | print( 426 | "Background push completed successfully for image: \(request.imageName):\(request.tags.joined(separator: ","))" 427 | ) 428 | } catch { 429 | print( 430 | "Background push failed for image: \(request.imageName):\(request.tags.joined(separator: ",")) - Error: \(error.localizedDescription)" 431 | ) 432 | } 433 | } 434 | 435 | return HTTPResponse( 436 | statusCode: .accepted, 437 | headers: ["Content-Type": "application/json"], 438 | body: try JSONEncoder().encode([ 439 | "message": AnyEncodable("Push initiated in background"), 440 | "name": AnyEncodable(request.name), 441 | "imageName": AnyEncodable(request.imageName), 442 | "tags": AnyEncodable(request.tags), 443 | ]) 444 | ) 445 | } 446 | 447 | func handleGetImages(_ request: HTTPRequest) async throws -> HTTPResponse { 448 | let pathAndQuery = request.path.split(separator: "?", maxSplits: 1) 449 | let queryParams = 450 | pathAndQuery.count > 1 451 | ? pathAndQuery[1] 452 | .split(separator: "&") 453 | .reduce(into: [String: String]()) { dict, param in 454 | let parts = param.split(separator: "=", maxSplits: 1) 455 | if parts.count == 2 { 456 | dict[String(parts[0])] = String(parts[1]) 457 | } 458 | } : [:] 459 | 460 | let organization = queryParams["organization"] ?? "trycua" 461 | 462 | do { 463 | let vmController = LumeController() 464 | let imageList = try await vmController.getImages(organization: organization) 465 | 466 | // Create a response format that matches the CLI output 467 | let response = imageList.local.map { 468 | [ 469 | "repository": $0.repository, 470 | "imageId": $0.imageId, 471 | ] 472 | } 473 | 474 | return HTTPResponse( 475 | statusCode: .ok, 476 | headers: ["Content-Type": "application/json"], 477 | body: try JSONEncoder().encode(response) 478 | ) 479 | } catch { 480 | return HTTPResponse( 481 | statusCode: .badRequest, 482 | headers: ["Content-Type": "application/json"], 483 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 484 | ) 485 | } 486 | } 487 | 488 | // MARK: - Config Management Handlers 489 | 490 | func handleGetConfig() async throws -> HTTPResponse { 491 | do { 492 | let vmController = LumeController() 493 | let settings = vmController.getSettings() 494 | return try .json(settings) 495 | } catch { 496 | return .badRequest(message: error.localizedDescription) 497 | } 498 | } 499 | 500 | struct ConfigRequest: Codable { 501 | let homeDirectory: String? 502 | let cacheDirectory: String? 503 | let cachingEnabled: Bool? 504 | } 505 | 506 | func handleUpdateConfig(_ body: Data?) 
async throws -> HTTPResponse { 507 | guard let body = body, 508 | let request = try? JSONDecoder().decode(ConfigRequest.self, from: body) 509 | else { 510 | return HTTPResponse( 511 | statusCode: .badRequest, 512 | headers: ["Content-Type": "application/json"], 513 | body: try JSONEncoder().encode(APIError(message: "Invalid request body")) 514 | ) 515 | } 516 | 517 | do { 518 | let vmController = LumeController() 519 | 520 | if let homeDir = request.homeDirectory { 521 | try vmController.setHomeDirectory(homeDir) 522 | } 523 | 524 | if let cacheDir = request.cacheDirectory { 525 | try vmController.setCacheDirectory(path: cacheDir) 526 | } 527 | 528 | if let cachingEnabled = request.cachingEnabled { 529 | try vmController.setCachingEnabled(cachingEnabled) 530 | } 531 | 532 | return HTTPResponse( 533 | statusCode: .ok, 534 | headers: ["Content-Type": "application/json"], 535 | body: try JSONEncoder().encode(["message": "Configuration updated successfully"]) 536 | ) 537 | } catch { 538 | return HTTPResponse( 539 | statusCode: .badRequest, 540 | headers: ["Content-Type": "application/json"], 541 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 542 | ) 543 | } 544 | } 545 | 546 | func handleGetLocations() async throws -> HTTPResponse { 547 | do { 548 | let vmController = LumeController() 549 | let locations = vmController.getLocations() 550 | return try .json(locations) 551 | } catch { 552 | return .badRequest(message: error.localizedDescription) 553 | } 554 | } 555 | 556 | struct LocationRequest: Codable { 557 | let name: String 558 | let path: String 559 | } 560 | 561 | func handleAddLocation(_ body: Data?) async throws -> HTTPResponse { 562 | guard let body = body, 563 | let request = try? JSONDecoder().decode(LocationRequest.self, from: body) 564 | else { 565 | return HTTPResponse( 566 | statusCode: .badRequest, 567 | headers: ["Content-Type": "application/json"], 568 | body: try JSONEncoder().encode(APIError(message: "Invalid request body")) 569 | ) 570 | } 571 | 572 | do { 573 | let vmController = LumeController() 574 | try vmController.addLocation(name: request.name, path: request.path) 575 | 576 | return HTTPResponse( 577 | statusCode: .ok, 578 | headers: ["Content-Type": "application/json"], 579 | body: try JSONEncoder().encode([ 580 | "message": "Location added successfully", 581 | "name": request.name, 582 | "path": request.path, 583 | ]) 584 | ) 585 | } catch { 586 | return HTTPResponse( 587 | statusCode: .badRequest, 588 | headers: ["Content-Type": "application/json"], 589 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 590 | ) 591 | } 592 | } 593 | 594 | func handleRemoveLocation(_ name: String) async throws -> HTTPResponse { 595 | do { 596 | let vmController = LumeController() 597 | try vmController.removeLocation(name: name) 598 | return HTTPResponse( 599 | statusCode: .ok, 600 | headers: ["Content-Type": "application/json"], 601 | body: try JSONEncoder().encode(["message": "Location removed successfully"]) 602 | ) 603 | } catch { 604 | return HTTPResponse( 605 | statusCode: .badRequest, 606 | headers: ["Content-Type": "application/json"], 607 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 608 | ) 609 | } 610 | } 611 | 612 | func handleSetDefaultLocation(_ name: String) async throws -> HTTPResponse { 613 | do { 614 | let vmController = LumeController() 615 | try vmController.setDefaultLocation(name: name) 616 | return HTTPResponse( 617 | statusCode: .ok, 618 | headers: 
["Content-Type": "application/json"], 619 | body: try JSONEncoder().encode(["message": "Default location set successfully"]) 620 | ) 621 | } catch { 622 | return HTTPResponse( 623 | statusCode: .badRequest, 624 | headers: ["Content-Type": "application/json"], 625 | body: try JSONEncoder().encode(APIError(message: error.localizedDescription)) 626 | ) 627 | } 628 | } 629 | 630 | // MARK: - Log Handlers 631 | 632 | func handleGetLogs(type: String?, lines: Int?) async throws -> HTTPResponse { 633 | do { 634 | let logType = type?.lowercased() ?? "all" 635 | let infoPath = "/tmp/lume_daemon.log" 636 | let errorPath = "/tmp/lume_daemon.error.log" 637 | 638 | let fileManager = FileManager.default 639 | var response: [String: String] = [:] 640 | 641 | // Function to read log files 642 | func readLogFile(path: String) -> String? { 643 | guard fileManager.fileExists(atPath: path) else { 644 | return nil 645 | } 646 | 647 | do { 648 | let content = try String(contentsOfFile: path, encoding: .utf8) 649 | 650 | // If lines parameter is provided, return only the specified number of lines from the end 651 | if let lineCount = lines { 652 | let allLines = content.components(separatedBy: .newlines) 653 | let startIndex = max(0, allLines.count - lineCount) 654 | let lastLines = Array(allLines[startIndex...]) 655 | return lastLines.joined(separator: "\n") 656 | } 657 | 658 | return content 659 | } catch { 660 | return "Error reading log file: \(error.localizedDescription)" 661 | } 662 | } 663 | 664 | // Get logs based on requested type 665 | if logType == "info" || logType == "all" { 666 | response["info"] = readLogFile(path: infoPath) ?? "Info log file not found" 667 | } 668 | 669 | if logType == "error" || logType == "all" { 670 | response["error"] = readLogFile(path: errorPath) ?? "Error log file not found" 671 | } 672 | 673 | return try .json(response) 674 | } catch { 675 | return .badRequest(message: error.localizedDescription) 676 | } 677 | } 678 | 679 | // MARK: - Private Helper Methods 680 | 681 | nonisolated private func startVM( 682 | name: String, 683 | noDisplay: Bool, 684 | sharedDirectories: [SharedDirectory] = [], 685 | recoveryMode: Bool = false, 686 | storage: String? 
= nil 687 | ) { 688 | Logger.info( 689 | "Starting VM in detached task", 690 | metadata: [ 691 | "name": name, 692 | "noDisplay": "\(noDisplay)", 693 | "recoveryMode": "\(recoveryMode)", 694 | "storage": String(describing: storage), 695 | ]) 696 | 697 | Task.detached { @MainActor @Sendable in 698 | Logger.info("Background task started for VM", metadata: ["name": name]) 699 | do { 700 | Logger.info("Creating VM controller in background task", metadata: ["name": name]) 701 | let vmController = LumeController() 702 | 703 | Logger.info( 704 | "Calling runVM on controller", 705 | metadata: [ 706 | "name": name, 707 | "noDisplay": "\(noDisplay)", 708 | ]) 709 | try await vmController.runVM( 710 | name: name, 711 | noDisplay: noDisplay, 712 | sharedDirectories: sharedDirectories, 713 | recoveryMode: recoveryMode, 714 | storage: storage 715 | ) 716 | Logger.info("VM started successfully in background task", metadata: ["name": name]) 717 | } catch { 718 | Logger.error( 719 | "Failed to start VM in background task", 720 | metadata: [ 721 | "name": name, 722 | "error": error.localizedDescription, 723 | ]) 724 | } 725 | } 726 | Logger.info("Background task dispatched for VM", metadata: ["name": name]) 727 | } 728 | } 729 | ``` -------------------------------------------------------------------------------- /blog/build-your-own-operator-on-macos-2.md: -------------------------------------------------------------------------------- ```markdown 1 | # Build Your Own Operator on macOS - Part 2 2 | 3 | *Published on April 27, 2025 by Francesco Bonacci* 4 | 5 | In our [previous post](build-your-own-operator-on-macos-1.md), we built a basic Computer-Use Operator from scratch using OpenAI's `computer-use-preview` model and our [cua-computer](https://pypi.org/project/cua-computer) package. While educational, implementing the control loop manually can be tedious and error-prone. 6 | 7 | In this follow-up, we'll explore our [cua-agent](https://pypi.org/project/cua-agent) framework - a high-level abstraction that handles all the complexity of VM interaction, screenshot processing, model communication, and action execution automatically. 8 | 9 | <div align="center"> 10 | <video src="https://github.com/user-attachments/assets/0be7e3e3-eead-4646-a4a3-5bb392501ee7" width="600" controls></video> 11 | </div> 12 | 13 | ## What You'll Learn 14 | 15 | By the end of this tutorial, you'll be able to: 16 | - Set up the `cua-agent` framework with various agent loop types and model providers 17 | - Understand the different agent loop types and their capabilities 18 | - Work with local models for cost-effective workflows 19 | - Use a simple UI for your operator 20 | 21 | **Prerequisites:** 22 | - Completed setup from Part 1 ([lume CLI installed](https://github.com/trycua/cua?tab=readme-ov-file#option-2-full-computer-use-agent-capabilities), macOS CUA image already pulled) 23 | - Python 3.10+. We recommend using Conda (or Anaconda) to create an ad hoc Python environment. 24 | - API keys for OpenAI and/or Anthropic (optional for local models) 25 | 26 | **Estimated Time:** 30-45 minutes 27 | 28 | ## Introduction to cua-agent 29 | 30 | The `cua-agent` framework is designed to simplify building Computer-Use Agents. It abstracts away the complex interaction loop we built manually in Part 1, letting you focus on defining tasks rather than implementing the machinery. 
Among other features, it includes:
31 | 
32 | - **Multiple Provider Support**: Works with OpenAI, Anthropic, UI-TARS, local models (via Ollama), or any OpenAI-compatible model (e.g. LM Studio, vLLM, LocalAI, OpenRouter, Groq, etc.)
33 | - **Flexible Loop Types**: Different implementations optimized for various models (e.g. OpenAI vs. Anthropic)
34 | - **Structured Responses**: Clean, consistent output following the OpenAI Agent SDK specification we touched on in Part 1
35 | - **Local Model Support**: Run cost-effectively with locally hosted models (Ollama, LM Studio, vLLM, LocalAI, etc.)
36 | - **Gradio UI**: Optional visual interface for interacting with your agent
37 | 
38 | ## Installation
39 | 
40 | Let's start by installing the `cua-agent` package. You can install it with all features or selectively install only what you need.
41 | 
42 | From your Python 3.10+ environment, run:
43 | 
44 | ```bash
45 | # For all features
46 | pip install "cua-agent[all]"
47 | 
48 | # Or selectively install only what you need
49 | pip install "cua-agent[openai]"    # OpenAI support
50 | pip install "cua-agent[anthropic]" # Anthropic support
51 | pip install "cua-agent[uitars]"    # UI-TARS support
52 | pip install "cua-agent[omni]"      # OmniParser + VLMs support
53 | pip install "cua-agent[ui]"        # Gradio UI
54 | ```
55 | 
56 | ## Setting Up Your Environment
57 | 
58 | Before running any code examples, let's set up a proper environment:
59 | 
60 | 1. **Create a new directory** for your project:
61 |    ```bash
62 |    mkdir cua-agent-tutorial
63 |    cd cua-agent-tutorial
64 |    ```
65 | 
66 | 2. **Set up a Python environment** using one of these methods:
67 | 
68 |    **Option A: Using conda command line**
69 |    ```bash
70 |    # Using conda
71 |    conda create -n cua-agent python=3.10
72 |    conda activate cua-agent
73 |    ```
74 | 
75 |    **Option B: Using Anaconda Navigator UI**
76 |    - Open Anaconda Navigator
77 |    - Click on "Environments" in the left sidebar
78 |    - Click the "Create" button at the bottom
79 |    - Name your environment "cua-agent"
80 |    - Select Python 3.10
81 |    - Click "Create"
82 |    - Once created, select the environment and click "Open Terminal" to activate it
83 | 
84 |    **Option C: Using venv**
85 |    ```bash
86 |    python -m venv cua-env
87 |    source cua-env/bin/activate  # On macOS/Linux
88 |    ```
89 | 
90 | 3. **Install the cua-agent package**:
91 |    ```bash
92 |    pip install "cua-agent[all]"
93 |    ```
94 | 
95 | 4. **Set up your API keys as environment variables**:
96 |    ```bash
97 |    # For OpenAI models
98 |    export OPENAI_API_KEY=your_openai_key_here
99 | 
100 |    # For Anthropic models (if needed)
101 |    export ANTHROPIC_API_KEY=your_anthropic_key_here
102 |    ```
103 | 
104 | 5. **Create a Python file or notebook**:
105 | 
106 |    **Option A: Create a Python script**
107 |    ```bash
108 |    # For a Python script
109 |    touch cua_agent_example.py
110 |    ```
111 | 
112 |    **Option B: Use VS Code notebooks**
113 |    - Open VS Code
114 |    - Install the Python extension if you haven't already
115 |    - Create a new file with a `.ipynb` extension (e.g., `cua_agent_tutorial.ipynb`)
116 |    - Select your Python environment when prompted
117 |    - You can now create and run code cells in the notebook interface
118 | 
119 | Now you're ready to run the code examples!
120 | 
121 | ## Understanding Agent Loops
122 | 
123 | If you recall from Part 1, we had to implement a custom interaction loop to interact with the computer-use-preview model.
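In rough pseudocode, that hand-rolled loop looked something like this (a simplified sketch for orientation only; `send_to_model` and `execute_action` stand in for the request and dispatch code we wrote by hand in Part 1, not real APIs):

```python
# Illustrative sketch of the manual pattern from Part 1 -- not a drop-in implementation
previous_id = None
while True:
    screenshot = await computer.interface.screenshot()       # capture the VM's current state
    response = send_to_model(task, screenshot, previous_id)  # ask the model for the next step
    calls = [o for o in response.output if o.type == "computer_call"]
    if not calls:
        break                                                # no actions left: task complete
    for call in calls:
        await execute_action(computer, call.action)          # click / type / scroll in the VM
    previous_id = response.id                                # chain requests to keep context
```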
124 | 125 | In the `cua-agent` framework, an **Agent Loop** is the core abstraction that implements the continuous interaction cycle between an AI model and the computer environment. It manages the flow of: 126 | 1. Capturing screenshots of the computer's state 127 | 2. Processing these screenshots (with or without UI element detection) 128 | 3. Sending this visual context to an AI model along with the task instructions 129 | 4. Receiving the model's decisions on what actions to take 130 | 5. Safely executing these actions in the environment 131 | 6. Repeating this cycle until the task is complete 132 | 133 | The loop handles all the complex error handling, retries, context management, and model-specific interaction patterns so you don't have to implement them yourself. 134 | 135 | While the core concept remains the same across all agent loops, different AI models require specialized handling for optimal performance. To address this, the framework provides 4 different agent loop implementations, each designed for different computer-use modalities. 136 | | Agent Loop | Supported Models | Description | Set-Of-Marks | 137 | |:-----------|:-----------------|:------------|:-------------| 138 | | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA Preview model | Not Required | 139 | | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use Beta Tools | Not Required | 140 | | `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required | 141 | | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser | 142 | 143 | Each loop handles the same basic pattern we implemented manually in Part 1: 144 | 1. Take a screenshot of the VM 145 | 2. Send the screenshot and task to the AI model 146 | 3. Receive an action to perform 147 | 4. Execute the action 148 | 5. Repeat until the task is complete 149 | 150 | ### Why Different Agent Loops? 151 | 152 | The `cua-agent` framework provides multiple agent loop implementations to abstract away the complexity of interacting with different CUA models. Each provider has unique API structures, response formats, conventions and capabilities that require specialized handling: 153 | 154 | - **OpenAI Loop**: Uses the Responses API with a specific `computer_call_output` format for sending screenshots after actions. Requires handling safety checks and maintains a chain of requests using `previous_response_id`. 155 | 156 | - **Anthropic Loop**: Implements a [multi-agent loop pattern](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop) with a sophisticated message handling system, supporting various API providers (Anthropic, Bedrock, Vertex) with token management and prompt caching capabilities. 157 | 158 | - **UI-TARS Loop**: Requires custom message formatting and specialized parsing to extract actions from text responses using a "box token" system for UI element identification. 
159 | 160 | - **OMNI Loop**: Uses [Microsoft's OmniParser](https://github.com/microsoft/OmniParser) to create a [Set-of-Marks (SoM)](https://arxiv.org/abs/2310.11441) representation of the UI, enabling any vision-language model to interact with interfaces without specialized UI training. 161 | 162 | - **AgentLoop.OMNI**: The most flexible option that works with virtually any vision-language model including local and open-source ones. Perfect for cost-effective development or when you need to use models without native computer-use capabilities. 163 | 164 | These abstractions allow you to easily switch between providers without changing your application code. All loop implementations are available in the [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent/agent/providers). 165 | 166 | Choosing the right agent loop depends not only on your API access and technical requirements but also on the specific tasks you need to accomplish. To make an informed decision, it's helpful to understand how these underlying models perform across different computing environments – from desktop operating systems to web browsers and mobile interfaces. 167 | 168 | ## Computer-Use Model Capabilities 169 | 170 | The performance of different Computer-Use models varies significantly across tasks. These benchmark evaluations measure an agent's ability to follow instructions and complete real-world tasks in different computing environments. 171 | 172 | | Benchmark type | Benchmark | UI-TARS-1.5 | OpenAI CUA | Claude 3.7 | Previous SOTA | Human | 173 | |----------------|--------------------------------------------------------------------------------------------------------------------------------------------------|-------------|-------------|-------------|----------------------|-------------| 174 | | **Computer Use** | [OSworld](https://arxiv.org/abs/2404.07972) (100 steps) | **42.5** | 36.4 | 28 | 38.1 (200 step) | 72.4 | 175 | | | [Windows Agent Arena](https://arxiv.org/abs/2409.08264) (50 steps) | **42.1** | - | - | 29.8 | - | 176 | | **Browser Use** | [WebVoyager](https://arxiv.org/abs/2401.13919) | 84.8 | **87** | 84.1 | 87 | - | 177 | | | [Online-Mind2web](https://arxiv.org/abs/2504.01382) | **75.8** | 71 | 62.9 | 71 | - | 178 | | **Phone Use** | [Android World](https://arxiv.org/abs/2405.14573) | **64.2** | - | - | 59.5 | - | 179 | 180 | ### When to Use Each Loop 181 | 182 | - **AgentLoop.OPENAI**: Choose when you have OpenAI Tier 3 access and need the most capable computer-use agent for web-based tasks. Uses the same [OpenAI Computer-Use Loop](https://platform.openai.com/docs/guides/tools-computer-use) as Part 1, delivering strong performance on browser-based benchmarks. 183 | 184 | - **AgentLoop.ANTHROPIC**: Ideal for users with Anthropic API access who need strong reasoning capabilities with computer-use abilities. Works with `claude-3-5-sonnet-20240620` and `claude-3-7-sonnet-20250219` models following [Anthropic's Computer-Use tools](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop). 185 | 186 | - **AgentLoop.UITARS**: Best for scenarios requiring more powerful OS/desktop, and latency-sensitive automation, as UI-TARS-1.5 leads in OS capabilities benchmarks. Requires running the model locally or accessing it through compatible endpoints (e.g. on Hugging Face). 187 | 188 | - **AgentLoop.OMNI**: The most flexible option that works with virtually any vision-language model including local and open-source ones. 
Perfect for cost-effective development or when you need to use models without native computer-use capabilities.
189 | 
190 | Now that we understand the capabilities and strengths of different models, let's see how easy it is to implement a Computer-Use Agent using the `cua-agent` framework.
191 | 
192 | ## Creating Your First Computer-Use Agent
193 | 
194 | With the `cua-agent` framework, creating a Computer-Use Agent becomes remarkably straightforward. The framework handles all the complexities of model interaction, screenshot processing, and action execution behind the scenes. Let's look at a simple example of how to build your first agent:
195 | 
196 | **How to run this example:**
197 | 
198 | 1. Create a new file named `simple_task.py` in your text editor or IDE (like VS Code, PyCharm, or Cursor)
199 | 2. Copy and paste the following code:
200 | 
201 | ```python
202 | import asyncio
203 | from computer import Computer
204 | from agent import ComputerAgent
205 | 
206 | async def run_simple_task():
207 |     async with Computer() as macos_computer:
208 |         # Create agent with OpenAI loop
209 |         agent = ComputerAgent(
210 |             model="openai/computer-use-preview",
211 |             tools=[macos_computer]
212 |         )
213 | 
214 |         # Define a simple task
215 |         task = "Open Safari and search for 'Python tutorials'"
216 | 
217 |         # Run the task and process responses
218 |         async for result in agent.run(task):
219 |             print(f"Action: {result.get('text')}")
220 | 
221 | # Run the example
222 | if __name__ == "__main__":
223 |     asyncio.run(run_simple_task())
224 | ```
225 | 
226 | 3. Save the file
227 | 4. Open a terminal, navigate to your project directory, and run:
228 |    ```bash
229 |    python simple_task.py
230 |    ```
231 | 
232 | 5. The code will initialize the macOS virtual machine, create an agent, and execute the task of opening Safari and searching for Python tutorials.
233 | 
234 | You can also run this in a VS Code notebook:
235 | 1. Create a new notebook in VS Code (.ipynb file)
236 | 2. Copy the code into a cell (without the `if __name__ == "__main__":` part)
237 | 3. Run the cell to execute the code
238 | 
239 | You can find the full code in our [notebook](https://github.com/trycua/cua/blob/main/notebooks/blog/build-your-own-operator-on-macos-2.ipynb).
240 | 
241 | Compare this to the manual implementation from Part 1 - we've reduced dozens of lines of code to just a few. The cua-agent framework handles all the complex logic internally, letting you focus on the overarching agentic system.
242 | 
243 | ## Working with Multiple Tasks
244 | 
245 | Another advantage of the cua-agent framework is how easily you can chain multiple tasks. Instead of managing complex state between tasks, you can simply provide a sequence of instructions to be executed in order:
246 | 
247 | **How to run this example:**
248 | 
249 | 1. 
Create a new file named `multi_task.py` with the following code: 250 | 251 | ```python 252 | import asyncio 253 | from computer import Computer 254 | from agent import ComputerAgent 255 | 256 | async def run_multi_task_workflow(): 257 | async with Computer() as macos_computer: 258 | agent = ComputerAgent( 259 | model="anthropic/claude-3-5-sonnet-20241022", 260 | tools=[macos_computer] 261 | ) 262 | 263 | tasks = [ 264 | "Open Safari and go to github.com", 265 | "Search for 'trycua/cua'", 266 | "Open the repository page", 267 | "Click on the 'Issues' tab", 268 | "Read the first open issue" 269 | ] 270 | 271 | for i, task in enumerate(tasks): 272 | print(f"\nTask {i+1}/{len(tasks)}: {task}") 273 | async for result in agent.run(task): 274 | # Print just the action description for brevity 275 | if result.get("text"): 276 | print(f" → {result.get('text')}") 277 | print(f"✅ Task {i+1} completed") 278 | 279 | if __name__ == "__main__": 280 | asyncio.run(run_multi_task_workflow()) 281 | ``` 282 | 283 | 2. Save the file 284 | 3. Make sure you have set your Anthropic API key: 285 | ```bash 286 | export ANTHROPIC_API_KEY=your_anthropic_key_here 287 | ``` 288 | 4. Run the script: 289 | ```bash 290 | python multi_task.py 291 | ``` 292 | 293 | This pattern is particularly useful for creating workflows that navigate through multiple steps of an application or process. The agent maintains visual context between tasks, making it more likely to successfully complete complex sequences of actions. 294 | 295 | ## Understanding the Response Format 296 | 297 | Each action taken by the agent returns a structured response following the OpenAI Agent SDK specification. This standardized format makes it easy to extract detailed information about what the agent is doing and why: 298 | 299 | ```python 300 | async for result in agent.run(task): 301 | # Basic information 302 | print(f"Response ID: {result.get('id')}") 303 | print(f"Response Text: {result.get('text')}") 304 | 305 | # Detailed token usage statistics 306 | usage = result.get('usage') 307 | if usage: 308 | print(f"Input Tokens: {usage.get('input_tokens')}") 309 | print(f"Output Tokens: {usage.get('output_tokens')}") 310 | 311 | # Reasoning and actions 312 | for output in result.get('output', []): 313 | if output.get('type') == 'reasoning': 314 | print(f"Reasoning: {output.get('summary', [{}])[0].get('text')}") 315 | elif output.get('type') == 'computer_call': 316 | action = output.get('action', {}) 317 | print(f"Action: {action.get('type')} at ({action.get('x')}, {action.get('y')})") 318 | ``` 319 | 320 | This structured format allows you to: 321 | - Log detailed information about agent actions 322 | - Provide real-time feedback to users 323 | - Track token usage for cost monitoring 324 | - Access the reasoning behind decisions for debugging or user explanation 325 | 326 | ## Using Local Models with OMNI 327 | 328 | One of the most powerful features of the framework is the ability to use local models via the OMNI loop. This approach dramatically reduces costs while maintaining acceptable reliability for many agentic workflows: 329 | 330 | **How to run this example:** 331 | 332 | 1. First, you'll need to install Ollama for running local models: 333 | - Visit [ollama.com](https://ollama.com) and download the installer for your OS 334 | - Follow the installation instructions 335 | - Pull the Gemma 3 model: 336 | ```bash 337 | ollama pull gemma3:4b-it-q4_K_M 338 | ``` 339 | 340 | 2. 
Create a file named `local_model.py` with this code:
341 | 
342 | ```python
343 | import asyncio
344 | from computer import Computer
345 | from agent import ComputerAgent
346 | 
347 | async def run_with_local_model():
348 |     async with Computer() as macos_computer:
349 |         agent = ComputerAgent(
350 |             model="omniparser+ollama_chat/gemma3:4b-it-q4_K_M",
351 |             tools=[macos_computer]
352 |         )
353 | 
354 |         task = "Open the Calculator app and perform a simple calculation"
355 | 
356 |         async for result in agent.run(task):
357 |             print(f"Action: {result.get('text')}")
358 | 
359 | if __name__ == "__main__":
360 |     asyncio.run(run_with_local_model())
361 | ```
362 | 
363 | 3. Run the script:
364 | ```bash
365 | python local_model.py
366 | ```
367 | 
368 | You can also use other local model servers with the OAICOMPAT provider, which enables compatibility with any API endpoint that follows the OpenAI API structure (the snippet below assumes `LLM` and `LLMProvider` are imported from the same `agent` package as `ComputerAgent`):
369 | 
370 | ```python
371 | agent = ComputerAgent(
372 |     model=LLM(
373 |         provider=LLMProvider.OAICOMPAT,
374 |         name="gemma-3-12b-it",
375 |         provider_base_url="http://localhost:1234/v1"  # LM Studio endpoint
376 |     ),
377 |     tools=[macos_computer]
378 | )
379 | ```
380 | 
381 | Common local endpoints include:
382 | - LM Studio: `http://localhost:1234/v1`
383 | - vLLM: `http://localhost:8000/v1`
384 | - LocalAI: `http://localhost:8080/v1`
385 | - Ollama with OpenAI compat: `http://localhost:11434/v1`
386 | 
387 | This approach is perfect for:
388 | - Development and testing without incurring API costs
389 | - Offline or air-gapped environments where API access isn't possible
390 | - Privacy-sensitive applications where data can't leave your network
391 | - Experimenting with different models to find the best fit for your use case
392 | 
393 | ## Deploying and Using UI-TARS
394 | 
395 | UI-TARS is ByteDance's Computer-Use model designed for navigating OS-level interfaces. It shows excellent performance on desktop OS tasks. To use UI-TARS, you'll first need to deploy the model.
396 | 
397 | ### Deployment Options
398 | 
399 | 1. **Local Deployment**: Follow the [UI-TARS deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md) to run the model locally.
400 | 
401 | 2. **Hugging Face Endpoint**: Deploy UI-TARS on Hugging Face Inference Endpoints, which will give you a URL like:
402 | `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1`
403 | 
404 | 3. **Using with cua-agent**: Once deployed, you can use UI-TARS with the cua-agent framework:
405 | 
406 | ```python
407 | agent = ComputerAgent(
408 |     model=LLM(
409 |         provider=LLMProvider.OAICOMPAT,
410 |         name="tgi",
411 |         provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1"
412 |     ),
413 |     tools=[macos_computer]
414 | )
415 | ```
416 | 
417 | UI-TARS is particularly useful for desktop automation tasks, as it shows the highest performance on OS-level benchmarks like OSWorld and Windows Agent Arena.
418 | 
419 | ## Understanding Agent Responses in Detail
420 | 
421 | The `run()` method of your agent yields structured responses that follow the OpenAI Agent SDK specification. 
This provides a rich set of information beyond just the basic action text: 422 | 423 | ```python 424 | async for result in agent.run(task): 425 | # Basic ID and text 426 | print("Response ID:", result.get("id")) 427 | print("Response Text:", result.get("text")) 428 | 429 | # Token usage statistics 430 | usage = result.get("usage") 431 | if usage: 432 | print("\nUsage Details:") 433 | print(f" Input Tokens: {usage.get('input_tokens')}") 434 | if "input_tokens_details" in usage: 435 | print(f" Input Tokens Details: {usage.get('input_tokens_details')}") 436 | print(f" Output Tokens: {usage.get('output_tokens')}") 437 | if "output_tokens_details" in usage: 438 | print(f" Output Tokens Details: {usage.get('output_tokens_details')}") 439 | print(f" Total Tokens: {usage.get('total_tokens')}") 440 | 441 | # Detailed reasoning and actions 442 | outputs = result.get("output", []) 443 | for output in outputs: 444 | output_type = output.get("type") 445 | if output_type == "reasoning": 446 | print("\nReasoning:") 447 | for summary in output.get("summary", []): 448 | print(f" {summary.get('text')}") 449 | elif output_type == "computer_call": 450 | action = output.get("action", {}) 451 | print("\nComputer Action:") 452 | print(f" Type: {action.get('type')}") 453 | print(f" Position: ({action.get('x')}, {action.get('y')})") 454 | if action.get("text"): 455 | print(f" Text: {action.get('text')}") 456 | ``` 457 | 458 | This detailed information is invaluable for debugging, logging, and understanding the agent's decision-making process in an agentic system. More details can be found in the [OpenAI Agent SDK Specification](https://platform.openai.com/docs/guides/responses-vs-chat-completions). 459 | 460 | ## Building a Gradio UI 461 | 462 | For a visual interface to your agent, the package also includes a Gradio UI: 463 | 464 | **How to run the Gradio UI:** 465 | 466 | 1. Create a file named `launch_ui.py` with the following code: 467 | 468 | ```python 469 | from agent.ui.gradio.app import create_gradio_ui 470 | 471 | # Create and launch the UI 472 | if __name__ == "__main__": 473 | app = create_gradio_ui() 474 | app.launch(share=False) # Set share=False for local access only 475 | ``` 476 | 477 | 2. Install the UI dependencies if you haven't already: 478 | ```bash 479 | pip install "cua-agent[ui]" 480 | ``` 481 | 482 | 3. Run the script: 483 | ```bash 484 | python launch_ui.py 485 | ``` 486 | 487 | 4. 
Open your browser to the displayed URL (usually http://127.0.0.1:7860)
488 | 
489 | **Creating a Shareable Link (Optional):**
490 | 
491 | You can also create a temporary public URL to access your Gradio UI from anywhere:
492 | 
493 | ```python
494 | # In launch_ui.py
495 | if __name__ == "__main__":
496 |     app = create_gradio_ui()
497 |     app.launch(share=True)  # Creates a public link
498 | ```
499 | 
500 | When you run this, Gradio will display both a local URL and a public URL like:
501 | ```
502 | Running on local URL: http://127.0.0.1:7860
503 | Running on public URL: https://abcd1234.gradio.live
504 | ```
505 | 
506 | **Security Note:** Be cautious when sharing your Gradio UI publicly:
507 | - The public URL gives anyone with the link full access to your agent
508 | - Consider using basic authentication for additional protection:
509 | ```python
510 | app.launch(share=True, auth=("username", "password"))
511 | ```
512 | - Only use this feature for personal or team use, not for production environments
513 | - The temporary link expires when you stop the Gradio application
514 | 
515 | Whether shared or local, the Gradio UI provides:
516 | - Model provider selection
517 | - Agent loop selection
518 | - Task input field
519 | - Real-time display of VM screenshots
520 | - Action history
521 | 
522 | ### Setting API Keys for the UI
523 | 
524 | To use the UI with different providers, set your API keys as environment variables:
525 | 
526 | ```bash
527 | # For OpenAI models
528 | export OPENAI_API_KEY=your_openai_key_here
529 | 
530 | # For Anthropic models
531 | export ANTHROPIC_API_KEY=your_anthropic_key_here
532 | 
533 | # Launch with both keys set
534 | OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py
535 | ```
536 | 
537 | ### UI Settings Persistence
538 | 
539 | The Gradio UI automatically saves your configuration to maintain your preferences between sessions:
540 | 
541 | - Settings like Agent Loop, Model Choice, Custom Base URL, and other configuration options are saved to `.gradio_settings.json` in the project's root directory
542 | - These settings are loaded automatically when you restart the UI
543 | - API keys entered in the custom provider field are **not** saved for security reasons
544 | - It's recommended to add `.gradio_settings.json` to your `.gitignore` file
545 | 
546 | ## Advanced Example: GitHub Repository Workflow
547 | 
548 | Let's look at a more complex example that automates a GitHub workflow:
549 | 
550 | **How to run this advanced example:**
551 | 
552 | 1. 
Create a file named `github_workflow.py` with the following code:
553 | 
554 | ```python
555 | import asyncio
556 | import logging
557 | from computer import Computer
558 | from agent import ComputerAgent
559 | 
560 | async def github_workflow():
561 |     async with Computer(verbosity=logging.INFO) as macos_computer:
562 |         agent = ComputerAgent(
563 |             model="openai/computer-use-preview",
564 |             save_trajectory=True,  # Save screenshots for debugging
565 |             only_n_most_recent_images=3,  # Only keep last 3 images in context
566 |             verbosity=logging.INFO,
567 |             tools=[macos_computer]
568 |         )
569 | 
570 |         tasks = [
571 |             "Look for a repository named trycua/cua on GitHub.",
572 |             "Check the open issues, open the most recent one and read it.",
573 |             "Clone the repository in users/lume/projects if it doesn't exist yet.",
574 |             "Open the repository with Cursor (on the dock, black background and white cube icon).",
575 |             "From Cursor, open Composer if not already open.",
576 |             "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
577 |         ]
578 | 
579 |         for i, task in enumerate(tasks):
580 |             print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
581 |             async for result in agent.run(task):
582 |                 print(f"Action: {result.get('text')}")
583 |             print(f"✅ Task {i+1}/{len(tasks)} completed")
584 | 
585 | if __name__ == "__main__":
586 |     asyncio.run(github_workflow())
587 | ```
588 | 
589 | 2. Make sure your OpenAI API key is set:
590 | ```bash
591 | export OPENAI_API_KEY=your_openai_key_here
592 | ```
593 | 
594 | 3. Run the script:
595 | ```bash
596 | python github_workflow.py
597 | ```
598 | 
599 | 4. Watch as the agent completes the entire workflow:
600 |    - The agent will navigate to GitHub
601 |    - Find and investigate issues in the repository
602 |    - Clone the repository to the local machine
603 |    - Open it in Cursor
604 |    - Use Cursor's AI features to work on a solution
605 | 
606 | Beyond the workflow itself, this example demonstrates three useful `ComputerAgent` options:
607 | 1. `save_trajectory=True`: saves screenshots of each step for later inspection
608 | 2. `only_n_most_recent_images=3`: keeps only the three most recent screenshots in context, reducing token usage
609 | 3. `verbosity=logging.INFO`: enables detailed logging of the agent's activity
610 | 
611 | These options make long workflows cheaper to run and easier to debug.
612 | 
613 | ## Comparing Implementation Approaches
614 | 
615 | Let's compare our manual implementation from Part 1 with the framework approach:
616 | 
617 | ### Manual Implementation (Part 1)
618 | - Required writing custom code for the interaction loop
619 | - Needed explicit handling of different action types
620 | - Required direct management of the OpenAI API calls
621 | - Around 50-100 lines of code for basic functionality
622 | - Limited to OpenAI's computer-use model
623 | 
624 | ### Framework Implementation (Part 2)
625 | - Abstracts the interaction loop
626 | - Handles all action types automatically
627 | - Manages API calls internally
628 | - Only 10-15 lines of code for the same functionality
629 | - Works with multiple model providers
630 | - Includes UI capabilities
631 | 
632 | ## Conclusion
633 | 
634 | The `cua-agent` framework transforms what was a complex implementation task into a simple, high-level interface for building Computer-Use Agents. By abstracting away the technical details, it lets you focus on defining the tasks rather than the machinery. 
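
Because the framework reduces model selection to a single identifier, trying a different provider is essentially a one-line change. Below is a minimal sketch reusing the model strings shown earlier in this post; it assumes the corresponding API keys are set (or, for the OMNI entry, that Ollama is running with the pulled model), and the task string is just an illustration:

```python
import asyncio
from computer import Computer
from agent import ComputerAgent

async def try_models(task: str):
    # Each identifier below appears in the examples above; swap freely.
    for model in [
        "openai/computer-use-preview",
        "anthropic/claude-3-5-sonnet-20241022",
        "omniparser+ollama_chat/gemma3:4b-it-q4_K_M",
    ]:
        async with Computer() as macos_computer:
            agent = ComputerAgent(model=model, tools=[macos_computer])
            async for result in agent.run(task):
                print(f"[{model}] {result.get('text')}")

if __name__ == "__main__":
    asyncio.run(try_models("Open Safari and search for 'Python tutorials'"))
```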
635 | 636 | ### When to Use Each Approach 637 | - **Manual Implementation (Part 1)**: When you need complete control over the interaction loop or are implementing a custom solution 638 | - **Framework (Part 2)**: For most applications where you want to quickly build and deploy Computer-Use Agents 639 | 640 | ### Next Steps 641 | With the basics covered, you might want to explore: 642 | - Customizing the agent's behavior with additional parameters 643 | - Building more complex workflows spanning multiple applications 644 | - Integrating your agent into other applications 645 | - Contributing to the open-source project on GitHub 646 | 647 | ### Resources 648 | - [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent) 649 | - [Agent Notebook Examples](https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb) 650 | - [OpenAI Agent SDK Specification](https://platform.openai.com/docs/api-reference/responses) 651 | - [Anthropic API Documentation](https://docs.anthropic.com/en/api/getting-started) 652 | - [UI-TARS GitHub](https://github.com/ByteDance/UI-TARS) 653 | - [OmniParser GitHub](https://github.com/microsoft/OmniParser) 654 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/agent.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | ComputerAgent - Main agent class that selects and runs agent loops 3 | """ 4 | 5 | import asyncio 6 | from pathlib import Path 7 | from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple 8 | 9 | from litellm.responses.utils import Usage 10 | 11 | from .types import ( 12 | Messages, 13 | AgentCapability, 14 | ToolError, 15 | IllegalArgumentError 16 | ) 17 | from .responses import make_tool_error_item, replace_failed_computer_calls_with_function_calls 18 | from .decorators import find_agent_config 19 | import json 20 | import litellm 21 | import litellm.utils 22 | import inspect 23 | from .adapters import ( 24 | HuggingFaceLocalAdapter, 25 | HumanAdapter, 26 | MLXVLMAdapter, 27 | ) 28 | from .callbacks import ( 29 | ImageRetentionCallback, 30 | LoggingCallback, 31 | TrajectorySaverCallback, 32 | BudgetManagerCallback, 33 | TelemetryCallback, 34 | OperatorNormalizerCallback, 35 | PromptInstructionsCallback, 36 | ) 37 | from .computers import ( 38 | AsyncComputerHandler, 39 | is_agent_computer, 40 | make_computer_handler 41 | ) 42 | 43 | def assert_callable_with(f, *args, **kwargs): 44 | """Check if function can be called with given arguments.""" 45 | try: 46 | inspect.signature(f).bind(*args, **kwargs) 47 | return True 48 | except TypeError as e: 49 | sig = inspect.signature(f) 50 | raise IllegalArgumentError(f"Expected {sig}, got args={args} kwargs={kwargs}") from e 51 | 52 | def get_json(obj: Any, max_depth: int = 10) -> Any: 53 | def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any: 54 | if seen is None: 55 | seen = set() 56 | 57 | # Use model_dump() if available 58 | if hasattr(o, 'model_dump'): 59 | return o.model_dump() 60 | 61 | # Check depth limit 62 | if depth > max_depth: 63 | return f"<max_depth_exceeded:{max_depth}>" 64 | 65 | # Check for circular references using object id 66 | obj_id = id(o) 67 | if obj_id in seen: 68 | return f"<circular_reference:{type(o).__name__}>" 69 | 70 | # Handle Computer objects 71 | if hasattr(o, '__class__') and 'computer' in getattr(o, '__class__').__name__.lower(): 72 | return 
f"<computer:{o.__class__.__name__}>" 73 | 74 | # Handle objects with __dict__ 75 | if hasattr(o, '__dict__'): 76 | seen.add(obj_id) 77 | try: 78 | result = {} 79 | for k, v in o.__dict__.items(): 80 | if v is not None: 81 | # Recursively serialize with updated depth and seen set 82 | serialized_value = custom_serializer(v, depth + 1, seen.copy()) 83 | result[k] = serialized_value 84 | return result 85 | finally: 86 | seen.discard(obj_id) 87 | 88 | # Handle common types that might contain nested objects 89 | elif isinstance(o, dict): 90 | seen.add(obj_id) 91 | try: 92 | return { 93 | k: custom_serializer(v, depth + 1, seen.copy()) 94 | for k, v in o.items() 95 | if v is not None 96 | } 97 | finally: 98 | seen.discard(obj_id) 99 | 100 | elif isinstance(o, (list, tuple, set)): 101 | seen.add(obj_id) 102 | try: 103 | return [ 104 | custom_serializer(item, depth + 1, seen.copy()) 105 | for item in o 106 | if item is not None 107 | ] 108 | finally: 109 | seen.discard(obj_id) 110 | 111 | # For basic types that json.dumps can handle 112 | elif isinstance(o, (str, int, float, bool)) or o is None: 113 | return o 114 | 115 | # Fallback to string representation 116 | else: 117 | return str(o) 118 | 119 | def remove_nones(obj: Any) -> Any: 120 | if isinstance(obj, dict): 121 | return {k: remove_nones(v) for k, v in obj.items() if v is not None} 122 | elif isinstance(obj, list): 123 | return [remove_nones(item) for item in obj if item is not None] 124 | return obj 125 | 126 | # Serialize with circular reference and depth protection 127 | serialized = custom_serializer(obj) 128 | 129 | # Convert to JSON string and back to ensure JSON compatibility 130 | json_str = json.dumps(serialized) 131 | parsed = json.loads(json_str) 132 | 133 | # Final cleanup of any remaining None values 134 | return remove_nones(parsed) 135 | 136 | def sanitize_message(msg: Any) -> Any: 137 | """Return a copy of the message with image_url omitted for computer_call_output messages.""" 138 | if msg.get("type") == "computer_call_output": 139 | output = msg.get("output", {}) 140 | if isinstance(output, dict): 141 | sanitized = msg.copy() 142 | sanitized["output"] = {**output, "image_url": "[omitted]"} 143 | return sanitized 144 | return msg 145 | 146 | def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]: 147 | call_ids = [] 148 | for message in messages: 149 | if message.get("type") == "computer_call_output" or message.get("type") == "function_call_output": 150 | call_ids.append(message.get("call_id")) 151 | return call_ids 152 | 153 | class ComputerAgent: 154 | """ 155 | Main agent class that automatically selects the appropriate agent loop 156 | based on the model and executes tool calls. 157 | """ 158 | 159 | def __init__( 160 | self, 161 | model: str, 162 | tools: Optional[List[Any]] = None, 163 | custom_loop: Optional[Callable] = None, 164 | only_n_most_recent_images: Optional[int] = None, 165 | callbacks: Optional[List[Any]] = None, 166 | instructions: Optional[str] = None, 167 | verbosity: Optional[int] = None, 168 | trajectory_dir: Optional[str | Path | dict] = None, 169 | max_retries: Optional[int] = 3, 170 | screenshot_delay: Optional[float | int] = 0.5, 171 | use_prompt_caching: Optional[bool] = False, 172 | max_trajectory_budget: Optional[float | dict] = None, 173 | telemetry_enabled: Optional[bool] = True, 174 | trust_remote_code: Optional[bool] = False, 175 | **kwargs 176 | ): 177 | """ 178 | Initialize ComputerAgent. 
179 | 180 | Args: 181 | model: Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro") 182 | tools: List of tools (computer objects, decorated functions, etc.) 183 | custom_loop: Custom agent loop function to use instead of auto-selection 184 | only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically. 185 | callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing 186 | instructions: Optional system instructions to be passed to the model 187 | verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically 188 | trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically. 189 | max_retries: Maximum number of retries for failed API calls 190 | screenshot_delay: Delay before screenshots in seconds 191 | use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers. 192 | max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded 193 | telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default. 194 | trust_remote_code: If set, trust remote code when loading local models. Disabled by default. 195 | **kwargs: Additional arguments passed to the agent loop 196 | """ 197 | # If the loop is "human/human", we need to prefix a grounding model fallback 198 | if model in ["human/human", "human"]: 199 | model = "openai/computer-use-preview+human/human" 200 | 201 | self.model = model 202 | self.tools = tools or [] 203 | self.custom_loop = custom_loop 204 | self.only_n_most_recent_images = only_n_most_recent_images 205 | self.callbacks = callbacks or [] 206 | self.instructions = instructions 207 | self.verbosity = verbosity 208 | self.trajectory_dir = trajectory_dir 209 | self.max_retries = max_retries 210 | self.screenshot_delay = screenshot_delay 211 | self.use_prompt_caching = use_prompt_caching 212 | self.telemetry_enabled = telemetry_enabled 213 | self.kwargs = kwargs 214 | self.trust_remote_code = trust_remote_code 215 | 216 | # == Add built-in callbacks == 217 | 218 | # Prepend operator normalizer callback 219 | self.callbacks.insert(0, OperatorNormalizerCallback()) 220 | 221 | # Add prompt instructions callback if provided 222 | if self.instructions: 223 | self.callbacks.append(PromptInstructionsCallback(self.instructions)) 224 | 225 | # Add telemetry callback if telemetry_enabled is set 226 | if self.telemetry_enabled: 227 | if isinstance(self.telemetry_enabled, bool): 228 | self.callbacks.append(TelemetryCallback(self)) 229 | else: 230 | self.callbacks.append(TelemetryCallback(self, **self.telemetry_enabled)) 231 | 232 | # Add logging callback if verbosity is set 233 | if self.verbosity is not None: 234 | self.callbacks.append(LoggingCallback(level=self.verbosity)) 235 | 236 | # Add image retention callback if only_n_most_recent_images is set 237 | if self.only_n_most_recent_images: 238 | self.callbacks.append(ImageRetentionCallback(self.only_n_most_recent_images)) 239 | 240 | # Add trajectory saver callback if trajectory_dir is set 241 | if self.trajectory_dir: 242 | if isinstance(self.trajectory_dir, dict): 243 | self.callbacks.append(TrajectorySaverCallback(**self.trajectory_dir)) 244 | elif isinstance(self.trajectory_dir, (str, Path)): 245 | 
self.callbacks.append(TrajectorySaverCallback(str(self.trajectory_dir))) 246 | 247 | # Add budget manager if max_trajectory_budget is set 248 | if max_trajectory_budget: 249 | if isinstance(max_trajectory_budget, dict): 250 | self.callbacks.append(BudgetManagerCallback(**max_trajectory_budget)) 251 | else: 252 | self.callbacks.append(BudgetManagerCallback(max_trajectory_budget)) 253 | 254 | # == Enable local model providers w/ LiteLLM == 255 | 256 | # Register local model providers 257 | hf_adapter = HuggingFaceLocalAdapter( 258 | device="auto", 259 | trust_remote_code=self.trust_remote_code or False 260 | ) 261 | human_adapter = HumanAdapter() 262 | mlx_adapter = MLXVLMAdapter() 263 | litellm.custom_provider_map = [ 264 | {"provider": "huggingface-local", "custom_handler": hf_adapter}, 265 | {"provider": "human", "custom_handler": human_adapter}, 266 | {"provider": "mlx", "custom_handler": mlx_adapter} 267 | ] 268 | litellm.suppress_debug_info = True 269 | 270 | # == Initialize computer agent == 271 | 272 | # Find the appropriate agent loop 273 | if custom_loop: 274 | self.agent_loop = custom_loop 275 | self.agent_config_info = None 276 | else: 277 | config_info = find_agent_config(model) 278 | if not config_info: 279 | raise ValueError(f"No agent config found for model: {model}") 280 | # Instantiate the agent config class 281 | self.agent_loop = config_info.agent_class() 282 | self.agent_config_info = config_info 283 | 284 | self.tool_schemas = [] 285 | self.computer_handler = None 286 | 287 | async def _initialize_computers(self): 288 | """Initialize computer objects""" 289 | if not self.tool_schemas: 290 | # Process tools and create tool schemas 291 | self.tool_schemas = self._process_tools() 292 | 293 | # Find computer tool and create interface adapter 294 | computer_handler = None 295 | for schema in self.tool_schemas: 296 | if schema["type"] == "computer": 297 | computer_handler = await make_computer_handler(schema["computer"]) 298 | break 299 | self.computer_handler = computer_handler 300 | 301 | def _process_input(self, input: Messages) -> List[Dict[str, Any]]: 302 | """Process input messages and create schemas for the agent loop""" 303 | if isinstance(input, str): 304 | return [{"role": "user", "content": input}] 305 | return [get_json(msg) for msg in input] 306 | 307 | def _process_tools(self) -> List[Dict[str, Any]]: 308 | """Process tools and create schemas for the agent loop""" 309 | schemas = [] 310 | 311 | for tool in self.tools: 312 | # Check if it's a computer object (has interface attribute) 313 | if is_agent_computer(tool): 314 | # This is a computer tool - will be handled by agent loop 315 | schemas.append({ 316 | "type": "computer", 317 | "computer": tool 318 | }) 319 | elif callable(tool): 320 | # Use litellm.utils.function_to_dict to extract schema from docstring 321 | try: 322 | function_schema = litellm.utils.function_to_dict(tool) 323 | schemas.append({ 324 | "type": "function", 325 | "function": function_schema 326 | }) 327 | except Exception as e: 328 | print(f"Warning: Could not process tool {tool}: {e}") 329 | else: 330 | print(f"Warning: Unknown tool type: {tool}") 331 | 332 | return schemas 333 | 334 | def _get_tool(self, name: str) -> Optional[Callable]: 335 | """Get a tool by name""" 336 | for tool in self.tools: 337 | if hasattr(tool, '__name__') and tool.__name__ == name: 338 | return tool 339 | elif hasattr(tool, 'func') and tool.func.__name__ == name: 340 | return tool 341 | return None 342 | 343 | # 
============================================================================ 344 | # AGENT RUN LOOP LIFECYCLE HOOKS 345 | # ============================================================================ 346 | 347 | async def _on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None: 348 | """Initialize run tracking by calling callbacks.""" 349 | for callback in self.callbacks: 350 | if hasattr(callback, 'on_run_start'): 351 | await callback.on_run_start(kwargs, old_items) 352 | 353 | async def _on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None: 354 | """Finalize run tracking by calling callbacks.""" 355 | for callback in self.callbacks: 356 | if hasattr(callback, 'on_run_end'): 357 | await callback.on_run_end(kwargs, old_items, new_items) 358 | 359 | async def _on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool: 360 | """Check if run should continue by calling callbacks.""" 361 | for callback in self.callbacks: 362 | if hasattr(callback, 'on_run_continue'): 363 | should_continue = await callback.on_run_continue(kwargs, old_items, new_items) 364 | if not should_continue: 365 | return False 366 | return True 367 | 368 | async def _on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 369 | """Prepare messages for the LLM call by applying callbacks.""" 370 | result = messages 371 | for callback in self.callbacks: 372 | if hasattr(callback, 'on_llm_start'): 373 | result = await callback.on_llm_start(result) 374 | return result 375 | 376 | async def _on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 377 | """Postprocess messages after the LLM call by applying callbacks.""" 378 | result = messages 379 | for callback in self.callbacks: 380 | if hasattr(callback, 'on_llm_end'): 381 | result = await callback.on_llm_end(result) 382 | return result 383 | 384 | async def _on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None: 385 | """Called when responses are received.""" 386 | for callback in self.callbacks: 387 | if hasattr(callback, 'on_responses'): 388 | await callback.on_responses(get_json(kwargs), get_json(responses)) 389 | 390 | async def _on_computer_call_start(self, item: Dict[str, Any]) -> None: 391 | """Called when a computer call is about to start.""" 392 | for callback in self.callbacks: 393 | if hasattr(callback, 'on_computer_call_start'): 394 | await callback.on_computer_call_start(get_json(item)) 395 | 396 | async def _on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None: 397 | """Called when a computer call has completed.""" 398 | for callback in self.callbacks: 399 | if hasattr(callback, 'on_computer_call_end'): 400 | await callback.on_computer_call_end(get_json(item), get_json(result)) 401 | 402 | async def _on_function_call_start(self, item: Dict[str, Any]) -> None: 403 | """Called when a function call is about to start.""" 404 | for callback in self.callbacks: 405 | if hasattr(callback, 'on_function_call_start'): 406 | await callback.on_function_call_start(get_json(item)) 407 | 408 | async def _on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None: 409 | """Called when a function call has completed.""" 410 | for callback in self.callbacks: 411 | if hasattr(callback, 'on_function_call_end'): 412 | await callback.on_function_call_end(get_json(item), get_json(result)) 413 | 414 | async def 
_on_text(self, item: Dict[str, Any]) -> None: 415 | """Called when a text message is encountered.""" 416 | for callback in self.callbacks: 417 | if hasattr(callback, 'on_text'): 418 | await callback.on_text(get_json(item)) 419 | 420 | async def _on_api_start(self, kwargs: Dict[str, Any]) -> None: 421 | """Called when an LLM API call is about to start.""" 422 | for callback in self.callbacks: 423 | if hasattr(callback, 'on_api_start'): 424 | await callback.on_api_start(get_json(kwargs)) 425 | 426 | async def _on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None: 427 | """Called when an LLM API call has completed.""" 428 | for callback in self.callbacks: 429 | if hasattr(callback, 'on_api_end'): 430 | await callback.on_api_end(get_json(kwargs), get_json(result)) 431 | 432 | async def _on_usage(self, usage: Dict[str, Any]) -> None: 433 | """Called when usage information is received.""" 434 | for callback in self.callbacks: 435 | if hasattr(callback, 'on_usage'): 436 | await callback.on_usage(get_json(usage)) 437 | 438 | async def _on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None: 439 | """Called when a screenshot is taken.""" 440 | for callback in self.callbacks: 441 | if hasattr(callback, 'on_screenshot'): 442 | await callback.on_screenshot(screenshot, name) 443 | 444 | # ============================================================================ 445 | # AGENT OUTPUT PROCESSING 446 | # ============================================================================ 447 | 448 | async def _handle_item(self, item: Any, computer: Optional[AsyncComputerHandler] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]: 449 | """Handle each item; may cause a computer action + screenshot.""" 450 | call_id = item.get("call_id") 451 | if ignore_call_ids and call_id and call_id in ignore_call_ids: 452 | return [] 453 | 454 | item_type = item.get("type", None) 455 | 456 | if item_type == "message": 457 | await self._on_text(item) 458 | # # Print messages 459 | # if item.get("content"): 460 | # for content_item in item.get("content"): 461 | # if content_item.get("text"): 462 | # print(content_item.get("text")) 463 | return [] 464 | 465 | try: 466 | if item_type == "computer_call": 467 | await self._on_computer_call_start(item) 468 | if not computer: 469 | raise ValueError("Computer handler is required for computer calls") 470 | 471 | # Perform computer actions 472 | action = item.get("action") 473 | action_type = action.get("type") 474 | if action_type is None: 475 | print(f"Action type cannot be `None`: action={action}, action_type={action_type}") 476 | return [] 477 | 478 | # Extract action arguments (all fields except 'type') 479 | action_args = {k: v for k, v in action.items() if k != "type"} 480 | 481 | # print(f"{action_type}({action_args})") 482 | 483 | # Execute the computer action 484 | computer_method = getattr(computer, action_type, None) 485 | if computer_method: 486 | assert_callable_with(computer_method, **action_args) 487 | await computer_method(**action_args) 488 | else: 489 | raise ToolError(f"Unknown computer action: {action_type}") 490 | 491 | # Take screenshot after action 492 | if self.screenshot_delay and self.screenshot_delay > 0: 493 | await asyncio.sleep(self.screenshot_delay) 494 | screenshot_base64 = await computer.screenshot() 495 | await self._on_screenshot(screenshot_base64, "screenshot_after") 496 | 497 | # Handle safety checks 498 | pending_checks = item.get("pending_safety_checks", []) 499 | acknowledged_checks 
= []
500 |                 for check in pending_checks:
501 |                     check_message = check.get("message", str(check))
502 |                     acknowledged_checks.append(check)
503 |                     # TODO: implement a callback for safety checks
504 |                     # if acknowledge_safety_check_callback(check_message, allow_always=True):
505 |                     #     acknowledged_checks.append(check)
506 |                     # else:
507 |                     #     raise ValueError(f"Safety check failed: {check_message}")
508 | 
509 |                 # Create call output
510 |                 call_output = {
511 |                     "type": "computer_call_output",
512 |                     "call_id": item.get("call_id"),
513 |                     "acknowledged_safety_checks": acknowledged_checks,
514 |                     "output": {
515 |                         "type": "input_image",
516 |                         "image_url": f"data:image/png;base64,{screenshot_base64}",
517 |                     },
518 |                 }
519 | 
520 |                 # # Additional URL safety checks for browser environments
521 |                 # if await computer.get_environment() == "browser":
522 |                 #     current_url = await computer.get_current_url()
523 |                 #     call_output["output"]["current_url"] = current_url
524 |                 #     # TODO: implement a callback for URL safety checks
525 |                 #     # check_blocklisted_url(current_url)
526 | 
527 |                 result = [call_output]
528 |                 await self._on_computer_call_end(item, result)
529 |                 return result
530 | 
531 |             if item_type == "function_call":
532 |                 await self._on_function_call_start(item)
533 |                 # Perform function call
534 |                 function = self._get_tool(item.get("name"))
535 |                 if not function:
536 |                     raise ToolError(f"Function {item.get('name')} not found")
537 | 
538 |                 args = json.loads(item.get("arguments"))
539 | 
540 |                 # Validate arguments before execution
541 |                 assert_callable_with(function, **args)
542 | 
543 |                 # Execute function - use asyncio.to_thread for non-async functions
544 |                 if inspect.iscoroutinefunction(function):
545 |                     result = await function(**args)
546 |                 else:
547 |                     result = await asyncio.to_thread(function, **args)
548 | 
549 |                 # Create function call output
550 |                 call_output = {
551 |                     "type": "function_call_output",
552 |                     "call_id": item.get("call_id"),
553 |                     "output": str(result),
554 |                 }
555 | 
556 |                 result = [call_output]
557 |                 await self._on_function_call_end(item, result)
558 |                 return result
559 |         except ToolError as e:
560 |             return [make_tool_error_item(repr(e), call_id)]
561 | 
562 |         return []
563 | 
564 |     # ============================================================================
565 |     # MAIN AGENT LOOP
566 |     # ============================================================================
567 | 
568 |     async def run(
569 |         self,
570 |         messages: Messages,
571 |         stream: bool = False,
572 |         **kwargs
573 |     ) -> AsyncGenerator[Dict[str, Any], None]:
574 |         """
575 |         Run the agent with the given messages using the Computer protocol handler pattern. 
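
        Example (sketch; assumes `agent` is a configured ComputerAgent):

            async for chunk in agent.run("Open Safari"):
                for item in chunk.get("output", []):
                    print(item.get("type"))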
576 | 
577 |         Args:
578 |             messages: List of message dictionaries
579 |             stream: Whether to stream the response
580 |             **kwargs: Additional arguments
581 | 
582 |         Returns:
583 |             AsyncGenerator that yields response chunks
584 |         """
585 |         if not self.agent_config_info:
586 |             raise ValueError("Agent configuration not found")
587 | 
588 |         capabilities = self.get_capabilities()
589 |         if "step" not in capabilities:
590 |             raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions")
591 | 
592 |         await self._initialize_computers()
593 | 
594 |         # Merge kwargs
595 |         merged_kwargs = {**self.kwargs, **kwargs}
596 | 
597 |         old_items = self._process_input(messages)
598 |         new_items = []
599 | 
600 |         # Initialize run tracking
601 |         run_kwargs = {
602 |             "messages": messages,
603 |             "stream": stream,
604 |             "model": self.model,
605 |             "agent_loop": self.agent_config_info.agent_class.__name__,
606 |             **merged_kwargs
607 |         }
608 |         await self._on_run_start(run_kwargs, old_items)
609 | 
610 |         while not (new_items and new_items[-1].get("role") == "assistant"):  # loop until the last output is an assistant message
611 |             # Lifecycle hook: Check if we should continue based on callbacks (e.g., budget manager)
612 |             should_continue = await self._on_run_continue(run_kwargs, old_items, new_items)
613 |             if not should_continue:
614 |                 break
615 | 
616 |             # Lifecycle hook: Prepare messages for the LLM call
617 |             # Use cases:
618 |             # - PII anonymization
619 |             # - Image retention policy
620 |             combined_messages = old_items + new_items
621 |             combined_messages = replace_failed_computer_calls_with_function_calls(combined_messages)
622 |             preprocessed_messages = await self._on_llm_start(combined_messages)
623 | 
624 |             loop_kwargs = {
625 |                 "messages": preprocessed_messages,
626 |                 "model": self.model,
627 |                 "tools": self.tool_schemas,
628 |                 "stream": False,
629 |                 "computer_handler": self.computer_handler,
630 |                 "max_retries": self.max_retries,
631 |                 "use_prompt_caching": self.use_prompt_caching,
632 |                 **merged_kwargs
633 |             }
634 | 
635 |             # Run agent loop iteration
636 |             result = await self.agent_loop.predict_step(
637 |                 **loop_kwargs,
638 |                 _on_api_start=self._on_api_start,
639 |                 _on_api_end=self._on_api_end,
640 |                 _on_usage=self._on_usage,
641 |                 _on_screenshot=self._on_screenshot,
642 |             )
643 |             result = get_json(result)
644 | 
645 |             # Lifecycle hook: Postprocess messages after the LLM call
646 |             # Use cases:
647 |             # - PII deanonymization (if you want tool calls to see PII)
648 |             result["output"] = await self._on_llm_end(result.get("output", []))
649 |             await self._on_responses(loop_kwargs, result)
650 | 
651 |             # Yield agent response
652 |             yield result
653 | 
654 |             # Add agent response to new_items
655 |             new_items += result.get("output", [])
656 | 
657 |             # Get output call ids
658 |             output_call_ids = get_output_call_ids(result.get("output", []))
659 | 
660 |             # Handle computer actions
661 |             for item in result.get("output", []):
662 |                 partial_items = await self._handle_item(item, self.computer_handler, ignore_call_ids=output_call_ids)
663 |                 new_items += partial_items
664 | 
665 |                 # Yield partial response
666 |                 yield {
667 |                     "output": partial_items,
668 |                     "usage": Usage(
669 |                         prompt_tokens=0,
670 |                         completion_tokens=0,
671 |                         total_tokens=0,
672 |                     )
673 |                 }
674 | 
675 |         await self._on_run_end(run_kwargs, old_items, new_items)  # run_kwargs: loop_kwargs may be unbound if the first continue-check breaks
676 | 
677 |     async def predict_click(
678 |         self,
679 |         instruction: str,
680 |         image_b64: Optional[str] = None
681 |     ) -> Optional[Tuple[int, int]]:
682 |         """
683 |         Predict click coordinates based on image and instruction. 
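
        Example (sketch; requires a model with the "click" capability and either
        a computer tool or an explicit base64 screenshot):

            coords = await agent.predict_click("the Submit button")
            if coords:
                x, y = coords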
684 | 685 | Args: 686 | instruction: Instruction for where to click 687 | image_b64: Base64 encoded image (optional, will take screenshot if not provided) 688 | 689 | Returns: 690 | None or tuple with (x, y) coordinates 691 | """ 692 | if not self.agent_config_info: 693 | raise ValueError("Agent configuration not found") 694 | 695 | capabilities = self.get_capabilities() 696 | if "click" not in capabilities: 697 | raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions") 698 | if hasattr(self.agent_loop, 'predict_click'): 699 | if not image_b64: 700 | if not self.computer_handler: 701 | raise ValueError("Computer tool or image_b64 is required for predict_click") 702 | image_b64 = await self.computer_handler.screenshot() 703 | return await self.agent_loop.predict_click( 704 | model=self.model, 705 | image_b64=image_b64, 706 | instruction=instruction 707 | ) 708 | return None 709 | 710 | def get_capabilities(self) -> List[AgentCapability]: 711 | """ 712 | Get list of capabilities supported by the current agent config. 713 | 714 | Returns: 715 | List of capability strings (e.g., ["step", "click"]) 716 | """ 717 | if not self.agent_config_info: 718 | raise ValueError("Agent configuration not found") 719 | 720 | if hasattr(self.agent_loop, 'get_capabilities'): 721 | return self.agent_loop.get_capabilities() 722 | return ["step"] # Default capability ```
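
A note on the `tools` list accepted above: `_process_tools` wraps any plain callable into a function tool by extracting a schema from its docstring via `litellm.utils.function_to_dict`, and `_handle_item` executes it (synchronous functions run in a thread via `asyncio.to_thread`). Below is a minimal sketch of a custom tool under those rules; the function, its docstring, and the model string are illustrative, not part of the library:

```python
from agent import ComputerAgent  # import path as used in the examples elsewhere in this repo

def read_file(path: str) -> str:
    """
    Read a UTF-8 text file and return its contents.

    Parameters
    ----------
    path : str
        Path to the file to read.
    """
    with open(path, encoding="utf-8") as f:
        return f.read()

# Callables get their schema from the docstring; computer objects can be mixed into the same list.
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    tools=[read_file],
)
```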