This is page 6 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── 
windows-sandbox.md ├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx 
│ │ │ │ ├── installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── 
som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ │ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift 
│ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py 
│ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ ├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ 
│ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ 
├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py │ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── 
pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /scripts/build.ps1: -------------------------------------------------------------------------------- ``` 1 | # PowerShell Build Script for CUA 2 | # Exit on error 3 | $ErrorActionPreference = "Stop" 4 | 5 | # Colors for output 6 | $RED = "Red" 7 | $GREEN = "Green" 8 | $BLUE = "Blue" 9 | 10 | # Function to print step information 11 | function Print-Step { 12 | param([string]$Message) 13 | Write-Host "==> $Message" -ForegroundColor $BLUE 14 | } 15 | 16 | # Function to print success message 17 | 
function Print-Success { 18 | param([string]$Message) 19 | Write-Host "==> Success: $Message" -ForegroundColor $GREEN 20 | } 21 | 22 | # Function to print error message 23 | function Print-Error { 24 | param([string]$Message) 25 | Write-Host "==> Error: $Message" -ForegroundColor $RED 26 | } 27 | 28 | # Get the script's directory and project root 29 | $SCRIPT_DIR = Split-Path -Parent $MyInvocation.MyCommand.Path 30 | $PROJECT_ROOT = Split-Path -Parent $SCRIPT_DIR 31 | 32 | # Change to project root 33 | Set-Location $PROJECT_ROOT 34 | 35 | # Load environment variables from .env.local 36 | if (Test-Path ".env.local") { 37 | Print-Step "Loading environment variables from .env.local..." 38 | Get-Content ".env.local" | ForEach-Object { 39 | if ($_ -match "^([^#][^=]*?)=(.*)$") { 40 | [Environment]::SetEnvironmentVariable($matches[1], $matches[2], "Process") 41 | } 42 | } 43 | Print-Success "Environment variables loaded" 44 | } else { 45 | Print-Error ".env.local file not found" 46 | exit 1 47 | } 48 | 49 | # Check if conda is available 50 | try { 51 | conda --version | Out-Null 52 | Print-Success "Conda is available" 53 | } catch { 54 | Print-Error "Conda is not available. Please install Anaconda or Miniconda first." 55 | exit 1 56 | } 57 | 58 | # Create or update conda environment 59 | Print-Step "Creating/updating conda environment 'cua' with Python 3.12..." 60 | try { 61 | # Check if environment exists 62 | $envExists = conda env list | Select-String "^cua\s" 63 | if ($envExists) { 64 | Print-Step "Environment 'cua' already exists. Updating..." 65 | conda env update -n cua -f environment.yml --prune 66 | } else { 67 | Print-Step "Creating new environment 'cua'..." 68 | conda create -n cua python=3.12 -y 69 | } 70 | Print-Success "Conda environment 'cua' ready" 71 | } catch { 72 | Print-Error "Failed to create/update conda environment" 73 | exit 1 74 | } 75 | 76 | # Activate conda environment 77 | Print-Step "Activating conda environment 'cua'..." 
78 | try { 79 | conda activate cua 80 | Print-Success "Environment activated" 81 | } catch { 82 | Print-Error "Failed to activate conda environment 'cua'" 83 | Print-Step "Please run: conda activate cua" 84 | Print-Step "Then re-run this script" 85 | exit 1 86 | } 87 | 88 | # Clean up existing environments and cache 89 | Print-Step "Cleaning up existing environments..." 90 | Get-ChildItem -Path . -Recurse -Directory -Name "__pycache__" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force } 91 | Get-ChildItem -Path . -Recurse -Directory -Name ".pytest_cache" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force } 92 | Get-ChildItem -Path . -Recurse -Directory -Name "dist" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force } 93 | Get-ChildItem -Path . -Recurse -Directory -Name "*.egg-info" | ForEach-Object { Remove-Item -Path $_ -Recurse -Force } 94 | 95 | # Function to install a package and its dependencies 96 | function Install-Package { 97 | param( 98 | [string]$PackageDir, 99 | [string]$PackageName, 100 | [string]$Extras = "" 101 | ) 102 | 103 | Print-Step "Installing $PackageName..." 104 | Set-Location $PackageDir 105 | 106 | if (Test-Path "pyproject.toml") { 107 | if ($Extras) { 108 | pip install -e ".[$Extras]" 109 | } else { 110 | pip install -e . 111 | } 112 | } else { 113 | Print-Error "No pyproject.toml found in $PackageDir" 114 | Set-Location $PROJECT_ROOT 115 | return $false 116 | } 117 | 118 | Set-Location $PROJECT_ROOT 119 | return $true 120 | } 121 | 122 | # Install packages in order of dependency 123 | Print-Step "Installing packages in development mode..." 
124 | 125 | # Install core first (base package with telemetry support) 126 | if (-not (Install-Package "libs/python/core" "core")) { exit 1 } 127 | 128 | # Install pylume (base dependency) 129 | if (-not (Install-Package "libs/python/pylume" "pylume")) { exit 1 } 130 | 131 | # Install computer with all its dependencies and extras 132 | if (-not (Install-Package "libs/python/computer" "computer" "all")) { exit 1 } 133 | 134 | # Install omniparser 135 | if (-not (Install-Package "libs/python/som" "som")) { exit 1 } 136 | 137 | # Install agent with all its dependencies and extras 138 | if (-not (Install-Package "libs/python/agent" "agent" "all")) { exit 1 } 139 | 140 | # Install computer-server 141 | if (-not (Install-Package "libs/python/computer-server" "computer-server")) { exit 1 } 142 | 143 | # Install mcp-server 144 | if (-not (Install-Package "libs/python/mcp-server" "mcp-server")) { exit 1 } 145 | 146 | # Install development tools from root project 147 | Print-Step "Installing development dependencies..." 148 | pip install -e ".[dev,test,docs]" 149 | 150 | # Create a .env file for VS Code to use the virtual environment 151 | Print-Step "Creating .env file for VS Code..." 152 | $pythonPath = "$PROJECT_ROOT/libs/python/core;$PROJECT_ROOT/libs/python/computer;$PROJECT_ROOT/libs/python/agent;$PROJECT_ROOT/libs/python/som;$PROJECT_ROOT/libs/python/pylume;$PROJECT_ROOT/libs/python/computer-server;$PROJECT_ROOT/libs/python/mcp-server" 153 | "PYTHONPATH=$pythonPath" | Out-File -FilePath ".env" -Encoding UTF8 154 | 155 | Print-Success "All packages installed successfully!" 156 | Print-Step "Your conda environment 'cua' is ready. 
To activate it:" 157 | Write-Host " conda activate cua" -ForegroundColor Yellow 158 | ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/integrations/hud.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: HUD Evals 3 | description: Use ComputerAgent with HUD for benchmarking and evaluation 4 | --- 5 | 6 | <Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.</Callout> 7 | 8 | The HUD integration allows an agent to be benchmarked using the [HUD framework](https://www.hud.so/). Through the HUD integration, the agent controls a computer inside HUD, where tests are run to evaluate the success of each task. 9 | 10 | ## Installation 11 | 12 | First, install the required package: 13 | 14 | ```bash 15 | pip install "cua-agent[hud]" 16 | ## or install hud-python directly 17 | # pip install hud-python==0.4.12 18 | ``` 19 | 20 | ## Environment Variables 21 | 22 | Before running any evaluations, you’ll need to set up your environment variables for HUD and your model providers: 23 | 24 | ```bash 25 | # HUD access 26 | export HUD_API_KEY="your_hud_api_key" 27 | 28 | # Model provider keys (at least one required) 29 | export OPENAI_API_KEY="your_openai_key" 30 | export ANTHROPIC_API_KEY="your_anthropic_key" 31 | ``` 32 | 33 | ## Running a Single Task 34 | 35 | You can run a single task from a HUD dataset for quick verification. 
36 | 37 | ### Example 38 | 39 | ```python 40 | from agent.integrations.hud import run_single_task 41 | 42 | await run_single_task( 43 | dataset="hud-evals/OSWorld-Verified", # or another HUD dataset 44 | model="openai/computer-use-preview+openai/gpt-5-nano", # any supported model string 45 | task_id=155, # e.g., reopen last closed tab 46 | ) 47 | ``` 48 | 49 | ### Parameters 50 | 51 | - `task_id` (`int`): Default: `0` 52 | Index of the task to run from the dataset. 53 | 54 | ## Running a Full Dataset 55 | 56 | To benchmark your agent at scale, you can run an entire dataset (or a subset) in parallel. 57 | 58 | ### Example 59 | 60 | ```python 61 | from agent.integrations.hud import run_full_dataset 62 | 63 | results = await run_full_dataset( 64 | dataset="hud-evals/OSWorld-Verified", # can also pass a Dataset or list[dict] 65 | model="openai/computer-use-preview", 66 | split="train[:3]", # try a few tasks to start 67 | max_concurrent=20, # tune to your infra 68 | max_steps=50 # safety cap per task 69 | ) 70 | ``` 71 | 72 | ### Parameters 73 | 74 | - `job_name` (`str` | `None`): 75 | Optional human-readable name for the evaluation job (shows up in HUD UI). 76 | - `max_concurrent` (`int`): Default: `30` 77 | Number of tasks to run in parallel. Scale this based on your infra. 78 | - `max_steps` (`int`): Default: `50` 79 | Safety cap on steps per task to prevent infinite loops. 80 | - `split` (`str`): Default: `"train"` 81 | Dataset split or subset to run. Uses the [Hugging Face split format](https://huggingface.co/docs/datasets/v1.11.0/splits.html), e.g., `"train[:10]"` for the first 10 tasks. 82 | 83 | ## Additional Parameters 84 | 85 | Both single-task and full-dataset runs share a common set of configuration options. These let you fine-tune how the evaluation runs. 86 | 87 | - `dataset` (`str` | `Dataset` | `list[dict]`): **Required** 88 | HUD dataset name (e.g. `"hud-evals/OSWorld-Verified"`), a loaded `Dataset`, or a list of tasks. 
89 | - `model` (`str`): Default: `"computer-use-preview"` 90 | Model string, e.g. `"openai/computer-use-preview+openai/gpt-5-nano"`. Supports composition with `+` (planning + grounding). 91 | - `allowed_tools` (`list[str]`): Default: `["openai_computer"]` 92 | Restrict which tools the agent may use. 93 | - `tools` (`list[Any]`): 94 | Extra tool configs to inject. 95 | - `custom_loop` (`Callable`): 96 | Optional custom agent loop function. If provided, overrides automatic loop selection. 97 | - `only_n_most_recent_images` (`int`): Default: `5` for full dataset, `None` for single task. 98 | Retain only the last N screenshots in memory. 99 | - `callbacks` (`list[Any]`): 100 | Hook functions for logging, telemetry, or side effects. 101 | - `verbosity` (`int`): 102 | Logging level. Set `2` for debugging every call/action. 103 | - `trajectory_dir` (`str` | `dict`): 104 | Save local copies of trajectories for replay/analysis. 105 | - `max_retries` (`int`): Default: `3` 106 | Number of retries for failed model/tool calls. 107 | - `screenshot_delay` (`float` | `int`): Default: `0.5` 108 | Delay (seconds) between screenshots to avoid race conditions. 109 | - `use_prompt_caching` (`bool`): Default: `False` 110 | Cache repeated prompts to reduce API calls. 111 | - `max_trajectory_budget` (`float` | `dict`): 112 | Limit on trajectory size/budget (e.g., tokens, steps). 113 | - `telemetry_enabled` (`bool`): Default: `True` 114 | Whether to send telemetry/traces to HUD. 115 | - `**kwargs` (`any`): 116 | Any additional keyword arguments are passed through to the agent loop or model provider. 117 | 118 | ## Available Benchmarks 119 | 120 | HUD provides multiple benchmark datasets for realistic evaluation. 121 | 122 | 1. **[OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified)** – Benchmark on 369+ real-world desktop tasks across Chrome, LibreOffice, GIMP, VS Code, etc. 123 | *Best for*: evaluating full computer-use agents in realistic environments. 
124 | *Verified variant*: fixes 300+ issues from earlier versions for reliability. 125 | 126 | **Coming soon:** SheetBench (spreadsheet automation) and other specialized HUD datasets. 127 | 128 | See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments. 129 | 130 | ## Tips 131 | 132 | * **Debugging:** set `verbosity=2` to see every model call and tool action. 133 | * **Performance:** lower `screenshot_delay` for faster runs; raise it if you see race conditions. 134 | * **Safety:** always set `max_steps` (defaults to 50) to prevent runaway loops. 135 | * **Custom tools:** pass extra `tools=[...]` into the agent config if you need tools beyond `openai_computer`. ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/message-format.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Message Format 3 | --- 4 | 5 | This page documents the Python message and response schema used by the Agent SDK. 6 | It mirrors the structure shown in Chat History and provides precise type definitions you can target in your own code. 7 | 8 | All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module. 9 | 10 | ## Response 11 | 12 | The agent yields response chunks as an async generator of objects with `output` and `usage`. 13 | 14 | ```python 15 | from typing import List, TypedDict 16 | 17 | class Usage(TypedDict, total=False): 18 | prompt_tokens: int 19 | completion_tokens: int 20 | total_tokens: int 21 | response_cost: float # USD cost if available 22 | 23 | class AgentResponse(TypedDict): 24 | output: List["AgentMessage"] 25 | usage: Usage 26 | ``` 27 | 28 | ## Messages 29 | 30 | Agent messages represent the state of the conversation and the agent's actions. 
31 | 32 | ```python 33 | from typing import List, Literal, Optional, TypedDict, Union 34 | 35 | # Union of all message variants 36 | AgentMessage = Union[ 37 | "UserMessage", 38 | "AssistantMessage", 39 | "ReasoningMessage", 40 | "ComputerCallMessage", 41 | "ComputerCallOutputMessage", 42 | "FunctionCallMessage", 43 | "FunctionCallOutputMessage", 44 | ] 45 | 46 | # Input message (role: user/system/developer) 47 | class UserMessage(TypedDict, total=False): 48 | type: Literal["message"] # optional for user input 49 | role: Literal["user", "system", "developer"] 50 | content: Union[str, List["InputContent"]] 51 | 52 | # Output message (assistant text) 53 | class AssistantMessage(TypedDict): 54 | type: Literal["message"] 55 | role: Literal["assistant"] 56 | content: List["OutputContent"] 57 | 58 | # Output reasoning/thinking message 59 | class ReasoningMessage(TypedDict): 60 | type: Literal["reasoning"] 61 | summary: List["SummaryContent"] 62 | 63 | # Output computer action call (agent intends to act) 64 | class ComputerCallMessage(TypedDict): 65 | type: Literal["computer_call"] 66 | call_id: str 67 | status: Literal["completed", "failed", "pending"] 68 | action: "ComputerAction" 69 | 70 | # Output computer action result (always a screenshot) 71 | class ComputerCallOutputMessage(TypedDict): 72 | type: Literal["computer_call_output"] 73 | call_id: str 74 | output: "ComputerResultContent" 75 | 76 | # Output function call (agent calls a Python tool) 77 | class FunctionCallMessage(TypedDict): 78 | type: Literal["function_call"] 79 | call_id: str 80 | status: Literal["completed", "failed", "pending"] 81 | name: str 82 | arguments: str # JSON-serialized kwargs 83 | 84 | # Output function call result (text) 85 | class FunctionCallOutputMessage(TypedDict): 86 | type: Literal["function_call_output"] 87 | call_id: str 88 | output: str 89 | ``` 90 | 91 | ## Message Content 92 | 93 | These content items appear inside `content` arrays for the message types above. 
94 | 95 | ```python 96 | # Input content kinds 97 | class InputContent(TypedDict): 98 | type: Literal["input_image", "input_text"] 99 | text: Optional[str] 100 | image_url: Optional[str] # e.g., data URL 101 | 102 | # Assistant output content 103 | class OutputContent(TypedDict): 104 | type: Literal["output_text"] 105 | text: str 106 | 107 | # Reasoning/summary output content 108 | class SummaryContent(TypedDict): 109 | type: Literal["summary_text"] 110 | text: str 111 | 112 | # Computer call outputs (screenshots) 113 | class ComputerResultContent(TypedDict): 114 | type: Literal["computer_screenshot", "input_image"] 115 | image_url: str # data URL (e.g., "data:image/png;base64,....") 116 | ``` 117 | 118 | ## Actions 119 | 120 | Computer actions represent concrete operations the agent will perform on the computer. 121 | 122 | Two broad families exist depending on the provider: OpenAI-style and Anthropic-style. 123 | 124 | ```python 125 | # Union of all supported computer actions 126 | ComputerAction = Union[ 127 | "ClickAction", 128 | "DoubleClickAction", 129 | "DragAction", 130 | "KeyPressAction", 131 | "MoveAction", 132 | "ScreenshotAction", 133 | "ScrollAction", 134 | "TypeAction", 135 | "WaitAction", 136 | # Anthropic variants 137 | "LeftMouseDownAction", 138 | "LeftMouseUpAction", 139 | ] 140 | 141 | # OpenAI Computer Actions 142 | class ClickAction(TypedDict): 143 | type: Literal["click"] 144 | button: Literal["left", "right", "wheel", "back", "forward"] 145 | x: int 146 | y: int 147 | 148 | class DoubleClickAction(TypedDict, total=False): 149 | type: Literal["double_click"] 150 | button: Literal["left", "right", "wheel", "back", "forward"] 151 | x: int 152 | y: int 153 | 154 | class DragAction(TypedDict, total=False): 155 | type: Literal["drag"] 156 | button: Literal["left", "right", "wheel", "back", "forward"] 157 | path: List[tuple[int, int]] # [(x1, y1), (x2, y2), ...] 
158 | 159 | class KeyPressAction(TypedDict): 160 | type: Literal["keypress"] 161 | keys: List[str] # e.g., ["ctrl", "a"] 162 | 163 | class MoveAction(TypedDict): 164 | type: Literal["move"] 165 | x: int 166 | y: int 167 | 168 | class ScreenshotAction(TypedDict): 169 | type: Literal["screenshot"] 170 | 171 | class ScrollAction(TypedDict): 172 | type: Literal["scroll"] 173 | scroll_x: int 174 | scroll_y: int 175 | x: int 176 | y: int 177 | 178 | class TypeAction(TypedDict): 179 | type: Literal["type"] 180 | text: str 181 | 182 | class WaitAction(TypedDict): 183 | type: Literal["wait"] 184 | 185 | # Anthropic Computer Actions 186 | class LeftMouseDownAction(TypedDict): 187 | type: Literal["left_mouse_down"] 188 | x: int 189 | y: int 190 | 191 | class LeftMouseUpAction(TypedDict): 192 | type: Literal["left_mouse_up"] 193 | x: int 194 | y: int 195 | ``` 196 | 197 | ## Notes 198 | 199 | - The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility. 200 | - Computer action outputs are screenshots as data URLs. For security and storage, some serializers may redact or omit large fields in persisted metadata. 201 | - The message flow typically alternates between reasoning, actions, screenshots, and concluding assistant text. See [Chat History](./chat-history) for a step-by-step example. 202 | ``` -------------------------------------------------------------------------------- /docs/content/docs/computer-sdk/cloud-vm-management.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Cloud VM Management 3 | description: Manage your Cua Cloud sandboxes (VMs) via Python SDK or HTTP API 4 | --- 5 | 6 | import { Tab, Tabs } from 'fumadocs-ui/components/tabs'; 7 | 8 | 9 | Using the Cua Cloud API, you can manage your Cua Cloud sandboxes (VMs) with Python or HTTP (curl). 10 | 11 | All examples require a CUA API key. 
You can obtain one from the [Dashboard](https://www.cua.ai/dashboard/keys). 12 | 13 | --- 14 | 15 | ## List VMs 16 | 17 | <Tabs items={["Python", "curl"]}> 18 | <Tab value="Python"> 19 | 20 | ```python 21 | import os 22 | import asyncio 23 | from computer.providers.cloud.provider import CloudProvider 24 | 25 | async def main(): 26 | api_key = os.getenv("CUA_API_KEY") or "your-api-key" 27 | # Optional: point to a different API base 28 | # os.environ["CUA_API_BASE"] = "https://api.cua.ai" 29 | 30 | provider = CloudProvider(api_key=api_key, verbose=False) 31 | async with provider: 32 | vms = await provider.list_vms() 33 | for vm in vms: 34 | print({ 35 | "name": vm["name"], 36 | "status": vm["status"], 37 | "api_url": vm.get("api_url"), 38 | "vnc_url": vm.get("vnc_url"), 39 | }) 40 | 41 | if __name__ == "__main__": 42 | asyncio.run(main()) 43 | ``` 44 | 45 | </Tab> 46 | <Tab value="curl"> 47 | 48 | ```bash 49 | curl -H "Authorization: Bearer $CUA_API_KEY" \ 50 | "https://api.cua.ai/v1/vms" 51 | ``` 52 | 53 | Responses: 54 | - 200: Array of minimal VM objects with fields `{ name, password, status }` 55 | - 401: Unauthorized (missing/invalid API key) 56 | 57 | ```json 58 | [ 59 | { 60 | "name": "s-windows-x4snp46ebf", 61 | "password": "49b8daa3", 62 | "status": "running" 63 | } 64 | ] 65 | ``` 66 | 67 | Status values: 68 | 69 | - `pending`: VM deployment in progress 70 | - `running`: VM is active and accessible 71 | - `stopped`: VM is stopped but not terminated 72 | - `terminated`: VM has been permanently destroyed 73 | - `failed`: VM deployment or operation failed 74 | 75 | </Tab> 76 | </Tabs> 77 | 78 | --- 79 | 80 | ## Start a VM 81 | Provide the VM name you want to start. 
82 | 83 | <Tabs items={["Python", "curl"]}> 84 | <Tab value="Python"> 85 | 86 | ```python 87 | import os 88 | import asyncio 89 | from computer.providers.cloud.provider import CloudProvider 90 | 91 | async def main(): 92 | api_key = os.getenv("CUA_API_KEY") or "your-api-key" 93 | name = "my-vm-name" # e.g., "m-linux-96lcxd2c2k" 94 | 95 | provider = CloudProvider(api_key=api_key) 96 | async with provider: 97 | resp = await provider.run_vm(name) 98 | print(resp) # { "name": name, "status": "starting" } 99 | 100 | if __name__ == "__main__": 101 | asyncio.run(main()) 102 | ``` 103 | 104 | </Tab> 105 | <Tab value="curl"> 106 | 107 | ```bash 108 | curl -X POST \ 109 | -H "Authorization: Bearer $CUA_API_KEY" \ 110 | "https://api.cua.ai/v1/vms/my-vm-name/start" -i 111 | ``` 112 | 113 | Responses: 114 | - 204: No Content (start accepted) 115 | - 401: Unauthorized (missing/invalid API key) 116 | - 404: VM not found or not owned by the user 117 | 118 | ```text 119 | HTTP/1.1 204 No Content 120 | ``` 121 | 122 | </Tab> 123 | </Tabs> 124 | 125 | --- 126 | 127 | ## Stop a VM 128 | Stops the VM asynchronously. 
129 | 130 | <Tabs items={["Python", "curl"]}> 131 | <Tab value="Python"> 132 | 133 | ```python 134 | import os 135 | import asyncio 136 | from computer.providers.cloud.provider import CloudProvider 137 | 138 | async def main(): 139 | api_key = os.getenv("CUA_API_KEY") or "your-api-key" 140 | name = "my-vm-name" 141 | 142 | provider = CloudProvider(api_key=api_key) 143 | async with provider: 144 | resp = await provider.stop_vm(name) 145 | print(resp) # { "name": name, "status": "stopping" } 146 | 147 | if __name__ == "__main__": 148 | asyncio.run(main()) 149 | ``` 150 | 151 | </Tab> 152 | <Tab value="curl"> 153 | 154 | ```bash 155 | curl -X POST \ 156 | -H "Authorization: Bearer $CUA_API_KEY" \ 157 | "https://api.cua.ai/v1/vms/my-vm-name/stop" 158 | ``` 159 | 160 | Responses: 161 | - 202: Accepted with `{ "status": "stopping" }` 162 | - 401: Unauthorized (missing/invalid API key) 163 | - 404: VM not found or not owned by the user 164 | 165 | ```json 166 | { "status": "stopping" } 167 | ``` 168 | 169 | </Tab> 170 | </Tabs> 171 | 172 | --- 173 | 174 | ## Restart a VM 175 | Restarts the VM asynchronously. 
176 | 177 | <Tabs items={["Python", "curl"]}> 178 | <Tab value="Python"> 179 | 180 | ```python 181 | import os 182 | import asyncio 183 | from computer.providers.cloud.provider import CloudProvider 184 | 185 | async def main(): 186 | api_key = os.getenv("CUA_API_KEY") or "your-api-key" 187 | name = "my-vm-name" 188 | 189 | provider = CloudProvider(api_key=api_key) 190 | async with provider: 191 | resp = await provider.restart_vm(name) 192 | print(resp) # { "name": name, "status": "restarting" } 193 | 194 | if __name__ == "__main__": 195 | asyncio.run(main()) 196 | ``` 197 | 198 | </Tab> 199 | <Tab value="curl"> 200 | 201 | ```bash 202 | curl -X POST \ 203 | -H "Authorization: Bearer $CUA_API_KEY" \ 204 | "https://api.cua.ai/v1/vms/my-vm-name/restart" 205 | ``` 206 | 207 | Responses: 208 | - 202: Accepted with `{ "status": "restarting" }` 209 | - 401: Unauthorized (missing/invalid API key) 210 | - 404: VM not found or not owned by the user 211 | 212 | ```json 213 | { "status": "restarting" } 214 | ``` 215 | 216 | </Tab> 217 | </Tabs> 218 | 219 | --- 220 | 221 | ## Query a VM by name 222 | Query the computer-server running on the VM. Useful for checking details like status or OS type. 
223 | 224 | <Tabs items={["Python", "curl"]}> 225 | <Tab value="Python"> 226 | 227 | ```python 228 | import os 229 | import asyncio 230 | from computer.providers.cloud.provider import CloudProvider 231 | 232 | async def main(): 233 | api_key = os.getenv("CUA_API_KEY") or "your-api-key" 234 | name = "my-vm-name" 235 | 236 | provider = CloudProvider(api_key=api_key) 237 | async with provider: 238 | info = await provider.get_vm(name) 239 | print(info) 240 | 241 | if __name__ == "__main__": 242 | asyncio.run(main()) 243 | ``` 244 | 245 | </Tab> 246 | <Tab value="curl"> 247 | 248 | ```bash 249 | curl "https://my-vm-name.containers.cloud.cua.ai:8443/status" 250 | ``` 251 | 252 | Responses: 253 | - 200: Server available 254 | 255 | ```json 256 | { "status": "ok", "os_type": "linux", "features": ["agent"] } 257 | ``` 258 | 259 | </Tab> 260 | </Tabs> 261 | ``` -------------------------------------------------------------------------------- /libs/typescript/agent/src/client.ts: -------------------------------------------------------------------------------- ```typescript 1 | import {Peer} from "peerjs"; 2 | import type { 3 | AgentRequest, 4 | AgentResponse, 5 | ConnectionType, 6 | AgentClientOptions, 7 | } from "./types"; 8 | 9 | export class AgentClient { 10 | private url: string; 11 | private connectionType: ConnectionType; 12 | private options: AgentClientOptions; 13 | private peer?: Peer; 14 | private connection?: any; 15 | 16 | constructor(url: string, options: AgentClientOptions = {}) { 17 | this.url = url; 18 | this.options = { 19 | timeout: 30000, 20 | retries: 3, 21 | ...options, 22 | }; 23 | 24 | // Determine connection type from URL 25 | if (url.startsWith("http://") || url.startsWith("https://")) { 26 | this.connectionType = url.startsWith("https://") ? "https" : "http"; 27 | } else if (url.startsWith("peer://")) { 28 | this.connectionType = "peer"; 29 | } else { 30 | throw new Error( 31 | "Invalid URL format. 
Must start with http://, https://, or peer://" 32 | ); 33 | } 34 | } 35 | 36 | // Main responses API matching the desired usage pattern 37 | public responses = { 38 | create: async (request: AgentRequest): Promise<AgentResponse> => { 39 | return this.sendRequest(request); 40 | }, 41 | }; 42 | 43 | private async sendRequest(request: AgentRequest): Promise<AgentResponse> { 44 | switch (this.connectionType) { 45 | case "http": 46 | case "https": 47 | return this.sendHttpRequest(request); 48 | case "peer": 49 | return this.sendPeerRequest(request); 50 | default: 51 | throw new Error(`Unsupported connection type: ${this.connectionType}`); 52 | } 53 | } 54 | 55 | private async sendHttpRequest(request: AgentRequest): Promise<AgentResponse> { 56 | const controller = new AbortController(); 57 | const timeoutId = setTimeout( 58 | () => controller.abort(), 59 | this.options.timeout 60 | ); 61 | 62 | try { 63 | const headers: Record<string, string> = { 64 | "Content-Type": "application/json", 65 | }; 66 | if (this.options.apiKey) { 67 | headers["X-API-Key"] = this.options.apiKey; 68 | } 69 | 70 | const response = await fetch(`${this.url}/responses`, { 71 | method: "POST", 72 | headers, 73 | body: JSON.stringify(request), 74 | signal: controller.signal, 75 | }); 76 | 77 | clearTimeout(timeoutId); 78 | 79 | if (!response.ok) { 80 | throw new Error(`HTTP error! 
status: ${response.status}`); 81 | } 82 | 83 | const data = await response.json(); 84 | return data as AgentResponse; 85 | } catch (error) { 86 | clearTimeout(timeoutId); 87 | if (error instanceof Error) { 88 | throw new Error(`Failed to send HTTP request: ${error.message}`); 89 | } 90 | throw error; 91 | } 92 | } 93 | 94 | private async sendPeerRequest(request: AgentRequest): Promise<AgentResponse> { 95 | // Extract peer ID from peer:// URL 96 | const peerId = this.url.replace("peer://", ""); 97 | 98 | if (!this.peer) { 99 | // Initialize peer connection with default options as requested 100 | this.peer = new Peer(); 101 | 102 | return new Promise<AgentResponse>((resolve, reject) => { 103 | const timeout = setTimeout(() => { 104 | reject(new Error("Peer connection timeout")); 105 | }, this.options.timeout); 106 | 107 | this.peer!.on("open", () => { 108 | // Connect to the target peer 109 | this.connection = this.peer!.connect(peerId); 110 | 111 | this.connection.on("open", () => { 112 | // Send the request 113 | this.connection!.send(JSON.stringify(request)); 114 | }); 115 | 116 | this.connection.on("data", (data: any) => { 117 | clearTimeout(timeout); 118 | try { 119 | const response = 120 | typeof data === "string" ? 
JSON.parse(data) : data; 121 | resolve(response as AgentResponse); 122 | } catch (error) { 123 | reject(new Error("Failed to parse peer response")); 124 | } 125 | }); 126 | 127 | this.connection.on("error", (error: any) => { 128 | clearTimeout(timeout); 129 | reject(new Error(`Peer connection error: ${error}`)); 130 | }); 131 | }); 132 | 133 | this.peer!.on("error", (error: any) => { 134 | clearTimeout(timeout); 135 | reject(new Error(`Peer error: ${error}`)); 136 | }); 137 | }); 138 | } else { 139 | // Reuse existing connection 140 | return new Promise<AgentResponse>((resolve, reject) => { 141 | const timeout = setTimeout(() => { 142 | reject(new Error("Peer request timeout")); 143 | }, this.options.timeout); 144 | 145 | if (this.connection && this.connection.open) { 146 | this.connection.send(JSON.stringify(request)); 147 | 148 | const handleData = (data: any) => { 149 | clearTimeout(timeout); 150 | this.connection!.off("data", handleData); 151 | try { 152 | const response = 153 | typeof data === "string" ? JSON.parse(data) : data; 154 | resolve(response as AgentResponse); 155 | } catch (error) { 156 | reject(new Error("Failed to parse peer response")); 157 | } 158 | }; 159 | 160 | this.connection.on("data", handleData); 161 | } else { 162 | clearTimeout(timeout); 163 | reject(new Error("Peer connection not available")); 164 | } 165 | }); 166 | } 167 | } 168 | 169 | // Health check method 170 | async health(): Promise<{ status: string }> { 171 | if (this.connectionType === "peer") { 172 | return { status: this.peer?.open ? 
"connected" : "disconnected" }; 173 | } 174 | 175 | try { 176 | const response = await fetch(`${this.url}/health`); 177 | if (response.ok) { 178 | return { status: "healthy" }; 179 | } 180 | return { status: "unhealthy" }; 181 | } catch { 182 | return { status: "unreachable" }; 183 | } 184 | } 185 | 186 | // Clean up resources 187 | async disconnect(): Promise<void> { 188 | if (this.connection) { 189 | this.connection.close(); 190 | this.connection = undefined; 191 | } 192 | if (this.peer) { 193 | this.peer.destroy(); 194 | this.peer = undefined; 195 | } 196 | } 197 | } 198 | ``` -------------------------------------------------------------------------------- /scripts/build-uv.sh: -------------------------------------------------------------------------------- ```bash 1 | #!/bin/bash 2 | 3 | # Exit on error 4 | set -e 5 | 6 | # Colors for output 7 | RED='\033[0;31m' 8 | GREEN='\033[0;32m' 9 | BLUE='\033[0;34m' 10 | YELLOW='\033[1;33m' 11 | NC='\033[0m' # No Color 12 | 13 | # Function to print step information 14 | print_step() { 15 | echo -e "${BLUE}==> $1${NC}" 16 | } 17 | 18 | # Function to print success message 19 | print_success() { 20 | echo -e "${GREEN}==> Success: $1${NC}" 21 | } 22 | 23 | # Function to print error message 24 | print_error() { 25 | echo -e "${RED}==> Error: $1${NC}" >&2 26 | } 27 | 28 | # Function to print warning message 29 | print_warning() { 30 | echo -e "${YELLOW}==> Warning: $1${NC}" 31 | } 32 | 33 | # Function to check if UV is installed 34 | check_uv() { 35 | if command -v uv &> /dev/null; then 36 | print_success "UV is already installed" 37 | uv --version 38 | return 0 39 | else 40 | return 1 41 | fi 42 | } 43 | 44 | # Function to install UV 45 | install_uv() { 46 | print_step "UV not found. Installing UV..." 47 | 48 | # Detect OS 49 | if [[ "$OSTYPE" == "linux-gnu"* ]] || [[ "$OSTYPE" == "darwin"* ]]; then 50 | print_step "Installing UV for Unix-like system..." 
51 | curl -LsSf https://astral.sh/uv/install.sh | sh 52 | 53 | # Add UV to PATH for current session 54 | export PATH="$HOME/.cargo/bin:$PATH" 55 | 56 | # Check if installation was successful 57 | if command -v uv &> /dev/null; then 58 | print_success "UV installed successfully" 59 | uv --version 60 | else 61 | print_error "UV installation failed" 62 | print_step "Please restart your terminal and try again, or install manually:" 63 | echo " curl -LsSf https://astral.sh/uv/install.sh | sh" 64 | exit 1 65 | fi 66 | elif [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]]; then 67 | print_error "For Windows, please use PowerShell and run:" 68 | echo " powershell -ExecutionPolicy ByPass -c \"irm https://astral.sh/uv/install.ps1 | iex\"" 69 | exit 1 70 | else 71 | print_error "Unsupported operating system: $OSTYPE" 72 | print_step "Please install UV manually from: https://docs.astral.sh/uv/getting-started/installation/" 73 | exit 1 74 | fi 75 | } 76 | 77 | # Get the script's directory 78 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 79 | PROJECT_ROOT="$( cd "${SCRIPT_DIR}/.." && pwd )" 80 | 81 | # Change to project root 82 | cd "$PROJECT_ROOT" 83 | 84 | # Check if UV is installed, install if not 85 | if ! check_uv; then 86 | install_uv 87 | fi 88 | 89 | # Load environment variables from .env.local 90 | if [ -f .env.local ]; then 91 | print_step "Loading environment variables from .env.local..." 92 | set -a 93 | source .env.local 94 | set +a 95 | print_success "Environment variables loaded" 96 | else 97 | print_error ".env.local file not found" 98 | exit 1 99 | fi 100 | 101 | # Clean up existing environments and cache 102 | print_step "Cleaning up existing environments..." 103 | find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true 104 | find . -type d -name ".pytest_cache" -exec rm -rf {} + 2>/dev/null || true 105 | find . -type d -name "dist" -exec rm -rf {} + 2>/dev/null || true 106 | find . 
-type d -name ".venv" -exec rm -rf {} + 2>/dev/null || true 107 | find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true 108 | print_success "Environment cleanup complete" 109 | 110 | # Install Python 3.12 using UV 111 | print_step "Installing Python 3.12 using UV..." 112 | uv python install 3.12 113 | print_success "Python 3.12 installed" 114 | 115 | # Create virtual environment using UV 116 | print_step "Creating virtual environment with UV..." 117 | uv venv .venv --python 3.12 118 | print_success "Virtual environment created" 119 | 120 | # Activate virtual environment 121 | print_step "Activating virtual environment..." 122 | source .venv/bin/activate 123 | print_success "Virtual environment activated" 124 | 125 | # Function to install a package and its dependencies using UV 126 | install_package() { 127 | local package_dir=$1 128 | local package_name=$2 129 | local extras=$3 130 | print_step "Installing ${package_name} with UV..." 131 | cd "$package_dir" 132 | 133 | if [ -f "pyproject.toml" ]; then 134 | if [ -n "$extras" ]; then 135 | uv pip install -e ".[${extras}]" 136 | else 137 | uv pip install -e . 138 | fi 139 | else 140 | print_error "No pyproject.toml found in ${package_dir}" 141 | return 1 142 | fi 143 | 144 | cd "$PROJECT_ROOT" 145 | } 146 | 147 | # Install packages in order of dependency 148 | print_step "Installing packages in development mode with UV..." 
149 | 150 | # Install core first (base package with telemetry support) 151 | install_package "libs/python/core" "core" 152 | 153 | # Install pylume (base dependency) 154 | install_package "libs/python/pylume" "pylume" 155 | 156 | # Install computer with all its dependencies and extras 157 | install_package "libs/python/computer" "computer" "all" 158 | 159 | # Install omniparser 160 | install_package "libs/python/som" "som" 161 | 162 | # Install agent with all its dependencies and extras 163 | install_package "libs/python/agent" "agent" "all" 164 | 165 | # Install computer-server 166 | install_package "libs/python/computer-server" "computer-server" 167 | 168 | # Install mcp-server 169 | install_package "libs/python/mcp-server" "mcp-server" 170 | 171 | # Install development tools from root project 172 | print_step "Installing development dependencies with UV..." 173 | uv pip install -e ".[dev,test,docs]" 174 | 175 | # Create a .env file for VS Code to use the virtual environment 176 | print_step "Creating .env file for VS Code..." 177 | echo "PYTHONPATH=${PROJECT_ROOT}/libs/python/core:${PROJECT_ROOT}/libs/python/computer:${PROJECT_ROOT}/libs/python/agent:${PROJECT_ROOT}/libs/python/som:${PROJECT_ROOT}/libs/python/pylume:${PROJECT_ROOT}/libs/python/computer-server:${PROJECT_ROOT}/libs/python/mcp-server" > .env 178 | 179 | print_success "All packages installed successfully with UV!" 180 | print_step "Your virtual environment is ready. To activate it:" 181 | echo " source .venv/bin/activate" 182 | print_step "UV provides fast dependency resolution and installation." 183 | print_step "You can also use 'uv run' to run commands in the virtual environment without activation." 
184 | ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/winsandbox/setup_script.ps1: -------------------------------------------------------------------------------- ``` 1 | # Setup script for Windows Sandbox CUA Computer provider 2 | # This script runs when the sandbox starts 3 | 4 | Write-Host "Starting CUA Computer setup in Windows Sandbox..." 5 | 6 | # Function to find the mapped Python installation from pywinsandbox 7 | function Find-MappedPython { 8 | Write-Host "Looking for mapped Python installation from pywinsandbox..." 9 | 10 | # pywinsandbox maps the host Python installation to the sandbox 11 | # Look for mapped shared folders on the desktop (common pywinsandbox pattern) 12 | $desktopPath = "C:\Users\WDAGUtilityAccount\Desktop" 13 | $sharedFolders = Get-ChildItem -Path $desktopPath -Directory -ErrorAction SilentlyContinue 14 | 15 | foreach ($folder in $sharedFolders) { 16 | # Look for Python executables in shared folders 17 | $pythonPaths = @( 18 | "$($folder.FullName)\python.exe", 19 | "$($folder.FullName)\Scripts\python.exe", 20 | "$($folder.FullName)\bin\python.exe" 21 | ) 22 | 23 | foreach ($pythonPath in $pythonPaths) { 24 | if (Test-Path $pythonPath) { 25 | try { 26 | $version = & $pythonPath --version 2>&1 27 | if ($version -match "Python") { 28 | Write-Host "Found mapped Python: $pythonPath - $version" 29 | return $pythonPath 30 | } 31 | } catch { 32 | continue 33 | } 34 | } 35 | } 36 | 37 | # Also check subdirectories that might contain Python 38 | $subDirs = Get-ChildItem -Path $folder.FullName -Directory -ErrorAction SilentlyContinue 39 | foreach ($subDir in $subDirs) { 40 | $pythonPath = "$($subDir.FullName)\python.exe" 41 | if (Test-Path $pythonPath) { 42 | try { 43 | $version = & $pythonPath --version 2>&1 44 | if ($version -match "Python") { 45 | Write-Host "Found mapped Python in subdirectory: $pythonPath - $version" 46 | return $pythonPath 47 | } 48 | } catch { 
49 | continue 50 | } 51 | } 52 | } 53 | } 54 | 55 | # Fallback: try common Python commands that might be available 56 | $pythonCommands = @("python", "py", "python3") 57 | foreach ($cmd in $pythonCommands) { 58 | try { 59 | $version = & $cmd --version 2>&1 60 | if ($version -match "Python") { 61 | Write-Host "Found Python via command '$cmd': $version" 62 | return $cmd 63 | } 64 | } catch { 65 | continue 66 | } 67 | } 68 | 69 | throw "Could not find any Python installation (mapped or otherwise)" 70 | } 71 | 72 | try { 73 | # Step 1: Find the mapped Python installation 74 | Write-Host "Step 1: Finding mapped Python installation..." 75 | $pythonExe = Find-MappedPython 76 | Write-Host "Using Python: $pythonExe" 77 | 78 | # Verify Python works and show version 79 | $pythonVersion = & $pythonExe --version 2>&1 80 | Write-Host "Python version: $pythonVersion" 81 | 82 | # Step 2: Create a dedicated virtual environment in mapped Desktop folder (persistent) 83 | Write-Host "Step 2: Creating virtual environment (if needed)..." 
84 | $cachePath = "C:\Users\WDAGUtilityAccount\Desktop\wsb_cache" 85 | $venvPath = "C:\Users\WDAGUtilityAccount\Desktop\wsb_cache\venv" 86 | if (!(Test-Path $venvPath)) { 87 | Write-Host "Creating venv at: $venvPath" 88 | & $pythonExe -m venv $venvPath 89 | } else { 90 | Write-Host "Venv already exists at: $venvPath" 91 | } 92 | # Hide the folder to keep Desktop clean 93 | try { 94 | $item = Get-Item $cachePath -ErrorAction SilentlyContinue 95 | if ($item) { 96 | if (-not ($item.Attributes -band [IO.FileAttributes]::Hidden)) { 97 | $item.Attributes = $item.Attributes -bor [IO.FileAttributes]::Hidden 98 | } 99 | } 100 | } catch { } 101 | $venvPython = Join-Path $venvPath "Scripts\python.exe" 102 | if (!(Test-Path $venvPython)) { 103 | throw "Virtual environment Python not found at $venvPython" 104 | } 105 | Write-Host "Using venv Python: $venvPython" 106 | 107 | # Step 3: Install cua-computer-server into the venv 108 | Write-Host "Step 3: Installing cua-computer-server..." 109 | 110 | Write-Host "Upgrading pip..." 111 | & $venvPython -m pip install --upgrade pip --quiet 112 | 113 | Write-Host "Installing cua-computer-server..." 114 | & $venvPython -m pip install cua-computer-server 115 | 116 | Write-Host "cua-computer-server installation completed." 117 | 118 | # Step 4: Start computer server in background using the venv Python 119 | Write-Host "Step 4: Starting computer server in background..." 
120 | Write-Host "Starting computer server with: $venvPython" 121 | 122 | # Start the computer server in the background 123 | $serverProcess = Start-Process -FilePath $venvPython -ArgumentList "-m", "computer_server.main" -WindowStyle Hidden -PassThru 124 | Write-Host "Computer server started in background with PID: $($serverProcess.Id)" 125 | 126 | # Give it a moment to start 127 | Start-Sleep -Seconds 3 128 | 129 | # Check if the process is still running 130 | if (Get-Process -Id $serverProcess.Id -ErrorAction SilentlyContinue) { 131 | Write-Host "Computer server is running successfully in background" 132 | } else { 133 | throw "Computer server failed to start or exited immediately" 134 | } 135 | 136 | } catch { 137 | Write-Error "Setup failed: $_" 138 | Write-Host "Error details: $($_.Exception.Message)" 139 | Write-Host "Stack trace: $($_.ScriptStackTrace)" 140 | Write-Host "" 141 | Write-Host "Press any key to close this window..." 142 | $null = $Host.UI.RawUI.ReadKey("NoEcho,IncludeKeyDown") 143 | exit 1 144 | } 145 | 146 | Write-Host "" 147 | Write-Host "Setup completed successfully!" 148 | Write-Host "Press any key to close this window..." 
149 | $null = $Host.UI.RawUI.ReadKey("NoEcho,IncludeKeyDown") 150 | ``` -------------------------------------------------------------------------------- /libs/python/som/som/ocr.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List, Dict, Any, Tuple, Union 2 | import logging 3 | import signal 4 | from contextlib import contextmanager 5 | from pathlib import Path 6 | import easyocr 7 | from PIL import Image 8 | import numpy as np 9 | import torch 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class TimeoutException(Exception): 15 | pass 16 | 17 | 18 | @contextmanager 19 | def timeout(seconds: int): 20 | import threading 21 | 22 | # Check if we're in the main thread 23 | if threading.current_thread() is threading.main_thread(): 24 | def timeout_handler(signum, frame): 25 | raise TimeoutException("OCR process timed out") 26 | 27 | original_handler = signal.signal(signal.SIGALRM, timeout_handler) 28 | signal.alarm(seconds) 29 | 30 | try: 31 | yield 32 | finally: 33 | signal.alarm(0) 34 | signal.signal(signal.SIGALRM, original_handler) 35 | else: 36 | # In a non-main thread, we can't use signal 37 | logger.warning("Timeout function called from non-main thread; signal-based timeout disabled") 38 | try: 39 | yield 40 | finally: 41 | pass 42 | 43 | 44 | class OCRProcessor: 45 | """Class for handling OCR text detection.""" 46 | 47 | _shared_reader = None # Class-level shared reader instance 48 | 49 | def __init__(self): 50 | """Initialize the OCR processor.""" 51 | self.reader = None 52 | # Determine best available device 53 | self.device = "cpu" 54 | if torch.cuda.is_available(): 55 | self.device = "cuda" 56 | elif ( 57 | hasattr(torch, "backends") 58 | and hasattr(torch.backends, "mps") 59 | and torch.backends.mps.is_available() 60 | ): 61 | self.device = "mps" 62 | logger.info(f"OCR processor initialized with device: {self.device}") 63 | 64 | def _ensure_reader(self): 65 | """Ensure 
EasyOCR reader is initialized. 66 | 67 | Uses a class-level cached reader to avoid reinitializing on every instance. 68 | """ 69 | # First check if we already have a class-level reader 70 | if OCRProcessor._shared_reader is not None: 71 | self.reader = OCRProcessor._shared_reader 72 | return 73 | 74 | # Otherwise initialize a new one 75 | if self.reader is None: 76 | try: 77 | logger.info("Initializing EasyOCR reader...") 78 | import easyocr 79 | 80 | # Use GPU if available 81 | use_gpu = self.device in ["cuda", "mps"] 82 | self.reader = easyocr.Reader(["en"], gpu=use_gpu) 83 | 84 | # Verify reader initialization 85 | if self.reader is None: 86 | raise ValueError("Failed to initialize EasyOCR reader") 87 | 88 | # Cache the reader at class level 89 | OCRProcessor._shared_reader = self.reader 90 | 91 | logger.info(f"EasyOCR reader initialized successfully with GPU={use_gpu}") 92 | except Exception as e: 93 | logger.error(f"Failed to initialize EasyOCR reader: {str(e)}") 94 | # Set to a placeholder that will be checked 95 | self.reader = None 96 | raise RuntimeError(f"EasyOCR initialization failed: {str(e)}") from e 97 | 98 | def detect_text( 99 | self, image: Image.Image, confidence_threshold: float = 0.5, timeout_seconds: int = 5 100 | ) -> List[Dict[str, Any]]: 101 | """Detect text in an image using EasyOCR. 
102 | 103 | Args: 104 | image: PIL Image to process 105 | confidence_threshold: Minimum confidence for text detection 106 | timeout_seconds: Maximum time to wait for OCR 107 | 108 | Returns: 109 | List of text detection dictionaries 110 | """ 111 | try: 112 | # Try to initialize reader, catch any exceptions 113 | try: 114 | self._ensure_reader() 115 | except Exception as e: 116 | logger.error(f"Failed to initialize OCR reader: {str(e)}") 117 | return [] 118 | 119 | # Ensure reader was properly initialized 120 | if self.reader is None: 121 | logger.error("OCR reader is None after initialization") 122 | return [] 123 | 124 | # Convert PIL Image to numpy array 125 | image_np = np.array(image) 126 | 127 | try: 128 | with timeout(timeout_seconds): 129 | results = self.reader.readtext( 130 | image_np, paragraph=False, text_threshold=confidence_threshold 131 | ) 132 | except TimeoutException: 133 | logger.warning("OCR timed out") 134 | return [] 135 | except Exception as e: 136 | logger.warning(f"OCR failed: {str(e)}") 137 | return [] 138 | 139 | detections = [] 140 | img_width, img_height = image.size 141 | 142 | for box, text, conf in results: 143 | # Ensure conf is float 144 | conf_float = float(conf) 145 | if conf_float < confidence_threshold: 146 | continue 147 | 148 | # Convert box format to [x1, y1, x2, y2] 149 | # Ensure box points are properly typed as float 150 | x1 = min(float(point[0]) for point in box) / img_width 151 | y1 = min(float(point[1]) for point in box) / img_height 152 | x2 = max(float(point[0]) for point in box) / img_width 153 | y2 = max(float(point[1]) for point in box) / img_height 154 | 155 | detections.append( 156 | { 157 | "type": "text", 158 | "bbox": [x1, y1, x2, y2], 159 | "content": text, 160 | "confidence": conf, 161 | "interactivity": False, # Text is typically non-interactive 162 | } 163 | ) 164 | 165 | return detections 166 | except Exception as e: 167 | logger.error(f"Unexpected error in OCR processing: {str(e)}") 168 | return [] 169 
name: Publish MCP Server Package

on:
  push:
    tags:
      - "mcp-server-v*"
  workflow_dispatch:
    inputs:
      version:
        description: "Version to publish (without v prefix)"
        required: true
        default: "0.1.0"
  workflow_call:
    inputs:
      version:
        description: "Version to publish"
        required: true
        type: string
    outputs:
      version:
        description: "The version that was published"
        value: ${{ jobs.prepare.outputs.version }}

# Adding permissions at workflow level
permissions:
  contents: write

jobs:
  # Resolves the version to publish and pins the mcp-server package's
  # cua-agent / cua-computer dependency ranges to the latest PyPI releases.
  prepare:
    runs-on: macos-latest
    outputs:
      version: ${{ steps.get-version.outputs.version }}
      agent_version: ${{ steps.update-deps.outputs.agent_version }}
      computer_version: ${{ steps.update-deps.outputs.computer_version }}
    steps:
      - uses: actions/checkout@v4

      - name: Determine version
        id: get-version
        run: |
          if [ "${{ github.event_name }}" == "push" ]; then
            # Extract version from tag (for package-specific tags)
            if [[ "${{ github.ref }}" =~ ^refs/tags/mcp-server-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
              VERSION=${BASH_REMATCH[1]}
            else
              echo "Invalid tag format for mcp-server"
              exit 1
            fi
          elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
            # Use version from workflow dispatch. Quoted so the expansion
            # survives word splitting and cannot break the assignment if the
            # input ever contains whitespace or shell metacharacters.
            VERSION="${{ github.event.inputs.version }}"
          else
            # Use version from workflow_call (quoted for the same reason)
            VERSION="${{ inputs.version }}"
          fi
          echo "VERSION=$VERSION"
          echo "version=$VERSION" >> $GITHUB_OUTPUT

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      - name: Update dependencies to latest versions
        id: update-deps
        run: |
          cd libs/python/mcp-server

          # Install required package for PyPI API access
          pip install requests

          # Create a Python script for PyPI version checking
          cat > get_latest_versions.py << 'EOF'
          import requests
          import json
          import sys

          def get_package_version(package_name, fallback="0.1.0"):
              try:
                  response = requests.get(f'https://pypi.org/pypi/{package_name}/json')
                  print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr)

                  if response.status_code != 200:
                      print(f"API request failed for {package_name}, using fallback version", file=sys.stderr)
                      return fallback

                  data = json.loads(response.text)

                  if 'info' not in data:
                      print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr)
                      return fallback

                  return data['info']['version']
              except Exception as e:
                  print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr)
                  return fallback

          # Get latest versions
          print(get_package_version('cua-agent'))
          print(get_package_version('cua-computer'))
          EOF

          # Execute the script to get the versions
          VERSIONS=($(python get_latest_versions.py))
          LATEST_AGENT=${VERSIONS[0]}
          LATEST_COMPUTER=${VERSIONS[1]}

          echo "Latest cua-agent version: $LATEST_AGENT"
          echo "Latest cua-computer version: $LATEST_COMPUTER"

          # Output the versions for the next job
          echo "agent_version=$LATEST_AGENT" >> $GITHUB_OUTPUT
          echo "computer_version=$LATEST_COMPUTER" >> $GITHUB_OUTPUT

          # Determine major version for version constraint
          AGENT_MAJOR=$(echo $LATEST_AGENT | cut -d. -f1)
          COMPUTER_MAJOR=$(echo $LATEST_COMPUTER | cut -d. -f1)

          NEXT_AGENT_MAJOR=$((AGENT_MAJOR + 1))
          NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1))

          # Update dependencies in pyproject.toml
          if [[ "$OSTYPE" == "darwin"* ]]; then
            # macOS version of sed needs an empty string for -i
            # Update cua-agent with all extras
            sed -i '' "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml
            sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
          else
            # Linux version
            sed -i "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml
            sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
          fi

          # Display the updated dependencies
          echo "Updated dependencies in pyproject.toml:"
          grep -E "cua-agent|cua-computer" pyproject.toml

  publish:
    needs: prepare
    uses: ./.github/workflows/pypi-reusable-publish.yml
    with:
      package_name: "mcp-server"
      package_dir: "libs/python/mcp-server"
      version: ${{ needs.prepare.outputs.version }}
      is_lume_package: false
      base_package_name: "cua-mcp-server"
    secrets:
      PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}

  set-env-variables:
    needs: [prepare, publish]
    runs-on: macos-latest
    steps:
      - name: Set environment variables for use in other jobs
        run: |
          # NOTE(review): GITHUB_ENV is scoped to the current job only, so
          # these values do NOT propagate to other jobs. Downstream consumers
          # should read the `prepare` job outputs instead — confirm whether
          # this job is still needed before removing it.
          echo "AGENT_VERSION=${{ needs.prepare.outputs.agent_version }}" >> $GITHUB_ENV
          echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> $GITHUB_ENV
"""
GTA1 agent loop implementation for click prediction using litellm.acompletion
Paper: https://arxiv.org/pdf/2507.05791
Code: https://github.com/Yan98/GTA1
"""

import asyncio
import json
import re
import base64
from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
from io import BytesIO
import uuid
from PIL import Image
import litellm
import math

from ..decorators import register_agent
from ..types import Messages, AgentResponse, Tools, AgentCapability
from ..loops.base import AsyncAgentConfig

SYSTEM_PROMPT = '''
You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.

Output the coordinate pair exactly:
(x,y)
'''.strip()

def extract_coordinates(raw_string: str) -> Tuple[float, float]:
    """Extract the first "(x,y)" coordinate pair from model output.

    Args:
        raw_string: Raw text emitted by the model.

    Returns:
        (x, y) as floats, or (0.0, 0.0) when no coordinate pair is found.
    """
    # Fixed: previously a bare `except:` swallowed every exception
    # (including KeyboardInterrupt); the only realistic failure is "no
    # match", which we test for explicitly. The regex only matches
    # numeric text, so float() cannot fail on a captured group.
    matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
    if not matches:
        return (0.0, 0.0)
    return (float(matches[0][0]), float(matches[0][1]))

def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360) -> Tuple[int, int]:
    """Rescale (height, width) so the total pixel count lies within
    [min_pixels, max_pixels] and each side is a multiple of ``factor``
    (similar to qwen_vl_utils.smart_resize).

    Args:
        height: Original image height in pixels.
        width: Original image width in pixels.
        factor: Returned dimensions are multiples of this value.
        min_pixels: Lower bound on total pixels.
        max_pixels: Upper bound on total pixels.

    Returns:
        (new_height, new_width), each >= factor and a multiple of factor.
    """
    # Calculate the total pixels
    total_pixels = height * width

    # If already within bounds, just round down to the nearest factor.
    if min_pixels <= total_pixels <= max_pixels:
        # Fixed: clamp to at least one factor — previously a dimension
        # smaller than `factor` rounded down to 0 in this branch.
        new_height = max((height // factor) * factor, factor)
        new_width = max((width // factor) * factor, factor)
        return new_height, new_width

    # Calculate scaling factor toward the violated bound.
    if total_pixels > max_pixels:
        scale = (max_pixels / total_pixels) ** 0.5
    else:
        scale = (min_pixels / total_pixels) ** 0.5

    # Apply scaling
    new_height = int(height * scale)
    new_width = int(width * scale)

    # Round to nearest factor and ensure a minimum of one factor per side.
    new_height = max((new_height // factor) * factor, factor)
    new_width = max((new_width // factor) * factor, factor)

    return new_height, new_width

@register_agent(models=r".*GTA1.*")
class GTA1Config(AsyncAgentConfig):
    """GTA1 agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        # Last model name used and last screenshot seen (currently unused
        # bookkeeping slots kept for interface parity with other loops).
        self.current_model = None
        self.last_screenshot_b64 = None


    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs
    ) -> Dict[str, Any]:
        """Not supported: GTA1 is a grounding-only (click prediction) model."""
        raise NotImplementedError()

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs
    ) -> Optional[Tuple[float, float]]:
        """
        Predict click coordinates using GTA1 model via litellm.acompletion.

        Args:
            model: The GTA1 model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        # Decode base64 image
        image_data = base64.b64decode(image_b64)
        image = Image.open(BytesIO(image_data))
        width, height = image.width, image.height

        # Smart resize the image (similar to qwen_vl_utils)
        resized_height, resized_width = smart_resize(
            height, width,
            factor=28,  # Default factor for Qwen models
            min_pixels=3136,
            max_pixels=4096 * 2160
        )
        resized_image = image.resize((resized_width, resized_height))
        # Factors to map predictions back to original-image coordinates.
        scale_x, scale_y = width / resized_width, height / resized_height

        # Convert resized image back to base64
        buffered = BytesIO()
        resized_image.save(buffered, format="PNG")
        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()

        # System prompt tells the model the (resized) resolution it sees.
        system_message = {
            "role": "system",
            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width)
        }

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{resized_image_b64}"
                    }
                },
                {
                    "type": "text",
                    "text": instruction
                }
            ]
        }

        # Prepare API call kwargs.
        # NOTE(review): max_tokens=2056 looks like a typo for 2048 — harmless
        # either way, confirm before changing.
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
            **kwargs
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Extract response text
        output_text = response.choices[0].message.content  # type: ignore

        # Extract coordinates and rescale to the original image size.
        pred_x, pred_y = extract_coordinates(output_text)  # type: ignore
        pred_x *= scale_x
        pred_y *= scale_y

        return (math.floor(pred_x), math.floor(pred_y))

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]
"""
GTA1 model implementation for benchmarking.
"""

from typing import Optional, Tuple
from PIL import Image
import torch
import re
import gc
from qwen_vl_utils import process_vision_info, smart_resize
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

from .base import ModelProtocol


class GTA1Model:
    """Ground truth GTA1 model implementation."""

    def __init__(self, model_path: str = "HelloKKMe/GTA1-7B"):
        # Hugging Face repo id of the checkpoint to load.
        self.model_path = model_path
        # Model and processor are loaded lazily (see load_model).
        self.model = None
        self.processor = None
        # Coordinate output is short, so a small generation budget suffices.
        self.max_new_tokens = 32

        self.system_prompt = '''
You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.

Output the coordinate pair exactly:
(x,y)
'''.strip()

    @property
    def model_name(self) -> str:
        """Return the name of the model."""
        return f"GTA1-{self.model_path.split('/')[-1]}"

    async def load_model(self) -> None:
        """Load the model into memory (no-op when already loaded)."""
        if self.model is None:
            print(f"Loading GTA1 model: {self.model_path}")
            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                self.model_path,
                torch_dtype=torch.bfloat16,
                device_map="auto"
            )
            self.processor = AutoProcessor.from_pretrained(
                self.model_path,
                min_pixels=3136,
                max_pixels=4096 * 2160
            )
            print("GTA1 model loaded successfully")

    async def unload_model(self) -> None:
        """Unload the model from memory and release GPU allocations."""
        if self.model is not None:
            print("Unloading GTA1 model from GPU...")
            del self.model
            del self.processor
            self.model = None
            self.processor = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            print("GTA1 model unloaded")

    def _extract_coordinates(self, raw_string: str) -> Tuple[int, int]:
        """Extract the first "(x,y)" pair from model output as ints.

        Returns (0, 0) when no coordinate pair can be parsed.
        """
        # Fixed: previously a bare `except:` swallowed every exception; the
        # only realistic failure is "no match", checked explicitly. The regex
        # only matches numeric text, so the conversions cannot fail.
        matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
        if not matches:
            return (0, 0)
        return (int(float(matches[0][0])), int(float(matches[0][1])))

    async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates for the given image and instruction.

        Args:
            image: PIL Image to analyze
            instruction: Text instruction describing what to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        if self.model is None or self.processor is None:
            await self.load_model()

        assert self.processor is not None
        assert self.model is not None

        try:
            width, height = image.width, image.height

            # Resize image according to processor requirements
            resized_height, resized_width = smart_resize(
                image.height,
                image.width,
                factor=self.processor.image_processor.patch_size * self.processor.image_processor.merge_size,
                min_pixels=self.processor.image_processor.min_pixels,
                max_pixels=self.processor.image_processor.max_pixels,
            )
            resized_image = image.resize((resized_width, resized_height))
            # Factors to map predictions back to original-image coordinates.
            scale_x, scale_y = width / resized_width, height / resized_height

            # Prepare messages
            system_message = {
                "role": "system",
                "content": self.system_prompt.format(height=resized_height, width=resized_width)
            }

            user_message = {
                "role": "user",
                "content": [
                    {"type": "image", "image": resized_image},
                    {"type": "text", "text": instruction}
                ]
            }

            # Process inputs
            image_inputs, video_inputs = process_vision_info([system_message, user_message])  # type: ignore
            text = self.processor.apply_chat_template(
                [system_message, user_message],
                tokenize=False,
                add_generation_prompt=True
            )
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt"
            )
            inputs = inputs.to(self.model.device)

            # Greedy decoding for deterministic coordinates.
            # (Removed temperature=1.0: it is ignored when do_sample=False
            # and only triggered a transformers warning.)
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                do_sample=False,
                use_cache=True
            )

            # Strip the prompt tokens from each returned sequence.
            # (Renamed loop variables: the original comprehension shadowed
            # the outer `output_ids`, which read correctly but was confusing.)
            generated_ids = [
                full_ids[len(prompt_ids):]
                for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )[0]

            # Extract coordinates and rescale to the original image size.
            pred_x, pred_y = self._extract_coordinates(output_text)
            pred_x = int(pred_x * scale_x)
            pred_y = int(pred_y * scale_y)

            return (pred_x, pred_y)

        except Exception as e:
            # Benchmark harness treats None as "prediction failed".
            print(f"Error in GTA1 prediction: {e}")
            return None
"""HUD integration: dataset runners and MCP-based computer agent export.

This module exposes helpers to evaluate HUD-compatible datasets and exports
the MCP-compatible computer agent implementation.

Exports:
    - run_single_task(dataset, ...)
    - run_full_dataset(dataset, ...)
    - MCPComputerAgent
"""
import time
from typing import Any, Optional

from agent.computers import is_agent_computer
from datasets import load_dataset, Dataset
from hud.datasets import Task, run_dataset
from hud import trace

from .agent import MCPComputerAgent


# ---------------------------------------------------------------------------
# Single-task runner
# ---------------------------------------------------------------------------

async def run_single_task(
    dataset: str | Dataset | list[dict[str, Any]],
    *,
    task_id: int = 0,
    model: str | None = None,
    allowed_tools: list[str] | None = None,
    max_steps: int = 10,
    # === ComputerAgent kwargs ===
    tools: list[Any] | None = None,
    custom_loop: Any | None = None,
    only_n_most_recent_images: int | None = None,
    callbacks: list[Any] | None = None,
    instructions: str | None = None,
    verbosity: int | None = None,
    trajectory_dir: str | dict | None = None,
    max_retries: int | None = 3,
    screenshot_delay: float | int = 0.5,
    use_prompt_caching: bool | None = False,
    max_trajectory_budget: float | dict | None = None,
    telemetry_enabled: bool | None = True,
) -> None:
    """Load one task from the dataset and execute it with MCPComputerAgent.

    Args:
        dataset: HF dataset name, a loaded ``Dataset``/dataset dict, or a
            list of task dicts.
        task_id: Index of the task to run.
        model: Model name forwarded to MCPComputerAgent.
        allowed_tools: Tool allow-list for the agent.
        max_steps: Maximum agent steps for this task (previously hard-coded
            to 10; the default preserves that behavior).
        Remaining keyword arguments are passed through to ComputerAgent.
    """

    # Load dataset and pick a sample. A string is treated as an HF dataset
    # name; a list is used as-is; anything else is assumed to be a mapping
    # with a "train" split.
    if isinstance(dataset, str):
        dataset = load_dataset(dataset, split="train")  # type: ignore[arg-type]
    elif not isinstance(dataset, list):
        dataset = dataset["train"]

    sample_task = dataset[task_id]  # type: ignore[index]
    task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]

    # Filter any existing Computer tools
    # The eval framework will add its own Computer tool per task
    if tools:
        tools = [
            tool
            for tool in tools
            if not is_agent_computer(tool)
        ]

    with trace(name=task_prompt):
        task = Task(**sample_task)  # type: ignore[arg-type]

        agent = MCPComputerAgent(
            model=model or "computer-use-preview",
            allowed_tools=allowed_tools or ["openai_computer"],
            # === ComputerAgent kwargs passthrough ===
            tools=tools,
            custom_loop=custom_loop,
            only_n_most_recent_images=only_n_most_recent_images,
            callbacks=callbacks,
            instructions=instructions,
            verbosity=verbosity,
            trajectory_dir=trajectory_dir,
            max_retries=max_retries,
            screenshot_delay=screenshot_delay,
            use_prompt_caching=use_prompt_caching,
            max_trajectory_budget=max_trajectory_budget,
            telemetry_enabled=telemetry_enabled,
        )
        print(f"Running: {task_prompt}")
        result = await agent.run(task, max_steps=max_steps)
        # Default of None avoids an AttributeError after a successful run
        # if the result object has no reward attribute.
        print(f"✅ Reward: {getattr(result, 'reward', None)}")


# ---------------------------------------------------------------------------
# Full-dataset runner
# ---------------------------------------------------------------------------

async def run_full_dataset(
    dataset: str | Dataset | list[dict[str, Any]],
    *,
    job_name: Optional[str] = None,
    model: str | None = None,
    allowed_tools: list[str] | None = None,
    max_concurrent: int = 30,
    max_steps: int = 50,
    split: str = "train",
    trajectory_dir: str | dict | None = None,
    # === ComputerAgent kwargs ===
    tools: list[Any] | None = None,
    custom_loop: Any | None = None,
    only_n_most_recent_images: int | None = 5,
    callbacks: list[Any] | None = None,
    instructions: str | None = None,
    verbosity: int | None = None,
    max_retries: int | None = 3,
    screenshot_delay: float | int = 0.5,
    use_prompt_caching: bool | None = False,
    max_trajectory_budget: float | dict | None = None,
    telemetry_enabled: bool | None = True,
) -> list[Any]:
    """Run evaluation across the entire dataset using hud.datasets.run_dataset.

    Args:
        dataset: HF dataset name, a loaded dataset, or a list of task dicts.
        job_name: Display name for the evaluation job (derived when omitted).
        max_concurrent: Maximum tasks evaluated in parallel.
        max_steps: Maximum agent steps per task.
        split: Split to load when ``dataset`` is a dataset name.
        Remaining keyword arguments are passed through to ComputerAgent.

    Returns:
        The list of per-task results from ``run_dataset``.
    """

    # Run with our MCP-based agent class.
    if isinstance(dataset, str):
        dataset_name = dataset.split('/')[-1]
        job_name = job_name or f"Evaluation {dataset_name}"
        dataset = load_dataset(dataset, split=split)  # type: ignore[arg-type]
    else:
        dataset_name = "custom"
        job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"

    # Filter any existing Computer tools
    # The eval framework will add its own Computer tool per task
    if tools:
        tools = [
            tool
            for tool in tools
            if not is_agent_computer(tool)
        ]

    # Execute evaluation
    return await run_dataset(
        name=job_name,
        dataset=dataset,
        agent_class=MCPComputerAgent,
        agent_config={
            "model": model,
            "allowed_tools": allowed_tools,
            "trajectory_dir": trajectory_dir,
            # === ComputerAgent kwargs passthrough ===
            "tools": tools,
            "custom_loop": custom_loop,
            "only_n_most_recent_images": only_n_most_recent_images,
            "callbacks": callbacks,
            "instructions": instructions,
            "verbosity": verbosity,
            "max_retries": max_retries,
            "screenshot_delay": screenshot_delay,
            "use_prompt_caching": use_prompt_caching,
            "max_trajectory_budget": max_trajectory_budget,
            "telemetry_enabled": telemetry_enabled,
        },
        max_concurrent=max_concurrent,
        metadata={"dataset": dataset_name},
        max_steps=max_steps,
        auto_respond=True,
    )


__all__ = [
    "run_single_task",
    "run_full_dataset",
    "MCPComputerAgent",
]
import Foundation
import Testing

@testable import lume

/// Records every `run` invocation so tests can assert on spawned processes
/// without launching anything.
class MockProcessRunner: ProcessRunner {
    // Each recorded call: the executable path and its argument list.
    var runCalls: [(executable: String, arguments: [String])] = []

    func run(executable: String, arguments: [String]) throws {
        runCalls.append((executable, arguments))
    }
}

/// Builds a minimal on-disk VM layout (disk image, nvram, config,
/// `.initialized` marker) inside `tempDir` so a VM can be constructed
/// without any real virtualization resources.
private func setupVMDirectory(_ tempDir: URL) throws -> VMDirectory {
    let vmDir = VMDirectory(Path(tempDir.path))

    // Create disk image file
    let diskPath = vmDir.diskPath
    let diskData = Data(repeating: 0, count: 1024 * 1024) // 1MB mock disk
    try diskData.write(to: diskPath.url)

    // Create nvram file
    let nvramPath = vmDir.nvramPath
    let nvramData = Data(repeating: 0, count: 1024) // 1KB mock nvram
    try nvramData.write(to: nvramPath.url)

    // Create initial config file
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    try vmDir.saveConfig(config)

    // Create .initialized file to mark VM as initialized
    let initializedPath = vmDir.dir.file(".initialized")
    try Data().write(to: initializedPath.url)

    return vmDir
}

/// Verifies a freshly constructed (never started) VM reports its directory
/// name, OS, a "stopped" status, and no VNC URL.
@MainActor
@Test("VM initialization and configuration")
func testVMInitialization() async throws {
    let tempDir = try createTempDirectory()
    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55") // Set MAC address to avoid nil
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    // Test initial state
    let details = vm.details
    #expect(details.name == vmDir.name)
    #expect(details.os == "mock-os")
    #expect(details.status == "stopped")
    #expect(details.vncUrl == nil)
}

/// Starts a mocked VM in a background task, then stops it.
/// NOTE(review): relies on a fixed 1s sleep for the VM to come up — this
/// only asserts that run/stop do not throw, not any intermediate state.
@MainActor
@Test("VM run and stop operations")
func testVMRunAndStop() async throws {
    let tempDir = try createTempDirectory()
    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 2,
        memorySize: 2048,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    // Test running VM
    let runTask = Task {
        try await vm.run(
            noDisplay: false, sharedDirectories: [], mount: nil as Path?, vncPort: 0,
            recoveryMode: false)
    }

    // Give the VM time to start
    try await Task.sleep(nanoseconds: UInt64(1e9))

    // Test stopping VM
    try await vm.stop()
    runTask.cancel()
}

/// Verifies the individual config setters (CPU, memory, MAC address)
/// mutate the in-memory VM configuration.
@MainActor
@Test("VM configuration updates")
func testVMConfigurationUpdates() async throws {
    let tempDir = try createTempDirectory()
    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    // Test CPU count update
    try vm.setCpuCount(4)
    #expect(vm.vmDirContext.config.cpuCount == 4)

    // Test memory size update
    try vm.setMemorySize(4096)
    #expect(vm.vmDirContext.config.memorySize == 4096)

    // Test MAC address update
    try vm.setMacAddress("00:11:22:33:44:66")
    #expect(vm.vmDirContext.config.macAddress == "00:11:22:33:44:66")
}

/// Runs the full `setup` flow with a fake IPSW path and checks the
/// resulting configuration (CPU, memory, disk size, MAC) was applied.
@MainActor
@Test("VM setup process")
func testVMSetup() async throws {
    let tempDir = try createTempDirectory()
    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    let expectedDiskSize: UInt64 = 64 * 1024 * 1024 * 1024 // 64 GB

    try await vm.setup(
        ipswPath: "/path/to/mock.ipsw",
        cpuCount: 2,
        memorySize: 2048,
        diskSize: expectedDiskSize,
        display: "1024x768"
    )

    #expect(vm.vmDirContext.config.cpuCount == 2)
    #expect(vm.vmDirContext.config.memorySize == 2048)
    let actualDiskSize = vm.vmDirContext.config.diskSize ?? 0
    #expect(
        actualDiskSize == expectedDiskSize,
        "Expected disk size \(expectedDiskSize), but got \(actualDiskSize)")
    #expect(vm.vmDirContext.config.macAddress == "00:11:22:33:44:55")
}

/// Creates a unique, empty temporary directory for one test's VM files.
private func createTempDirectory() throws -> URL {
    let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString)
    try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true)
    return tempDir
}
42 | 
43 | **Options:**
44 | - `--os <os>` - Operating system to install (macOS or linux, default: macOS)
45 | - `--cpu <cores>` - Number of CPU cores (default: 4)
46 | - `--memory <size>` - Memory size, e.g., 8GB (default: 4GB)
47 | - `--disk-size <size>` - Disk size, e.g., 50GB (default: 40GB)
48 | - `--display <res>` - Display resolution (default: 1024x768)
49 | - `--ipsw <path>` - Path to IPSW file or 'latest' for macOS VMs
50 | - `--storage <name>` - VM storage location to use
51 | 
52 | **Examples:**
53 | ```bash
54 | # Create macOS VM with custom specs
55 | lume create my-mac --cpu 6 --memory 16GB --disk-size 100GB
56 | 
57 | # Create Linux VM
58 | lume create my-ubuntu --os linux --cpu 2 --memory 8GB
59 | 
60 | # Create macOS VM with latest IPSW
61 | lume create my-sequoia --ipsw latest
62 | ```
63 | 
64 | ### lume run <name>
65 | Start and run a virtual machine.
66 | 
67 | **Options:**
68 | - `--no-display` - Do not start the VNC client app
69 | - `--shared-dir <dir>` - Share directory with VM (format: path[:ro|rw])
70 | - `--mount <path>` - For Linux VMs only, attach a read-only disk image
71 | - `--registry <url>` - Container registry URL (default: ghcr.io)
72 | - `--organization <org>` - Organization to pull from (default: trycua)
73 | - `--vnc-port <port>` - Port to use for the VNC server (default: 0 for auto-assign)
74 | - `--recovery-mode <boolean>` - For macOS VMs only, start VM in recovery mode (default: false)
75 | - `--storage <name>` - VM storage location to use
76 | 
77 | **Examples:**
78 | ```bash
79 | # Run VM with shared directory
80 | lume run my-vm --shared-dir /path/to/share:rw
81 | 
82 | # Run VM without display (headless)
83 | lume run my-vm --no-display
84 | 
85 | # Run macOS VM in recovery mode
86 | lume run my-mac --recovery-mode true
87 | ```
88 | 
89 | ### lume stop <name>
90 | Stop a running virtual machine.
91 | 92 | **Options:** 93 | - `--storage <name>` - VM storage location to use 94 | 95 | ### lume delete <name> 96 | Delete a virtual machine and its associated files. 97 | 98 | **Options:** 99 | - `--force` - Force deletion without confirmation 100 | - `--storage <name>` - VM storage location to use 101 | 102 | ### lume clone <name> <new-name> 103 | Create a copy of an existing virtual machine. 104 | 105 | **Options:** 106 | - `--source-storage <name>` - Source VM storage location 107 | - `--dest-storage <name>` - Destination VM storage location 108 | 109 | ## VM Information and Configuration 110 | 111 | ### lume ls 112 | List all virtual machines and their status. 113 | 114 | ### lume get <name> 115 | Get detailed information about a specific virtual machine. 116 | 117 | **Options:** 118 | - `-f, --format <format>` - Output format (json|text) 119 | - `--storage <name>` - VM storage location to use 120 | 121 | ### lume set <name> 122 | Modify virtual machine configuration. 123 | 124 | **Options:** 125 | - `--cpu <cores>` - New number of CPU cores (e.g., 4) 126 | - `--memory <size>` - New memory size (e.g., 8192MB or 8GB) 127 | - `--disk-size <size>` - New disk size (e.g., 40960MB or 40GB) 128 | - `--display <res>` - New display resolution in format WIDTHxHEIGHT (e.g., 1024x768) 129 | - `--storage <name>` - VM storage location to use 130 | 131 | **Examples:** 132 | ```bash 133 | # Increase VM memory 134 | lume set my-vm --memory 16GB 135 | 136 | # Change display resolution 137 | lume set my-vm --display 1920x1080 138 | 139 | # Add more CPU cores 140 | lume set my-vm --cpu 8 141 | ``` 142 | 143 | ## Image Management 144 | 145 | ### lume images 146 | List available macOS images in local cache. 147 | 148 | ### lume pull <image> 149 | Download a VM image from a container registry. 
150 | 151 | **Options:** 152 | - `--registry <url>` - Container registry URL (default: ghcr.io) 153 | - `--organization <org>` - Organization to pull from (default: trycua) 154 | - `--storage <name>` - VM storage location to use 155 | 156 | ### lume push <name> <image:tag> 157 | Upload a VM image to a container registry. 158 | 159 | **Options:** 160 | - `--additional-tags <tags...>` - Additional tags to push the same image to 161 | - `--registry <url>` - Container registry URL (default: ghcr.io) 162 | - `--organization <org>` - Organization/user to push to (default: trycua) 163 | - `--storage <name>` - VM storage location to use 164 | - `--chunk-size-mb <size>` - Chunk size for disk image upload in MB (default: 512) 165 | - `--verbose` - Enable verbose logging 166 | - `--dry-run` - Prepare files and show plan without uploading 167 | - `--reassemble` - Verify integrity by reassembling chunks (requires --dry-run) 168 | 169 | ### lume ipsw 170 | Get the latest macOS restore image URL. 171 | 172 | ### lume prune 173 | Remove cached images to free up disk space. 174 | 175 | ## Configuration 176 | 177 | ### lume config 178 | Manage Lume configuration settings. 179 | 180 | **Subcommands:** 181 | 182 | ##### Storage Management 183 | - `lume config storage add <name> <path>` - Add a new VM storage location 184 | - `lume config storage remove <name>` - Remove a VM storage location 185 | - `lume config storage list` - List all VM storage locations 186 | - `lume config storage default <name>` - Set the default VM storage location 187 | 188 | ##### Cache Management 189 | - `lume config cache get` - Get current cache directory 190 | - `lume config cache set <path>` - Set cache directory 191 | 192 | ##### Image Caching 193 | - `lume config caching get` - Show current caching status 194 | - `lume config caching set <boolean>` - Enable or disable image caching 195 | 196 | ## API Server 197 | 198 | ### lume serve 199 | Start the Lume API server for programmatic access. 
200 | 201 | **Options:** 202 | - `--port <port>` - Port to listen on (default: 7777) 203 | 204 | ## Global Options 205 | 206 | These options are available for all commands: 207 | 208 | - `--help` - Show help information 209 | - `--version` - Show version number ``` -------------------------------------------------------------------------------- /docs/content/docs/agent-sdk/agent-loops.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Agent Loops 3 | description: Supported computer-using agent loops and models 4 | --- 5 | 6 | <Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.</Callout> 7 | 8 | An agent can be thought of as a loop - it generates actions, executes them, and repeats until done: 9 | 10 | 1. **Generate**: Your `model` generates `output_text`, `computer_call`, `function_call` 11 | 2. **Execute**: The `computer` safely executes those items 12 | 3. **Complete**: If the model has no more calls, it's done! 
13 | 14 | To run an agent loop simply do: 15 | 16 | ```python 17 | from agent import ComputerAgent 18 | import asyncio 19 | from computer import Computer 20 | 21 | 22 | async def take_screenshot(): 23 | async with Computer( 24 | os_type="linux", 25 | provider_type="cloud", 26 | name="your-sandbox-name", 27 | api_key="your-api-key" 28 | ) as computer: 29 | 30 | agent = ComputerAgent( 31 | model="anthropic/claude-3-5-sonnet-20241022", 32 | tools=[computer], 33 | max_trajectory_budget=5.0 34 | ) 35 | 36 | messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}] 37 | 38 | async for result in agent.run(messages): 39 | for item in result["output"]: 40 | if item["type"] == "message": 41 | print(item["content"][0]["text"]) 42 | 43 | 44 | if __name__ == "__main__": 45 | asyncio.run(take_screenshot()) 46 | ``` 47 | 48 | For a list of supported models and configurations, see the [Supported Agents](./supported-agents/computer-use-agents) page. 49 | 50 | ### Response Format 51 | 52 | ```python 53 | { 54 | "output": [ 55 | { 56 | "type": "message", 57 | "role": "assistant", 58 | "content": [{"type": "output_text", "text": "I can see..."}] 59 | }, 60 | { 61 | "type": "computer_call", 62 | "action": {"type": "screenshot"}, 63 | "call_id": "call_123" 64 | }, 65 | { 66 | "type": "computer_call_output", 67 | "call_id": "call_123", 68 | "output": {"image_url": "data:image/png;base64,..."} 69 | } 70 | ], 71 | "usage": { 72 | "prompt_tokens": 150, 73 | "completion_tokens": 75, 74 | "total_tokens": 225, 75 | "response_cost": 0.01, 76 | } 77 | } 78 | ``` 79 | 80 | ### Environment Variables 81 | 82 | Use the following environment variables to configure the agent and its access to cloud computers and LLM providers: 83 | 84 | ```bash 85 | # Computer instance (cloud) 86 | export CUA_CONTAINER_NAME="your-container-name" 87 | export CUA_API_KEY="your-cua-api-key" 88 | 89 | # LLM API keys 90 | export ANTHROPIC_API_KEY="your-anthropic-key" 91 | export 
OPENAI_API_KEY="your-openai-key" 92 | ``` 93 | 94 | ### Input and output 95 | 96 | The input prompt passed to `Agent.run` can either be a string or a list of message dictionaries: 97 | 98 | ```python 99 | messages = [ 100 | { 101 | "role": "user", 102 | "content": "Take a screenshot and describe what you see" 103 | }, 104 | { 105 | "role": "assistant", 106 | "content": "I'll take a screenshot for you." 107 | } 108 | ] 109 | ``` 110 | 111 | The output is an AsyncGenerator that yields response chunks. 112 | 113 | ### Parameters 114 | 115 | The `ComputerAgent` constructor provides a wide range of options for customizing agent behavior, tool integration, callbacks, resource management, and more. 116 | 117 | - `model` (`str`): Default: **required** 118 | The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided. (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro") 119 | - `tools` (`List[Any]`): 120 | List of tools the agent can use (e.g., `Computer`, sandboxed Python functions, etc.). 121 | - `custom_loop` (`Callable`): 122 | Optional custom agent loop function. If provided, overrides automatic loop selection. 123 | - `only_n_most_recent_images` (`int`): 124 | If set, only the N most recent images are kept in the message history. Useful for limiting memory usage. Automatically adds `ImageRetentionCallback`. 125 | - `callbacks` (`List[Any]`): 126 | List of callback instances for advanced preprocessing, postprocessing, logging, or custom hooks. See [Callbacks & Extensibility](#callbacks--extensibility). 127 | - `verbosity` (`int`): 128 | Logging level (e.g., `logging.INFO`). If set, adds a logging callback. 129 | - `trajectory_dir` (`str`): 130 | Directory path to save full trajectory data, including screenshots and responses. Adds `TrajectorySaverCallback`. 131 | - `max_retries` (`int`): Default: `3` 132 | Maximum number of retries for failed API calls (default: 3). 
133 | - `screenshot_delay` (`float` | `int`): Default: `0.5` 134 | Delay (in seconds) before taking screenshots (default: 0.5). 135 | - `use_prompt_caching` (`bool`): Default: `False` 136 | Enables prompt caching for repeated prompts (mainly for Anthropic models). 137 | - `max_trajectory_budget` (`float` | `dict`): 138 | If set (float or dict), adds a budget manager callback that tracks usage costs and stops execution if the budget is exceeded. Dict allows advanced options (e.g., `{ "max_budget": 5.0, "raise_error": True }`). 139 | - `**kwargs` (`any`): 140 | Any additional keyword arguments are passed through to the agent loop or model provider. 141 | 142 | **Example with advanced options:** 143 | 144 | ```python 145 | from agent import ComputerAgent 146 | from computer import Computer 147 | from agent.callbacks import ImageRetentionCallback 148 | 149 | agent = ComputerAgent( 150 | model="anthropic/claude-3-5-sonnet-20241022", 151 | tools=[Computer(...)], 152 | only_n_most_recent_images=3, 153 | callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)], 154 | verbosity=logging.INFO, 155 | trajectory_dir="trajectories", 156 | max_retries=5, 157 | screenshot_delay=1.0, 158 | use_prompt_caching=True, 159 | max_trajectory_budget={"max_budget": 5.0, "raise_error": True} 160 | ) 161 | ``` 162 | 163 | ### Streaming Responses 164 | 165 | ```python 166 | async for result in agent.run(messages, stream=True): 167 | # Process streaming chunks 168 | for item in result["output"]: 169 | if item["type"] == "message": 170 | print(item["content"][0]["text"], end="", flush=True) 171 | elif item["type"] == "computer_call": 172 | action = item["action"] 173 | print(f"\n[Action: {action['type']}]") 174 | ``` 175 | 176 | ### Error Handling 177 | 178 | ```python 179 | try: 180 | async for result in agent.run(messages): 181 | # Process results 182 | pass 183 | except BudgetExceededException: 184 | print("Budget limit exceeded") 185 | except Exception as e: 186 | print(f"Agent error: 
{e}") 187 | ``` 188 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/proxy/examples.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Example usage of the proxy server and client requests. 3 | """ 4 | import dotenv 5 | dotenv.load_dotenv() 6 | 7 | import asyncio 8 | import json 9 | import os 10 | import aiohttp 11 | from typing import Dict, Any 12 | 13 | 14 | async def test_http_endpoint(): 15 | """Test the HTTP /responses endpoint.""" 16 | 17 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 18 | assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set" 19 | 20 | # Example 1: Simple text request 21 | simple_request = { 22 | "model": "anthropic/claude-3-5-sonnet-20241022", 23 | "input": "Tell me a three sentence bedtime story about a unicorn.", 24 | "env": { 25 | "ANTHROPIC_API_KEY": anthropic_api_key 26 | } 27 | } 28 | 29 | # Example 2: Multi-modal request with image 30 | multimodal_request = { 31 | "model": "anthropic/claude-3-5-sonnet-20241022", 32 | "input": [ 33 | { 34 | "role": "user", 35 | "content": [ 36 | {"type": "input_text", "text": "what is in this image?"}, 37 | { 38 | "type": "input_image", 39 | "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" 40 | } 41 | ] 42 | } 43 | ], 44 | "env": { 45 | "ANTHROPIC_API_KEY": anthropic_api_key 46 | } 47 | } 48 | 49 | # Example 3: Request with custom agent and computer kwargs 50 | custom_request = { 51 | "model": "anthropic/claude-3-5-sonnet-20241022", 52 | "input": "Take a screenshot and tell me what you see", 53 | "env": { 54 | "ANTHROPIC_API_KEY": anthropic_api_key 55 | } 56 | } 57 | 58 | # Test requests 59 | base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443" 60 | # base_url = "http://localhost:8000" 61 | 
api_key = os.getenv("CUA_API_KEY") 62 | assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set" 63 | 64 | async with aiohttp.ClientSession() as session: 65 | for i, request_data in enumerate([ 66 | simple_request, 67 | # multimodal_request, 68 | custom_request 69 | ], 1): 70 | print(f"\n--- Test {i} ---") 71 | print(f"Request: {json.dumps(request_data, indent=2)}") 72 | 73 | try: 74 | print(f"Sending request to {base_url}/responses") 75 | async with session.post( 76 | f"{base_url}/responses", 77 | json=request_data, 78 | headers={"Content-Type": "application/json", "X-API-Key": api_key} 79 | ) as response: 80 | result = await response.json() 81 | print(f"Status: {response.status}") 82 | print(f"Response: {json.dumps(result, indent=2)}") 83 | 84 | except Exception as e: 85 | print(f"Error: {e}") 86 | 87 | 88 | def curl_examples(): 89 | """Print curl command examples.""" 90 | 91 | print("=== CURL Examples ===\n") 92 | 93 | print("1. Simple text request:") 94 | print("""curl http://localhost:8000/responses \\ 95 | -H "Content-Type: application/json" \\ 96 | -d '{ 97 | "model": "anthropic/claude-3-5-sonnet-20241022", 98 | "input": "Tell me a three sentence bedtime story about a unicorn." 99 | }'""") 100 | 101 | print("\n2. Multi-modal request with image:") 102 | print("""curl http://localhost:8000/responses \\ 103 | -H "Content-Type: application/json" \\ 104 | -d '{ 105 | "model": "anthropic/claude-3-5-sonnet-20241022", 106 | "input": [ 107 | { 108 | "role": "user", 109 | "content": [ 110 | {"type": "input_text", "text": "what is in this image?"}, 111 | { 112 | "type": "input_image", 113 | "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" 114 | } 115 | ] 116 | } 117 | ] 118 | }'""") 119 | 120 | print("\n3. 
Request with custom configuration:") 121 | print("""curl http://localhost:8000/responses \\ 122 | -H "Content-Type: application/json" \\ 123 | -d '{ 124 | "model": "anthropic/claude-3-5-sonnet-20241022", 125 | "input": "Take a screenshot and tell me what you see", 126 | "agent_kwargs": { 127 | "save_trajectory": true, 128 | "verbosity": 20 129 | }, 130 | "computer_kwargs": { 131 | "os_type": "linux", 132 | "provider_type": "cloud" 133 | } 134 | }'""") 135 | 136 | 137 | async def test_p2p_client(): 138 | """Example P2P client using peerjs-python.""" 139 | try: 140 | from peerjs import Peer, PeerOptions, ConnectionEventType 141 | from aiortc import RTCConfiguration, RTCIceServer 142 | 143 | # Set up client peer 144 | options = PeerOptions( 145 | host="0.peerjs.com", 146 | port=443, 147 | secure=True, 148 | config=RTCConfiguration( 149 | iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")] 150 | ) 151 | ) 152 | 153 | client_peer = Peer(id="test-client", peer_options=options) 154 | await client_peer.start() 155 | 156 | # Connect to proxy server 157 | connection = client_peer.connect("computer-agent-proxy") 158 | 159 | @connection.on(ConnectionEventType.Open) 160 | async def connection_open(): 161 | print("Connected to proxy server") 162 | 163 | # Send a test request 164 | request = { 165 | "model": "anthropic/claude-3-5-sonnet-20241022", 166 | "input": "Hello from P2P client!" 167 | } 168 | await connection.send(json.dumps(request)) 169 | 170 | @connection.on(ConnectionEventType.Data) 171 | async def connection_data(data): 172 | print(f"Received response: {data}") 173 | await client_peer.destroy() 174 | 175 | # Wait for connection 176 | await asyncio.sleep(10) 177 | 178 | except ImportError: 179 | print("P2P dependencies not available. 
Install peerjs-python for P2P testing.") 180 | except Exception as e: 181 | print(f"P2P test error: {e}") 182 | 183 | 184 | if __name__ == "__main__": 185 | import sys 186 | 187 | if len(sys.argv) > 1 and sys.argv[1] == "curl": 188 | curl_examples() 189 | elif len(sys.argv) > 1 and sys.argv[1] == "p2p": 190 | asyncio.run(test_p2p_client()) 191 | else: 192 | asyncio.run(test_http_endpoint()) 193 | ``` -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/diorama/safezone.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 2 | """ 3 | UI Safezone Helper - A utility to get accurate bounds for macOS UI elements 4 | 5 | This module provides helper functions to get accurate bounds for macOS UI elements 6 | like the menubar and dock, which are needed for proper screenshot composition. 7 | """ 8 | 9 | import sys 10 | import time 11 | from typing import Dict, Any, Optional, Tuple 12 | 13 | # Import Objective-C bridge libraries 14 | try: 15 | import AppKit 16 | from ApplicationServices import ( 17 | AXUIElementCreateSystemWide, 18 | AXUIElementCreateApplication, 19 | AXUIElementCopyAttributeValue, 20 | AXUIElementCopyAttributeValues, 21 | kAXChildrenAttribute, 22 | kAXRoleAttribute, 23 | kAXTitleAttribute, 24 | kAXPositionAttribute, 25 | kAXSizeAttribute, 26 | kAXErrorSuccess, 27 | AXValueGetType, 28 | kAXValueCGSizeType, 29 | kAXValueCGPointType, 30 | AXUIElementGetTypeID, 31 | AXValueGetValue, 32 | kAXMenuBarAttribute, 33 | ) 34 | from AppKit import NSWorkspace, NSRunningApplication 35 | import Foundation 36 | except ImportError: 37 | print("Error: This script requires PyObjC to be installed.") 38 | print("Please install it with: pip install pyobjc") 39 | sys.exit(1) 40 | 41 | # Constants for accessibility API 42 | kAXErrorSuccess = 0 43 | kAXRoleAttribute = "AXRole" 44 | kAXSubroleAttribute = "AXSubrole" 45 | kAXTitleAttribute = 
"AXTitle" 46 | kAXPositionAttribute = "AXPosition" 47 | kAXSizeAttribute = "AXSize" 48 | kAXChildrenAttribute = "AXChildren" 49 | kAXMenuBarAttribute = "AXMenuBar" 50 | 51 | 52 | def element_attribute(element, attribute): 53 | """Get an attribute from an accessibility element""" 54 | if attribute == kAXChildrenAttribute: 55 | err, value = AXUIElementCopyAttributeValues(element, attribute, 0, 999, None) 56 | if err == kAXErrorSuccess: 57 | if isinstance(value, Foundation.NSArray): 58 | return list(value) 59 | else: 60 | return value 61 | err, value = AXUIElementCopyAttributeValue(element, attribute, None) 62 | if err == kAXErrorSuccess: 63 | return value 64 | return None 65 | 66 | 67 | def element_value(element, type): 68 | """Get a value from an accessibility element""" 69 | err, value = AXValueGetValue(element, type, None) 70 | if err == True: 71 | return value 72 | return None 73 | 74 | 75 | def get_element_bounds(element): 76 | """Get the bounds of an accessibility element""" 77 | bounds = { 78 | "x": 0, 79 | "y": 0, 80 | "width": 0, 81 | "height": 0 82 | } 83 | 84 | # Get position 85 | position_value = element_attribute(element, kAXPositionAttribute) 86 | if position_value: 87 | position_value = element_value(position_value, kAXValueCGPointType) 88 | if position_value: 89 | bounds["x"] = position_value.x 90 | bounds["y"] = position_value.y 91 | 92 | # Get size 93 | size_value = element_attribute(element, kAXSizeAttribute) 94 | if size_value: 95 | size_value = element_value(size_value, kAXValueCGSizeType) 96 | if size_value: 97 | bounds["width"] = size_value.width 98 | bounds["height"] = size_value.height 99 | 100 | return bounds 101 | 102 | 103 | def find_dock_process(): 104 | """Find the Dock process""" 105 | running_apps = NSWorkspace.sharedWorkspace().runningApplications() 106 | for app in running_apps: 107 | if app.localizedName() == "Dock" and app.bundleIdentifier() == "com.apple.dock": 108 | return app.processIdentifier() 109 | return None 110 | 111 | 112 
| def get_menubar_bounds(): 113 | """Get the bounds of the macOS menubar 114 | 115 | Returns: 116 | Dictionary with x, y, width, height of the menubar 117 | """ 118 | # Get the system-wide accessibility element 119 | system_element = AXUIElementCreateSystemWide() 120 | 121 | # Try to find the menubar 122 | menubar = element_attribute(system_element, kAXMenuBarAttribute) 123 | if menubar is None: 124 | # If we can't get it directly, try through the frontmost app 125 | frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication() 126 | if frontmost_app: 127 | app_pid = frontmost_app.processIdentifier() 128 | app_element = AXUIElementCreateApplication(app_pid) 129 | menubar = element_attribute(app_element, kAXMenuBarAttribute) 130 | 131 | if menubar is None: 132 | print("Error: Could not get menubar") 133 | # Return default menubar bounds as fallback 134 | return {"x": 0, "y": 0, "width": 1800, "height": 24} 135 | 136 | # Get menubar bounds 137 | return get_element_bounds(menubar) 138 | 139 | 140 | def get_dock_bounds(): 141 | """Get the bounds of the macOS Dock 142 | 143 | Returns: 144 | Dictionary with x, y, width, height of the Dock 145 | """ 146 | dock_pid = find_dock_process() 147 | if dock_pid is None: 148 | print("Error: Could not find Dock process") 149 | # Return empty bounds as fallback 150 | return {"x": 0, "y": 0, "width": 0, "height": 0} 151 | 152 | # Create an accessibility element for the Dock 153 | dock_element = AXUIElementCreateApplication(dock_pid) 154 | if dock_element is None: 155 | print(f"Error: Could not create accessibility element for Dock (PID {dock_pid})") 156 | return {"x": 0, "y": 0, "width": 0, "height": 0} 157 | 158 | # Get the Dock's children 159 | children = element_attribute(dock_element, kAXChildrenAttribute) 160 | if not children or len(children) == 0: 161 | print("Error: Could not get Dock children") 162 | return {"x": 0, "y": 0, "width": 0, "height": 0} 163 | 164 | # Find the Dock's list (first child is usually the main 
dock list) 165 | dock_list = None 166 | for child in children: 167 | role = element_attribute(child, kAXRoleAttribute) 168 | if role == "AXList": 169 | dock_list = child 170 | break 171 | 172 | if dock_list is None: 173 | print("Error: Could not find Dock list") 174 | return {"x": 0, "y": 0, "width": 0, "height": 0} 175 | 176 | # Get the bounds of the dock list 177 | return get_element_bounds(dock_list) 178 | 179 | 180 | def get_ui_element_bounds(): 181 | """Get the bounds of important UI elements like menubar and dock 182 | 183 | Returns: 184 | Dictionary with menubar and dock bounds 185 | """ 186 | menubar_bounds = get_menubar_bounds() 187 | dock_bounds = get_dock_bounds() 188 | 189 | return { 190 | "menubar": menubar_bounds, 191 | "dock": dock_bounds 192 | } 193 | 194 | 195 | if __name__ == "__main__": 196 | # Example usage 197 | bounds = get_ui_element_bounds() 198 | print("Menubar bounds:", bounds["menubar"]) 199 | print("Dock bounds:", bounds["dock"]) 200 | ``` -------------------------------------------------------------------------------- /.github/workflows/pypi-publish-agent.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: Publish Agent Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - "agent-v*" 7 | workflow_dispatch: 8 | inputs: 9 | version: 10 | description: "Version to publish (without v prefix)" 11 | required: true 12 | default: "0.1.0" 13 | workflow_call: 14 | inputs: 15 | version: 16 | description: "Version to publish" 17 | required: true 18 | type: string 19 | 20 | # Adding permissions at workflow level 21 | permissions: 22 | contents: write 23 | 24 | jobs: 25 | prepare: 26 | runs-on: macos-latest 27 | outputs: 28 | version: ${{ steps.get-version.outputs.version }} 29 | computer_version: ${{ steps.update-deps.outputs.computer_version }} 30 | som_version: ${{ steps.update-deps.outputs.som_version }} 31 | core_version: ${{ steps.update-deps.outputs.core_version }} 32 | steps: 33 | - uses: 
actions/checkout@v4 34 | 35 | - name: Determine version 36 | id: get-version 37 | run: | 38 | if [ "${{ github.event_name }}" == "push" ]; then 39 | # Extract version from tag (for package-specific tags) 40 | if [[ "${{ github.ref }}" =~ ^refs/tags/agent-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then 41 | VERSION=${BASH_REMATCH[1]} 42 | else 43 | echo "Invalid tag format for agent" 44 | exit 1 45 | fi 46 | elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then 47 | # Use version from workflow dispatch 48 | VERSION=${{ github.event.inputs.version }} 49 | else 50 | # Use version from workflow_call 51 | VERSION=${{ inputs.version }} 52 | fi 53 | echo "VERSION=$VERSION" 54 | echo "version=$VERSION" >> $GITHUB_OUTPUT 55 | 56 | - name: Set up Python 57 | uses: actions/setup-python@v4 58 | with: 59 | python-version: "3.11" 60 | 61 | - name: Update dependencies to latest versions 62 | id: update-deps 63 | run: | 64 | cd libs/python/agent 65 | 66 | # Install required package for PyPI API access 67 | pip install requests 68 | 69 | # Create a more robust Python script for PyPI version checking 70 | cat > get_latest_versions.py << 'EOF' 71 | import requests 72 | import json 73 | import sys 74 | 75 | def get_package_version(package_name, fallback="0.1.0"): 76 | try: 77 | response = requests.get(f'https://pypi.org/pypi/{package_name}/json') 78 | print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr) 79 | 80 | if response.status_code != 200: 81 | print(f"API request failed for {package_name}, using fallback version", file=sys.stderr) 82 | return fallback 83 | 84 | data = json.loads(response.text) 85 | 86 | if 'info' not in data: 87 | print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr) 88 | return fallback 89 | 90 | return data['info']['version'] 91 | except Exception as e: 92 | print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr) 93 | return fallback 94 | 95 | # Get 
latest versions 96 | print(get_package_version('cua-computer')) 97 | print(get_package_version('cua-som')) 98 | print(get_package_version('cua-core')) 99 | EOF 100 | 101 | # Execute the script to get the versions 102 | VERSIONS=($(python get_latest_versions.py)) 103 | LATEST_COMPUTER=${VERSIONS[0]} 104 | LATEST_SOM=${VERSIONS[1]} 105 | LATEST_CORE=${VERSIONS[2]} 106 | 107 | echo "Latest cua-computer version: $LATEST_COMPUTER" 108 | echo "Latest cua-som version: $LATEST_SOM" 109 | echo "Latest cua-core version: $LATEST_CORE" 110 | 111 | # Output the versions for the next job 112 | echo "computer_version=$LATEST_COMPUTER" >> $GITHUB_OUTPUT 113 | echo "som_version=$LATEST_SOM" >> $GITHUB_OUTPUT 114 | echo "core_version=$LATEST_CORE" >> $GITHUB_OUTPUT 115 | 116 | # Determine major version for version constraint 117 | COMPUTER_MAJOR=$(echo $LATEST_COMPUTER | cut -d. -f1) 118 | SOM_MAJOR=$(echo $LATEST_SOM | cut -d. -f1) 119 | CORE_MAJOR=$(echo $LATEST_CORE | cut -d. -f1) 120 | 121 | NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1)) 122 | NEXT_SOM_MAJOR=$((SOM_MAJOR + 1)) 123 | NEXT_CORE_MAJOR=$((CORE_MAJOR + 1)) 124 | 125 | # Update dependencies in pyproject.toml 126 | if [[ "$OSTYPE" == "darwin"* ]]; then 127 | # macOS version of sed needs an empty string for -i 128 | sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml 129 | sed -i '' "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml 130 | sed -i '' "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml 131 | else 132 | # Linux version 133 | sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml 134 | sed -i "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml 135 | sed -i "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml 136 | fi 137 | 138 | # Display the updated 
dependencies
139 |           echo "Updated dependencies in pyproject.toml:"
140 |           grep -E "cua-computer|cua-som|cua-core" pyproject.toml
141 | 
142 |   publish:
143 |     needs: prepare
144 |     uses: ./.github/workflows/pypi-reusable-publish.yml
145 |     with:
146 |       package_name: "agent"
147 |       package_dir: "libs/python/agent"
148 |       version: ${{ needs.prepare.outputs.version }}
149 |       is_lume_package: false
150 |       base_package_name: "cua-agent"
151 |     secrets:
152 |       PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
153 | 
154 |   set-env-variables:
155 |     needs: [prepare, publish]
156 |     runs-on: macos-latest
157 |     steps:
158 |       - name: Set environment variables for use in other jobs
159 |         run: |
160 |           echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> $GITHUB_ENV
161 |           echo "SOM_VERSION=${{ needs.prepare.outputs.som_version }}" >> $GITHUB_ENV
162 |           echo "CORE_VERSION=${{ needs.prepare.outputs.core_version }}" >> $GITHUB_ENV
163 | ```
--------------------------------------------------------------------------------
/libs/lumier/src/lib/utils.sh:
--------------------------------------------------------------------------------

```bash
1 | #!/usr/bin/env bash
2 | 
3 | # Function to wait for SSH to become available
4 | wait_for_ssh() {
5 |     local host_ip=$1
6 |     local user=$2
7 |     local password=$3
8 |     local retry_interval=${4:-5} # Default retry interval is 5 seconds
9 |     local max_retries=${5:-20} # Default maximum retries is 20 (0 for infinite)
10 | 
11 |     # Only show waiting message in debug mode
12 |     if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
13 |         echo "Waiting for SSH to become available on $host_ip..."
14 |     fi
15 | 
16 |     local retry_count=0
17 |     while true; do
18 |         # Try to connect via SSH
19 |         # Add -q for completely silent operation, redirect stderr to /dev/null
20 |         sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host_ip" "exit" 2>/dev/null
21 | 
22 |         # Check the exit status of the SSH command
23 |         if [ $? -eq 0 ]; then
24 |             echo "SSH is ready on $host_ip!"
25 |             return 0
26 |         fi
27 | 
28 |         # Increment retry count
29 |         ((retry_count++))
30 | 
31 |         # Exit if maximum retries are reached
32 |         if [ $max_retries -ne 0 ] && [ $retry_count -ge $max_retries ]; then
33 |             echo "Maximum retries reached. SSH is not available."
34 |             return 1
35 |         fi
36 | 
37 |         # Only show retry messages in debug mode
38 |         if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
39 |             echo "SSH not ready. Retrying in $retry_interval seconds... (Attempt $retry_count)"
40 |         fi
41 |         sleep $retry_interval
42 |     done
43 | }
44 | 
45 | # Function to execute a script on a remote server using sshpass
46 | execute_remote_script() {
47 |     local host="$1"
48 |     local user="$2"
49 |     local password="$3"
50 |     local script_path="$4"
51 |     local vnc_password="$5"
52 |     local data_folder="$6"
53 | 
54 |     # Check if all required arguments are provided
55 |     if [ -z "$host" ] || [ -z "$user" ] || [ -z "$password" ] || [ -z "$script_path" ] || [ -z "$vnc_password" ]; then
56 |         echo "Usage: execute_remote_script <host> <user> <password> <script_path> <vnc_password> [data_folder]"
57 |         return 1
58 |     fi
59 | 
60 |     # Only show VNC info in debug mode
61 |     if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
62 |         echo "VNC password exported to VM: $vnc_password"
63 |     fi
64 | 
65 |     # Set the shared folder path for the VM
66 |     if [ -n "$data_folder" ]; then
67 |         # VM always sees shared folders at this path, regardless of container path
68 |         shared_folder_path="/Volumes/My Shared Files"
69 | 
70 |         # Only show path in debug mode
71 |         if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
72 |             echo "Data folder path in VM: $shared_folder_path"
73 |         fi
74 |     else
75 |         shared_folder_path=""
76 |     fi
77 | 
78 |     # Read the script content and prepend the shebang.
79 |     # NOTE: use $'\n' (ANSI-C quoting) so a real newline is embedded; a literal
80 |     # "\n" inside double quotes is NOT a newline in bash, which glued every
81 |     # export onto the shebang comment line and silently dropped them remotely.
82 |     script_content="#!/usr/bin/env bash"$'\n'
83 |     # Always export VNC_PASSWORD
84 |     script_content+="export VNC_PASSWORD='$vnc_password'"$'\n'
85 |     # Export SHARED_FOLDER_PATH only if we have a data folder path
86 |     if [ -n "$shared_folder_path" ]; then
87 |         script_content+="export SHARED_FOLDER_PATH='$shared_folder_path'"$'\n'
88 |     fi
89 |     # Pass debug setting to the VM
90 |     script_content+="export VNC_DEBUG='${LUMIER_DEBUG:-0}'"$'\n'
91 | 
92 |     # Add debug messages only if debug mode is enabled
93 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
94 |         script_content+="echo \"[DEBUG] Starting on-logon script execution...\""$'\n'
95 |     fi
96 | 
97 |     # Add the original script content
98 |     script_content+="$(<"$script_path")"
99 | 
100 |     # Add debug messages only if debug mode is enabled
101 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
102 |         script_content+=$'\n'"echo \"[DEBUG] Finished executing on-logon script.\""$'\n'
103 |     fi
104 | 
105 |     # Print debug info only when debug mode is enabled
106 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
107 |         echo "[DEBUG] Executing remote script with content length: $(echo -n "$script_content" | wc -c) bytes"
108 |         echo "[DEBUG] Script path: $script_path"
109 |     fi
110 | 
111 |     # Use a here-document to send the script content
112 |     # We'll capture both stdout and stderr when debug is enabled
113 |     local ssh_status
114 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
115 |         echo "[DEBUG] Connecting to $user@$host to execute script..."
116 |         sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host" "bash -s -- '$vnc_password' '$data_folder'" 2>&1 <<EOF
117 | $script_content
118 | EOF
119 |         ssh_status=$?
120 |     else
121 |         # Otherwise run quietly
122 |         sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host" "bash -s -- '$vnc_password' '$data_folder'" 2>/dev/null <<EOF
123 | $script_content
124 | EOF
125 |         ssh_status=$?
126 |     fi
127 | 
128 |     # Print completion message only in debug mode
129 |     if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
130 |         echo "[DEBUG] Script execution completed."
131 |     fi
132 | 
133 |     # Check the exit status captured right after sshpass; testing $? here
134 |     # would check the debug [[ ]]/echo above instead of the SSH command.
135 |     if [ $ssh_status -ne 0 ]; then
136 |         echo "Failed to execute script on remote host $host."
137 |         return 1
138 |     fi
139 | }
140 | 
141 | extract_json_field() {
142 |     local field_name=$1
143 |     local input=$2
144 |     local result=""
145 | 
146 |     # First attempt with jq if available (most reliable JSON parsing)
147 |     if command -v jq &> /dev/null; then
148 |         # Use jq for reliable JSON parsing
149 |         result=$(echo "$input" | jq -r ".$field_name // empty" 2>/dev/null)
150 |         if [[ -n "$result" ]]; then
151 |             echo "$result"
152 |             return 0
153 |         fi
154 |     fi
155 | 
156 |     # Fallback to grep-based approach with improvements
157 |     # First try for quoted string values
158 |     result=$(echo "$input" | tr -d '\n' | grep -o "\"$field_name\"\s*:\s*\"[^\"]*\"" | sed -E 's/.*":\s*"(.*)"$/\1/')
159 |     if [[ -n "$result" ]]; then
160 |         echo "$result"
161 |         return 0
162 |     fi
163 | 
164 |     # Try for non-quoted values (numbers, true, false, null)
165 |     result=$(echo "$input" | tr -d '\n' | grep -o "\"$field_name\"\s*:\s*[^,}]*" | sed -E 's/.*":\s*(.*)$/\1/')
166 |     if [[ -n "$result" ]]; then
167 |         echo "$result"
168 |         return 0
169 |     fi
170 | 
171 |     # Return empty string if field not found
172 |     echo ""
173 | }
174 | 
175 | extract_json_field_from_file() {
176 |     local field_name=$1
177 |     local json_file=$2
178 |     local json_text
179 |     json_text=$(<"$json_file")
180 |     extract_json_field "$field_name" "$json_text"
181 | }
182 | 
183 | extract_json_field_from_text() {
184 |     local field_name=$1
185 |     local json_text=$2
186 |     extract_json_field "$field_name" "$json_text"
187 | }
188 | ```
--------------------------------------------------------------------------------
/libs/lume/src/FileSystem/VMDirectory.swift:
--------------------------------------------------------------------------------

```swift
1 | import Foundation
2 | 
3 | // MARK: - VMDirectory
4 | 
5 | /// Manages a virtual machine's directory structure and files
6 | /// Responsible for:
7 | /// - Managing VM configuration files
8 | /// - Handling disk operations
9 | 
/// - Managing VM state and locking
10 | /// - Providing access to VM-related paths
11 | struct VMDirectory: Sendable {
12 |     // MARK: - Constants
13 | 
14 |     private enum FileNames {
15 |         static let nvram = "nvram.bin"
16 |         static let disk = "disk.img"
17 |         static let config = "config.json"
18 |         static let sessions = "sessions.json"
19 |     }
20 | 
21 |     // MARK: - Properties
22 | 
23 |     let dir: Path
24 |     let nvramPath: Path
25 |     let diskPath: Path
26 |     let configPath: Path
27 |     let sessionsPath: Path
28 | 
29 |     /// The name of the VM directory
30 |     var name: String { dir.name }
31 | 
32 |     // MARK: - Initialization
33 | 
34 |     /// Creates a new VMDirectory instance
35 |     /// - Parameters:
36 |     ///   - dir: The base directory path for the VM
37 |     init(_ dir: Path) {
38 |         self.dir = dir
39 |         self.nvramPath = dir.file(FileNames.nvram)
40 |         self.diskPath = dir.file(FileNames.disk)
41 |         self.configPath = dir.file(FileNames.config)
42 |         self.sessionsPath = dir.file(FileNames.sessions)
43 |     }
44 | }
45 | 
46 | // MARK: - VM State Management
47 | 
48 | extension VMDirectory {
49 |     /// Checks if the VM directory is fully initialized with all required files
50 |     func initialized() -> Bool {
51 |         // Add detailed logging for debugging
52 |         let configExists = configPath.exists()
53 |         let diskExists = diskPath.exists()
54 |         let nvramExists = nvramPath.exists()
55 | 
56 |         // Logger.info(
57 |         //     "VM directory initialization check",
58 |         //     metadata: [
59 |         //         "directory": dir.path,
60 |         //         "config_path": configPath.path,
61 |         //         "config_exists": "\(configExists)",
62 |         //         "disk_path": diskPath.path,
63 |         //         "disk_exists": "\(diskExists)",
64 |         //         "nvram_path": nvramPath.path,
65 |         //         "nvram_exists": "\(nvramExists)"
66 |         //     ]
67 |         // )
68 | 
69 |         return configExists && diskExists && nvramExists
70 |     }
71 | 
72 |     /// Checks if the VM directory exists
73 |     func exists() -> Bool {
74 |         dir.exists()
75 |     }
76 | }
77 | 
78 | // MARK: - Disk Management
79 | 
80 | extension VMDirectory {
81 |     /// Resizes the VM's disk to the specified size
82 |     /// - Parameter size: The new size in bytes
83 |     /// - Throws: VMDirectoryError if the disk operation fails
84 |     func setDisk(_ size: UInt64) throws {
85 |         // Create the backing file on first use.
86 |         if !diskPath.exists() {
87 |             guard FileManager.default.createFile(atPath: diskPath.path, contents: nil) else {
88 |                 throw VMDirectoryError.fileCreationFailed(diskPath.path)
89 |             }
90 |         }
91 | 
92 |         let handle = try FileHandle(forWritingTo: diskPath.url)
93 |         defer { try? handle.close() }
94 | 
95 |         // Let open/truncate failures propagate; the previous empty `catch`
96 |         // silently swallowed every error, reporting success on a failed resize.
97 |         try handle.truncate(atOffset: size)
98 |     }
99 | }
100 | 
101 | // MARK: - Configuration Management
102 | 
103 | extension VMDirectory {
104 |     /// Saves the VM configuration to disk
105 |     /// - Parameter config: The configuration to save
106 |     /// - Throws: VMDirectoryError if the save operation fails
107 |     func saveConfig(_ config: VMConfig) throws {
108 |         let encoder = JSONEncoder()
109 |         encoder.outputFormatting = .prettyPrinted
110 | 
111 |         // Encode errors map to invalidConfigData; the write is kept outside the
112 |         // catch so a write failure surfaces as fileCreationFailed, not remapped.
113 |         let data: Data
114 |         do { data = try encoder.encode(config) } catch {
115 |             throw VMDirectoryError.invalidConfigData
116 |         }
117 |         guard FileManager.default.createFile(atPath: configPath.path, contents: data) else {
118 |             throw VMDirectoryError.fileCreationFailed(configPath.path)
119 |         }
120 |     }
121 | 
122 |     /// Loads the VM configuration from disk
123 |     /// - Returns: The loaded configuration
124 |     /// - Throws: VMDirectoryError if the load operation fails
125 |     func loadConfig() throws -> VMConfig {
126 |         guard let data = FileManager.default.contents(atPath: configPath.path) else {
127 |             throw VMDirectoryError.configNotFound
128 |         }
129 | 
130 |         do {
131 |             let decoder = JSONDecoder()
132 |             return try decoder.decode(VMConfig.self, from: data)
133 |         } catch {
134 |             throw VMDirectoryError.invalidConfigData
135 |         }
136 |     }
137 | }
138 | 
139 | // MARK: - VNC Session Management
140 | 
141 | struct VNCSession: Codable {
142 |     let url: String
143 |     let sharedDirectories: [SharedDirectory]?
144 | 
145 |     init(url: String, sharedDirectories: [SharedDirectory]? = nil) {
146 |         self.url = url
147 |         self.sharedDirectories = sharedDirectories
148 |     }
149 | }
150 | 
151 | extension VMDirectory {
152 |     /// Saves VNC session information to disk
153 |     /// - Parameters:
154 |     ///   - session: The VNC session to save
155 |     ///   - sharedDirectories: Optional array of shared directories to save with the session
156 |     /// - Throws: VMDirectoryError if the save operation fails
157 |     func saveSession(_ session: VNCSession) throws {
158 |         let encoder = JSONEncoder()
159 |         encoder.outputFormatting = .prettyPrinted
160 | 
161 |         // As in saveConfig: write failures keep their own fileCreationFailed error.
162 |         let data: Data
163 |         do { data = try encoder.encode(session) } catch {
164 |             throw VMDirectoryError.invalidSessionData
165 |         }
166 |         guard FileManager.default.createFile(atPath: sessionsPath.path, contents: data) else {
167 |             throw VMDirectoryError.fileCreationFailed(sessionsPath.path)
168 |         }
169 |     }
170 | 
171 |     /// Loads the VNC session information from disk
172 |     /// - Returns: The loaded VNC session
173 |     /// - Throws: VMDirectoryError if the load operation fails
174 |     func loadSession() throws -> VNCSession {
175 |         guard let data = FileManager.default.contents(atPath: sessionsPath.path) else {
176 |             throw VMDirectoryError.sessionNotFound
177 |         }
178 | 
179 |         do {
180 |             let decoder = JSONDecoder()
181 |             return try decoder.decode(VNCSession.self, from: data)
182 |         } catch {
183 |             throw VMDirectoryError.invalidSessionData
184 |         }
185 |     }
186 | 
187 |     /// Removes the VNC session information from disk
188 |     func clearSession() {
189 |         try?
FileManager.default.removeItem(atPath: sessionsPath.path)
190 |     }
191 | }
192 | 
193 | // MARK: - CustomStringConvertible
194 | extension VMDirectory: CustomStringConvertible {
195 |     var description: String {
196 |         "VMDirectory(path: \(dir.path))"
197 |     }
198 | }
199 | 
200 | extension VMDirectory {
201 |     func delete() throws {
202 |         try FileManager.default.removeItem(atPath: dir.path)
203 |     }
204 | }
205 | ```
--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/ss-pro.py:
--------------------------------------------------------------------------------

```python
1 | #!/usr/bin/env python3
2 | """
3 | ScreenSpot-Pro Benchmark Script
4 | 
5 | Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
6 | Supports both ComputerAgent model strings and custom model classes.
7 | """
8 | 
9 | import argparse
10 | import asyncio
11 | import random
12 | import statistics
13 | import time
14 | from typing import Optional
15 | 
16 | from datasets import load_dataset
17 | from tqdm import tqdm
18 | 
19 | from utils import (
20 |     ModelWrapper,
21 |     is_click_in_bbox,
22 |     save_results_to_markdown,
23 |     save_visualizations,
24 |     get_available_models,
25 |     get_gpu_memory
26 | )
27 | 
28 | 
29 | async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Optional[int] = None) -> dict:
30 |     """
31 |     Evaluate a model on the ScreenSpot-Pro dataset.
32 | 
33 |     Args:
34 |         model_wrapper: ModelWrapper instance
35 |         dataset: ScreenSpot-Pro dataset (list of samples)
36 |         max_samples: Maximum number of samples to evaluate (None for all)
37 | 
38 |     Returns:
39 |         Dictionary with evaluation results
40 |     """
41 |     print(f"\nEvaluating model: {model_wrapper.model_name}")
42 | 
43 |     # Load model
44 |     await model_wrapper.load_model()
45 | 
46 |     total_samples = len(dataset)
47 |     if max_samples is not None:
48 |         total_samples = min(max_samples, total_samples)
49 | 
50 |     correct_predictions = 0
51 |     error_predictions = 0
52 |     results = []
53 | 
54 |     for i in tqdm(range(total_samples), desc=f"Evaluating {model_wrapper.model_name}"):
55 |         sample = dataset[i]
56 | 
57 |         # Extract sample data
58 |         image = sample['image']
59 |         instruction = sample['instruction']
60 |         bbox = sample['bbox']  # [x1, y1, x2, y2]
61 |         sample_id = sample['img_filename']
62 | 
63 |         # Predict click coordinates with timing. A model error is recorded as a
64 |         # failed sample instead of aborting the whole benchmark run; previously
65 |         # error_predictions was never incremented and 'failed' was always False.
66 |         start_time = time.time()
67 |         try:
68 |             click_coords = await model_wrapper.predict_click(image, instruction)
69 |             failed = False
70 |         except Exception as e:
71 |             click_coords = None
72 |             failed = True
73 |             error_predictions += 1
74 |             tqdm.write(f"Prediction failed for {sample_id}: {e}")
75 |         prediction_time = time.time() - start_time
76 | 
77 |         # Check if prediction is correct (a failed prediction is never correct)
78 |         is_correct = (not failed) and is_click_in_bbox(click_coords, bbox)
79 | 
80 |         if is_correct:
81 |             correct_predictions += 1
82 | 
83 |         results.append({
84 |             'id': sample_id,
85 |             'instruction': instruction,
86 |             'bbox': bbox,
87 |             'predicted_coords': click_coords,
88 |             'is_correct': is_correct,
89 |             'failed': failed,
90 |             'prediction_time': prediction_time
91 |         })
92 | 
93 |     # Unload model
94 |     await model_wrapper.unload_model()
95 | 
96 |     # Calculate metrics
97 |     accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0
98 |     error_rate = error_predictions / total_samples if total_samples > 0 else 0.0
99 | 
100 |     # Calculate timing statistics (failed predictions are excluded)
101 |     successful_times = [r['prediction_time'] for r in results if not r['failed']]
102 |     avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
103 |     median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
104 |     min_prediction_time = min(successful_times) if successful_times else 0.0
105 |     max_prediction_time = max(successful_times) if successful_times else 0.0
106 | 
107 |     # Get VRAM statistics
108 |     vram_stats = model_wrapper.get_vram_stats()
109 | 
110 |     return {
111 |         'model_name': model_wrapper.model_name,
112 |         'total_samples': total_samples,
113 |         'correct_predictions': correct_predictions,
114 |         'failed_predictions': error_predictions,
115 |         'accuracy': accuracy,
116 |         'failure_rate': error_rate,
117 |         'avg_prediction_time': avg_prediction_time,
118 |         'median_prediction_time': median_prediction_time,
119 |         'min_prediction_time': min_prediction_time,
120 |         'max_prediction_time': max_prediction_time,
121 |         'vram_max_mb': vram_stats['max_mb'],
122 |         'vram_avg_mb': vram_stats['avg_mb'],
123 |         'results': results
124 |     }
125 | 
126 | 
127 | async def main():
128 |     """
129 |     Main function to run the benchmark.
130 |     """
131 |     # Parse command line arguments
132 |     parser = argparse.ArgumentParser(description='ScreenSpot-Pro Benchmark Script')
133 |     parser.add_argument('--samples', type=int, default=300,
134 |                         help='Number of samples to evaluate (default: 300)')
135 |     parser.add_argument('--seed', type=int, default=42,
136 |                         help='Random seed for shuffling (default: 42)')
137 |     args = parser.parse_args()
138 | 
139 |     # Set random seed
140 |     random.seed(args.seed)
141 | 
142 |     # Load dataset
143 |     print("Loading ScreenSpot-Pro dataset...")
144 |     ds = load_dataset("lmms-lab/ScreenSpot-Pro")
145 |     dataset = ds['train']  # type: ignore
146 |     # Convert to list to support indexing
147 |     dataset_list = list(dataset)
148 |     print(f"Dataset loaded: {len(dataset_list)} samples")
149 | 
150 |     # Shuffle dataset with seed
151 |     random.shuffle(dataset_list)
152 |     print(f"Dataset shuffled with seed {args.seed}")
153 | 
154 |     # Get available models
155 |     models = get_available_models()
156 | 
157 |     # Evaluation settings
158 |     max_samples = args.samples  # Use command line argument
159 | 
160 |     # Run evaluations
161 |     all_results = []
162 | 
163 |     for model in models:
164 |         model_wrapper = ModelWrapper(model)
165 |         result = await evaluate_model(model_wrapper, dataset_list, max_samples)
166 |         all_results.append(result)
167 | 
168 |         # Print summary
169 |         print(f"\n{result['model_name']} Results:")
170 |         print(f"  Accuracy: {result['accuracy']*100:.2f}%")
171 |         print(f"  Correct: {result['correct_predictions']}/{result['total_samples']}")
172 |         print(f"  Errors: {result['failed_predictions']}")
173 |         print(f"  Error Rate: {result['failure_rate']*100:.2f}%")
174 |         print(f"  Avg Time: {result['avg_prediction_time']:.2f}s")
175 |         print(f"  Median Time: {result['median_prediction_time']:.2f}s")
176 |         print(f"  Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
177 |         print(f"  VRAM Max: {result['vram_max_mb']:.1f}MB")
178 |         print(f"  VRAM Avg: {result['vram_avg_mb']:.1f}MB")
179 | 
180 |         # Print GPU memory info
181 |         gpu_memory = get_gpu_memory()
182 |         if gpu_memory and gpu_memory[0] > 0:
183 |             print(f"  GPU Free Memory: {gpu_memory[0]:.1f}MB")
184 | 
185 |     # Save results
186 |     if all_results:
187 |         save_results_to_markdown(all_results)
188 |         save_visualizations(all_results, dataset_list)
189 |         print("\nBenchmark completed successfully!")
190 |     else:
191 |         print("\nNo successful evaluations completed.")
192 | 
193 | 
194 | if __name__ == "__main__":
195 |     asyncio.run(main())
```