This is page 11 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── windows-sandbox.md ├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── 
Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ 
│ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift │ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ 
├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ ├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py 
│ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/human_adapter.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import asyncio 3 | import requests 4 | from typing import List, Dict, Any, Iterator, AsyncIterator 5 | from 
litellm.types.utils import GenericStreamingChunk, ModelResponse 6 | from litellm.llms.custom_llm import CustomLLM 7 | from litellm import completion, acompletion 8 | 9 | 10 | class HumanAdapter(CustomLLM): 11 | """Human Adapter for human-in-the-loop completions. 12 | 13 | This adapter sends completion requests to a human completion server 14 | where humans can review and respond to AI requests. 15 | """ 16 | 17 | def __init__(self, base_url: str | None = None, timeout: float = 300.0, **kwargs): 18 | """Initialize the human adapter. 19 | 20 | Args: 21 | base_url: Base URL for the human completion server. 22 | Defaults to HUMAN_BASE_URL environment variable or http://localhost:8002 23 | timeout: Timeout in seconds for waiting for human response 24 | **kwargs: Additional arguments 25 | """ 26 | super().__init__() 27 | self.base_url = base_url or os.getenv('HUMAN_BASE_URL', 'http://localhost:8002') 28 | self.timeout = timeout 29 | 30 | # Ensure base_url doesn't end with slash 31 | self.base_url = self.base_url.rstrip('/') 32 | 33 | def _queue_completion(self, messages: List[Dict[str, Any]], model: str) -> str: 34 | """Queue a completion request and return the call ID. 35 | 36 | Args: 37 | messages: Messages in OpenAI format 38 | model: Model name 39 | 40 | Returns: 41 | Call ID for tracking the request 42 | 43 | Raises: 44 | Exception: If queueing fails 45 | """ 46 | try: 47 | response = requests.post( 48 | f"{self.base_url}/queue", 49 | json={"messages": messages, "model": model}, 50 | timeout=10 51 | ) 52 | response.raise_for_status() 53 | return response.json()["id"] 54 | except requests.RequestException as e: 55 | raise Exception(f"Failed to queue completion request: {e}") 56 | 57 | def _wait_for_completion(self, call_id: str) -> Dict[str, Any]: 58 | """Wait for human to complete the call. 59 | 60 | Args: 61 | call_id: ID of the queued completion call 62 | 63 | Returns: 64 | Dict containing response and/or tool_calls 65 | 66 | Raises: 67 | TimeoutError: If timeout is exceeded 68 | Exception: If completion fails 69 | """ 70 | import time 71 | 72 | start_time = time.time() 73 | 74 | while True: 75 | try: 76 | # Check status 77 | status_response = requests.get(f"{self.base_url}/status/{call_id}") 78 | status_response.raise_for_status() 79 | status_data = status_response.json() 80 | 81 | if status_data["status"] == "completed": 82 | result = {} 83 | if "response" in status_data and status_data["response"]: 84 | result["response"] = status_data["response"] 85 | if "tool_calls" in status_data and status_data["tool_calls"]: 86 | result["tool_calls"] = status_data["tool_calls"] 87 | return result 88 | elif status_data["status"] == "failed": 89 | error_msg = status_data.get("error", "Unknown error") 90 | raise Exception(f"Completion failed: {error_msg}") 91 | 92 | # Check timeout 93 | if time.time() - start_time > self.timeout: 94 | raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds") 95 | 96 | # Wait before checking again 97 | time.sleep(1.0) 98 | 99 | except requests.RequestException as e: 100 | if time.time() - start_time > self.timeout: 101 | raise TimeoutError(f"Timeout waiting for human response: {e}") 102 | # Continue trying if we haven't timed out 103 | time.sleep(1.0) 104 | 105 | async def _async_wait_for_completion(self, call_id: str) -> Dict[str, Any]: 106 | """Async version of wait_for_completion. 
107 | 108 | Args: 109 | call_id: ID of the queued completion call 110 | 111 | Returns: 112 | Dict containing response and/or tool_calls 113 | 114 | Raises: 115 | TimeoutError: If timeout is exceeded 116 | Exception: If completion fails 117 | """ 118 | import aiohttp 119 | import time 120 | 121 | start_time = time.time() 122 | 123 | async with aiohttp.ClientSession() as session: 124 | while True: 125 | try: 126 | # Check status 127 | async with session.get(f"{self.base_url}/status/{call_id}") as response: 128 | response.raise_for_status() 129 | status_data = await response.json() 130 | 131 | if status_data["status"] == "completed": 132 | result = {} 133 | if "response" in status_data and status_data["response"]: 134 | result["response"] = status_data["response"] 135 | if "tool_calls" in status_data and status_data["tool_calls"]: 136 | result["tool_calls"] = status_data["tool_calls"] 137 | return result 138 | elif status_data["status"] == "failed": 139 | error_msg = status_data.get("error", "Unknown error") 140 | raise Exception(f"Completion failed: {error_msg}") 141 | 142 | # Check timeout 143 | if time.time() - start_time > self.timeout: 144 | raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds") 145 | 146 | # Wait before checking again 147 | await asyncio.sleep(1.0) 148 | 149 | except Exception as e: 150 | if time.time() - start_time > self.timeout: 151 | raise TimeoutError(f"Timeout waiting for human response: {e}") 152 | # Continue trying if we haven't timed out 153 | await asyncio.sleep(1.0) 154 | 155 | def _generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]: 156 | """Generate a human response for the given messages. 157 | 158 | Args: 159 | messages: Messages in OpenAI format 160 | model: Model name 161 | 162 | Returns: 163 | Dict containing response and/or tool_calls 164 | """ 165 | # Queue the completion request 166 | call_id = self._queue_completion(messages, model) 167 | 168 | # Wait for human response 169 | response = self._wait_for_completion(call_id) 170 | 171 | return response 172 | 173 | async def _async_generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]: 174 | """Async version of _generate_response. 175 | 176 | Args: 177 | messages: Messages in OpenAI format 178 | model: Model name 179 | 180 | Returns: 181 | Dict containing response and/or tool_calls 182 | """ 183 | # Queue the completion request (sync operation) 184 | call_id = self._queue_completion(messages, model) 185 | 186 | # Wait for human response (async) 187 | response = await self._async_wait_for_completion(call_id) 188 | 189 | return response 190 | 191 | def completion(self, *args, **kwargs) -> ModelResponse: 192 | """Synchronous completion method. 
193 | 194 | Returns: 195 | ModelResponse with human-generated text or tool calls 196 | """ 197 | messages = kwargs.get('messages', []) 198 | model = kwargs.get('model', 'human') 199 | 200 | # Generate human response 201 | human_response_data = self._generate_response(messages, model) 202 | 203 | # Create ModelResponse with proper structure 204 | from litellm.types.utils import ModelResponse, Choices, Message 205 | import uuid 206 | import time 207 | 208 | # Create message content based on response type 209 | if "tool_calls" in human_response_data and human_response_data["tool_calls"]: 210 | # Tool calls response 211 | message = Message( 212 | role="assistant", 213 | content=human_response_data.get("response", ""), 214 | tool_calls=human_response_data["tool_calls"] 215 | ) 216 | else: 217 | # Text response 218 | message = Message( 219 | role="assistant", 220 | content=human_response_data.get("response", "") 221 | ) 222 | 223 | choice = Choices( 224 | finish_reason="stop", 225 | index=0, 226 | message=message 227 | ) 228 | 229 | result = ModelResponse( 230 | id=f"human-{uuid.uuid4()}", 231 | choices=[choice], 232 | created=int(time.time()), 233 | model=f"human/{model}", 234 | object="chat.completion" 235 | ) 236 | 237 | return result 238 | 239 | async def acompletion(self, *args, **kwargs) -> ModelResponse: 240 | """Asynchronous completion method. 241 | 242 | Returns: 243 | ModelResponse with human-generated text or tool calls 244 | """ 245 | messages = kwargs.get('messages', []) 246 | model = kwargs.get('model', 'human') 247 | 248 | # Generate human response 249 | human_response_data = await self._async_generate_response(messages, model) 250 | 251 | # Create ModelResponse with proper structure 252 | from litellm.types.utils import ModelResponse, Choices, Message 253 | import uuid 254 | import time 255 | 256 | # Create message content based on response type 257 | if "tool_calls" in human_response_data and human_response_data["tool_calls"]: 258 | # Tool calls response 259 | message = Message( 260 | role="assistant", 261 | content=human_response_data.get("response", ""), 262 | tool_calls=human_response_data["tool_calls"] 263 | ) 264 | else: 265 | # Text response 266 | message = Message( 267 | role="assistant", 268 | content=human_response_data.get("response", "") 269 | ) 270 | 271 | choice = Choices( 272 | finish_reason="stop", 273 | index=0, 274 | message=message 275 | ) 276 | 277 | result = ModelResponse( 278 | id=f"human-{uuid.uuid4()}", 279 | choices=[choice], 280 | created=int(time.time()), 281 | model=f"human/{model}", 282 | object="chat.completion" 283 | ) 284 | 285 | return result 286 | 287 | def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: 288 | """Synchronous streaming method. 
289 | 290 | Yields: 291 | Streaming chunks with human-generated text or tool calls 292 | """ 293 | messages = kwargs.get('messages', []) 294 | model = kwargs.get('model', 'human') 295 | 296 | # Generate human response 297 | human_response_data = self._generate_response(messages, model) 298 | 299 | import time 300 | 301 | # Handle tool calls vs text response 302 | if "tool_calls" in human_response_data and human_response_data["tool_calls"]: 303 | # Stream tool calls as a single chunk 304 | generic_chunk: GenericStreamingChunk = { 305 | "finish_reason": "tool_calls", 306 | "index": 0, 307 | "is_finished": True, 308 | "text": human_response_data.get("response", ""), 309 | "tool_use": human_response_data["tool_calls"], 310 | "usage": {"completion_tokens": 1, "prompt_tokens": 0, "total_tokens": 1}, 311 | } 312 | yield generic_chunk 313 | else: 314 | # Stream text response 315 | response_text = human_response_data.get("response", "") 316 | generic_chunk: GenericStreamingChunk = { 317 | "finish_reason": "stop", 318 | "index": 0, 319 | "is_finished": True, 320 | "text": response_text, 321 | "tool_use": None, 322 | "usage": {"completion_tokens": len(response_text.split()), "prompt_tokens": 0, "total_tokens": len(response_text.split())}, 323 | } 324 | yield generic_chunk 325 | 326 | async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: 327 | """Asynchronous streaming method. 328 | 329 | Yields: 330 | Streaming chunks with human-generated text or tool calls 331 | """ 332 | messages = kwargs.get('messages', []) 333 | model = kwargs.get('model', 'human') 334 | 335 | # Generate human response 336 | human_response_data = await self._async_generate_response(messages, model) 337 | response_text = human_response_data.get("response", "") 338 | # Return as a single streaming chunk (mirrors the sync streaming path, including tool calls) 339 | generic_streaming_chunk: GenericStreamingChunk = { 340 | "finish_reason": "tool_calls" if human_response_data.get("tool_calls") else "stop", 341 | "index": 0, 342 | "is_finished": True, 343 | "text": response_text, 344 | "tool_use": human_response_data.get("tool_calls") or None, 345 | "usage": {"completion_tokens": len(response_text.split()), "prompt_tokens": 0, "total_tokens": len(response_text.split())}, 346 | } 347 | 348 | yield generic_streaming_chunk ```
10 | 11 | <div align="center"> 12 | <video src="https://github.com/user-attachments/assets/e213d6c3-73b6-48dd-a7d9-ed761ed74f89" width="600" controls></video> 13 | </div> 14 | 15 | Let’s see how it works. 16 | 17 | ## What You'll Learn 18 | 19 | By the end of this tutorial, you'll be able to: 20 | 21 | - Set up the `@trycua/computer` npm library in any JavaScript/TypeScript project 22 | - Connect OpenAI's computer-use model to C/ua cloud containers from web applications 23 | - Build computer-use agents that work in Node.js, React, Vue, or any web framework 24 | - Handle different types of computer actions (clicking, typing, scrolling) from web code 25 | - Implement the complete computer-use loop in JavaScript/TypeScript 26 | - Integrate AI automation into existing web applications and workflows 27 | 28 | **Prerequisites:** 29 | 30 | - Node.js 16+ and npm/yarn/pnpm 31 | - Basic JavaScript or TypeScript knowledge 32 | - OpenAI API access (Tier 3+ for computer-use-preview) 33 | - C/ua cloud container credits ([get started here](https://trycua.com/pricing)) 34 | 35 | **Estimated Time:** 45-60 minutes 36 | 37 | ## Access Requirements 38 | 39 | ### OpenAI Model Availability 40 | 41 | At the time of writing, the **computer-use-preview** model has limited availability: 42 | 43 | - Only accessible to OpenAI tier 3+ users 44 | - Additional application process may be required even for eligible users 45 | - Cannot be used in the OpenAI Playground 46 | - Outside of ChatGPT Operator, usage is restricted to the new **Responses API** 47 | 48 | Luckily, the `@trycua/computer` library can be used in conjunction with other models, like [Anthropic’s Computer Use](https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool) or [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B). You’ll just have to write your own handler to parse the model output for interfacing with the container. 49 | 50 | ### C/ua Cloud Containers 51 | 52 | To follow this guide, you’ll need access to a C/ua cloud container. 53 | 54 | Getting access is simple: purchase credits from our [pricing page](https://trycua.com/pricing), then create and provision a new container instance from the [dashboard](https://trycua.com/dashboard/containers). With your container running, you'll be ready to leverage the web SDK and bring automation to your JavaScript or TypeScript applications. 55 | 56 | ## Understanding the Flow 57 | 58 | ### OpenAI API Overview 59 | 60 | Let's start with the basics. In our case, we'll use OpenAI's API to communicate with their computer-use model. 61 | 62 | Think of it like this: 63 | 64 | 1. We send the model a screenshot of our container and tell it what we want it to do 65 | 2. The model looks at the screenshot and decides what actions to take 66 | 3. It sends back instructions (like "click here" or "type this") 67 | 4. We execute those instructions in our container. 
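
In code, that feedback loop has a very small skeleton. The sketch below is illustrative only: `runAgentLoop`, `takeScreenshot`, `askModel`, and `executeAction` are hypothetical placeholders, and the real versions (built on `@trycua/computer` and the OpenAI Responses API) are implemented step by step in the rest of this post.

```javascript
// Illustrative skeleton of the computer-use loop described above.
// takeScreenshot, askModel, and executeAction are hypothetical placeholders;
// the real implementations appear later in this post.
async function runAgentLoop(task) {
  // 1. Capture the container state and tell the model what we want
  let screenshot = await takeScreenshot();
  // 2. The model looks at the screenshot and decides what to do
  let response = await askModel(task, screenshot);

  // 3 + 4. Keep executing the returned actions until the model is done
  while (response.actions.length > 0) {
    for (const action of response.actions) {
      await executeAction(action); // e.g. click, type, scroll
    }
    screenshot = await takeScreenshot();
    response = await askModel(task, screenshot);
  }
}
```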
68 | 69 | ### Model Setup 70 | 71 | Here's how we set up the computer-use model for web development: 72 | 73 | ```javascript 74 | const res = await openai.responses.create({ 75 | model: 'computer-use-preview', 76 | tools: [ 77 | { 78 | type: 'computer_use_preview', 79 | display_width: 1024, 80 | display_height: 768, 81 | environment: 'linux', // we're using a linux container 82 | }, 83 | ], 84 | input: [ 85 | { 86 | role: 'user', 87 | content: [ 88 | // what we want the ai to do 89 | { type: 'input_text', text: 'Open firefox and go to trycua.com' }, 90 | // first screenshot of the vm 91 | { 92 | type: 'input_image', 93 | image_url: `data:image/png;base64,${screenshotBase64}`, 94 | detail: 'auto', 95 | }, 96 | ], 97 | }, 98 | ], 99 | truncation: 'auto' 100 | }); 101 | ``` 102 | 103 | ### Understanding the Response 104 | 105 | When we send a request, the API sends back a response that looks like this: 106 | 107 | ```json 108 | "output": [ 109 | { 110 | "type": "reasoning", // The AI explains what it's thinking 111 | "id": "rs_67cc...", 112 | "summary": [ 113 | { 114 | "type": "summary_text", 115 | "text": "Clicking on the browser address bar." 116 | } 117 | ] 118 | }, 119 | { 120 | "type": "computer_call", // The actual action to perform 121 | "id": "cu_67cc...", 122 | "call_id": "call_zw3...", // Used to track previous calls 123 | "action": { 124 | "type": "click", // What kind of action (click, type, etc.) 125 | "button": "left", // Which mouse button to use 126 | "x": 156, // Where to click (coordinates) 127 | "y": 50 128 | }, 129 | "pending_safety_checks": [], // Any safety warnings to consider 130 | "status": "completed" // Whether the action was successful 131 | } 132 | ] 133 | 134 | ``` 135 | 136 | Each response contains: 137 | 138 | 1. **Reasoning**: The AI's explanation of what it's doing 139 | 2. **Action**: The specific computer action to perform 140 | 3. **Safety Checks**: Any potential risks to review 141 | 4. **Status**: Whether everything worked as planned 142 | 143 | ## Implementation Guide 144 | 145 | ### Provision a C/ua Cloud Container 146 | 147 | 1. Visit [trycua.com](https://trycua.com), sign up, purchase [credits](https://trycua.com/pricing), and create a new container instance from the [dashboard](https://trycua.com/dashboard). 148 | 2. Create an API key from the dashboard — be sure to save it in a secure location before continuing. 149 | 3. Start the cloud container from the dashboard. 150 | 151 | ### Environment Setup 152 | 153 | 1. Install required packages with your preferred package manager: 154 | 155 | ```bash 156 | npm install --save @trycua/computer # or yarn, pnpm, bun 157 | npm install --save openai # or yarn, pnpm, bun 158 | ``` 159 | 160 | Works with any JavaScript/TypeScript project setup - whether you're using Create React App, Next.js, Vue, Angular, or plain JavaScript. 161 | 162 | 2. Save your OpenAI API key, C/ua API key, and container name to a `.env` file: 163 | 164 | ```bash 165 | OPENAI_API_KEY=openai-api-key 166 | CUA_API_KEY=cua-api-key 167 | CUA_CONTAINER_NAME=cua-cloud-container-name 168 | ``` 169 | 170 | These environment variables work the same whether you're using vanilla JavaScript, TypeScript, or any web framework. 171 | 172 | ## Building the Agent 173 | 174 | ### Mapping API Actions to `@trycua/computer` Interface Methods 175 | 176 | This helper function handles a `computer_call` action from the OpenAI API — converting the action into an equivalent action from the `@trycua/computer` interface. 
These actions will execute on the initialized `Computer` instance. For example, `await computer.interface.leftClick()` sends a mouse left click to the current cursor position. 177 | 178 | Whether you're using JavaScript or TypeScript, the interface remains the same: 179 | 180 | ```javascript 181 | export async function executeAction( 182 | computer: Computer, 183 | action: OpenAI.Responses.ResponseComputerToolCall['action'] 184 | ) { 185 | switch (action.type) { 186 | case 'click': 187 | const { x, y, button } = action; 188 | console.log(`Executing click at (${x}, ${y}) with button '${button}'.`); 189 | await computer.interface.moveCursor(x, y); 190 | if (button === 'right') await computer.interface.rightClick(); 191 | else await computer.interface.leftClick(); 192 | break; 193 | case 'type': 194 | const { text } = action; 195 | console.log(`Typing text: ${text}`); 196 | await computer.interface.typeText(text); 197 | break; 198 | case 'scroll': 199 | const { x: locX, y: locY, scroll_x, scroll_y } = action; 200 | console.log( 201 | `Scrolling at (${locX}, ${locY}) with offsets (scroll_x=${scroll_x}, scroll_y=${scroll_y}).` 202 | ); 203 | await computer.interface.moveCursor(locX, locY); 204 | await computer.interface.scroll(scroll_x, scroll_y); 205 | break; 206 | case 'keypress': 207 | const { keys } = action; 208 | for (const key of keys) { 209 | console.log(`Pressing key: ${key}.`); 210 | // Map common key names to CUA equivalents 211 | if (key.toLowerCase() === 'enter') { 212 | await computer.interface.pressKey('return'); 213 | } else if (key.toLowerCase() === 'space') { 214 | await computer.interface.pressKey('space'); 215 | } else { 216 | await computer.interface.pressKey(key); 217 | } 218 | } 219 | break; 220 | case 'wait': 221 | console.log(`Waiting for 3 seconds.`); 222 | await new Promise((resolve) => setTimeout(resolve, 3 * 1000)); 223 | break; 224 | case 'screenshot': 225 | console.log('Taking screenshot.'); 226 | // This is handled automatically in the main loop, but we can take an extra one if requested 227 | const screenshot = await computer.interface.screenshot(); 228 | return screenshot; 229 | default: 230 | console.log(`Unrecognized action: ${action.type}`); 231 | break; 232 | } 233 | } 234 | ``` 235 | 236 | ### Implementing the Computer-Use Loop 237 | 238 | This section defines a loop that: 239 | 240 | 1. Initializes the `Computer` instance (connecting to a Linux cloud container). 241 | 2. Captures a screenshot of the current state. 242 | 3. Sends the screenshot (with a user prompt) to the OpenAI Responses API using the `computer-use-preview` model. 243 | 4. Processes the returned `computer_call` action and executes it using our helper function. 244 | 5. Captures an updated screenshot after the action. 245 | 6. Send the updated screenshot and loops until no more actions are returned. 
246 | 247 | ```javascript 248 | const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); 249 | 250 | // Initialize the Computer Connection 251 | const computer = new Computer({ 252 | apiKey: process.env.CUA_API_KEY!, 253 | name: process.env.CUA_CONTAINER_NAME!, 254 | osType: OSType.LINUX, 255 | }); 256 | 257 | await computer.run(); 258 | // Take the initial screenshot 259 | const screenshot = await computer.interface.screenshot(); 260 | const screenshotBase64 = screenshot.toString('base64'); 261 | 262 | // Setup openai config for computer use 263 | const computerUseConfig: OpenAI.Responses.ResponseCreateParamsNonStreaming = { 264 | model: 'computer-use-preview', 265 | tools: [ 266 | { 267 | type: 'computer_use_preview', 268 | display_width: 1024, 269 | display_height: 768, 270 | environment: 'linux', // we're using a linux vm 271 | }, 272 | ], 273 | truncation: 'auto', 274 | }; 275 | 276 | // Send initial screenshot to the openai computer use model 277 | let res = await openai.responses.create({ 278 | ...computerUseConfig, 279 | input: [ 280 | { 281 | role: 'user', 282 | content: [ 283 | // what we want the ai to do 284 | { type: 'input_text', text: 'open firefox and go to trycua.com' }, 285 | // current screenshot of the vm 286 | { 287 | type: 'input_image', 288 | image_url: `data:image/png;base64,${screenshotBase64}`, 289 | detail: 'auto', 290 | }, 291 | ], 292 | }, 293 | ], 294 | }); 295 | 296 | // Loop until there are no more computer use actions. 297 | while (true) { 298 | const computerCalls = res.output.filter((o) => o.type === 'computer_call'); 299 | if (computerCalls.length < 1) { 300 | console.log('No more computer calls. Loop complete.'); 301 | break; 302 | } 303 | // Get the first call 304 | const call = computerCalls[0]; 305 | const action = call.action; 306 | console.log('Received action from OpenAI Responses API:', action); 307 | let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] = 308 | []; 309 | if (call.pending_safety_checks.length > 0) { 310 | console.log('Safety checks pending:', call.pending_safety_checks); 311 | // In a real implementation, you would want to get user confirmation here. 312 | ackChecks = call.pending_safety_checks; 313 | } 314 | 315 | // Execute the action in the container 316 | await executeAction(computer, action); 317 | // Wait for changes to process within the container (1sec) 318 | await new Promise((resolve) => setTimeout(resolve, 1000)); 319 | 320 | // Capture new screenshot 321 | const newScreenshot = await computer.interface.screenshot(); 322 | const newScreenshotBase64 = newScreenshot.toString('base64'); 323 | 324 | // Screenshot back as computer_call_output 325 | 326 | res = await openai.responses.create({ 327 | ...computerUseConfig, 328 | previous_response_id: res.id, 329 | input: [ 330 | { 331 | type: 'computer_call_output', 332 | call_id: call.call_id, 333 | acknowledged_safety_checks: ackChecks, 334 | output: { 335 | type: 'computer_screenshot', 336 | image_url: `data:image/png;base64,${newScreenshotBase64}`, 337 | }, 338 | }, 339 | ], 340 | }); 341 | } 342 | ``` 343 | 344 | You can find the full example on [GitHub](https://github.com/trycua/cua/tree/main/examples/computer-example-ts). 345 | 346 | ## What's Next? 347 | 348 | The `@trycua/computer` Web SDK opens up some interesting possibilities. You could build browser-based testing tools, create interactive demos for your products, or automate repetitive workflows directly from your web apps. 
349 | 350 | We're working on more examples and better documentation - if you build something cool with this SDK, we'd love to see it. Drop by our [Discord](https://discord.gg/cua-ai) and share what you're working on. 351 | 352 | Happy automating on the web! 353 | ``` -------------------------------------------------------------------------------- /tests/test_mcp_server_session_management.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Tests for MCP Server Session Management functionality. 3 | 4 | This module tests the new concurrent session management and resource lifecycle features. 5 | """ 6 | 7 | import asyncio 8 | import importlib.util 9 | import sys 10 | import types 11 | import time 12 | from pathlib import Path 13 | 14 | import pytest 15 | 16 | 17 | def _install_stub_module(name: str, module: types.ModuleType, registry: dict[str, types.ModuleType | None]) -> None: 18 | registry[name] = sys.modules.get(name) 19 | sys.modules[name] = module 20 | 21 | 22 | @pytest.fixture 23 | def server_module(): 24 | """Create a server module with stubbed dependencies for testing.""" 25 | stubbed_modules: dict[str, types.ModuleType | None] = {} 26 | 27 | # Stub MCP Context primitives 28 | mcp_module = types.ModuleType("mcp") 29 | mcp_module.__path__ = [] # mark as package 30 | 31 | mcp_server_module = types.ModuleType("mcp.server") 32 | mcp_server_module.__path__ = [] 33 | 34 | fastmcp_module = types.ModuleType("mcp.server.fastmcp") 35 | 36 | class _StubContext: 37 | async def yield_message(self, *args, **kwargs): 38 | return None 39 | 40 | async def yield_tool_call(self, *args, **kwargs): 41 | return None 42 | 43 | async def yield_tool_output(self, *args, **kwargs): 44 | return None 45 | 46 | def report_progress(self, *_args, **_kwargs): 47 | return None 48 | 49 | def info(self, *_args, **_kwargs): 50 | return None 51 | 52 | def error(self, *_args, **_kwargs): 53 | return None 54 | 55 | class _StubImage: 56 | def __init__(self, format: str, data: bytes): 57 | self.format = format 58 | self.data = data 59 | 60 | class _StubFastMCP: 61 | def __init__(self, name: str): 62 | self.name = name 63 | self._tools: dict[str, types.FunctionType] = {} 64 | 65 | def tool(self, *args, **kwargs): 66 | def decorator(func): 67 | self._tools[func.__name__] = func 68 | return func 69 | 70 | return decorator 71 | 72 | def run(self): 73 | return None 74 | 75 | fastmcp_module.Context = _StubContext 76 | fastmcp_module.FastMCP = _StubFastMCP 77 | fastmcp_module.Image = _StubImage 78 | 79 | _install_stub_module("mcp", mcp_module, stubbed_modules) 80 | _install_stub_module("mcp.server", mcp_server_module, stubbed_modules) 81 | _install_stub_module("mcp.server.fastmcp", fastmcp_module, stubbed_modules) 82 | 83 | # Stub Computer module 84 | computer_module = types.ModuleType("computer") 85 | 86 | class _StubInterface: 87 | async def screenshot(self) -> bytes: 88 | return b"test-screenshot-data" 89 | 90 | class _StubComputer: 91 | def __init__(self, *args, **kwargs): 92 | self.interface = _StubInterface() 93 | 94 | async def run(self): 95 | return None 96 | 97 | computer_module.Computer = _StubComputer 98 | 99 | _install_stub_module("computer", computer_module, stubbed_modules) 100 | 101 | # Stub agent module 102 | agent_module = types.ModuleType("agent") 103 | 104 | class _StubComputerAgent: 105 | def __init__(self, *args, **kwargs): 106 | pass 107 | 108 | async def run(self, *_args, **_kwargs): 109 | # Simulate agent execution with streaming 110 | yield { 111 | 
"output": [ 112 | { 113 | "type": "message", 114 | "role": "assistant", 115 | "content": [{"type": "output_text", "text": "Task completed"}] 116 | } 117 | ] 118 | } 119 | 120 | agent_module.ComputerAgent = _StubComputerAgent 121 | 122 | _install_stub_module("agent", agent_module, stubbed_modules) 123 | 124 | # Stub session manager module 125 | session_manager_module = types.ModuleType("mcp_server.session_manager") 126 | 127 | class _StubSessionInfo: 128 | def __init__(self, session_id: str, computer, created_at: float, last_activity: float): 129 | self.session_id = session_id 130 | self.computer = computer 131 | self.created_at = created_at 132 | self.last_activity = last_activity 133 | self.active_tasks = set() 134 | self.is_shutting_down = False 135 | 136 | class _StubSessionManager: 137 | def __init__(self): 138 | self._sessions = {} 139 | self._session_lock = asyncio.Lock() 140 | 141 | async def get_session(self, session_id=None): 142 | """Context manager that returns a session.""" 143 | if session_id is None: 144 | session_id = "test-session-123" 145 | 146 | async with self._session_lock: 147 | if session_id not in self._sessions: 148 | computer = _StubComputer() 149 | session = _StubSessionInfo( 150 | session_id=session_id, 151 | computer=computer, 152 | created_at=time.time(), 153 | last_activity=time.time() 154 | ) 155 | self._sessions[session_id] = session 156 | 157 | return self._sessions[session_id] 158 | 159 | async def register_task(self, session_id: str, task_id: str): 160 | pass 161 | 162 | async def unregister_task(self, session_id: str, task_id: str): 163 | pass 164 | 165 | async def cleanup_session(self, session_id: str): 166 | async with self._session_lock: 167 | self._sessions.pop(session_id, None) 168 | 169 | def get_session_stats(self): 170 | return { 171 | "total_sessions": len(self._sessions), 172 | "max_concurrent": 10, 173 | "sessions": {sid: {"active_tasks": 0} for sid in self._sessions} 174 | } 175 | 176 | _stub_session_manager = _StubSessionManager() 177 | 178 | def get_session_manager(): 179 | return _stub_session_manager 180 | 181 | async def initialize_session_manager(): 182 | return _stub_session_manager 183 | 184 | async def shutdown_session_manager(): 185 | pass 186 | 187 | session_manager_module.get_session_manager = get_session_manager 188 | session_manager_module.initialize_session_manager = initialize_session_manager 189 | session_manager_module.shutdown_session_manager = shutdown_session_manager 190 | 191 | _install_stub_module("mcp_server.session_manager", session_manager_module, stubbed_modules) 192 | 193 | # Load the actual server module 194 | module_name = "mcp_server_server_under_test" 195 | module_path = Path("libs/python/mcp-server/mcp_server/server.py").resolve() 196 | spec = importlib.util.spec_from_file_location(module_name, module_path) 197 | server_module = importlib.util.module_from_spec(spec) 198 | assert spec and spec.loader 199 | spec.loader.exec_module(server_module) 200 | 201 | server_instance = getattr(server_module, "server", None) 202 | if server_instance is not None and hasattr(server_instance, "_tools"): 203 | for name, func in server_instance._tools.items(): 204 | setattr(server_module, name, func) 205 | 206 | try: 207 | yield server_module 208 | finally: 209 | sys.modules.pop(module_name, None) 210 | for name, original in stubbed_modules.items(): 211 | if original is None: 212 | sys.modules.pop(name, None) 213 | else: 214 | sys.modules[name] = original 215 | 216 | 217 | class FakeContext: 218 | """Fake context for testing.""" 
219 | 220 | def __init__(self) -> None: 221 | self.events: list[tuple] = [] 222 | self.progress_updates: list[float] = [] 223 | 224 | def info(self, message: str) -> None: 225 | self.events.append(("info", message)) 226 | 227 | def error(self, message: str) -> None: 228 | self.events.append(("error", message)) 229 | 230 | def report_progress(self, value: float) -> None: 231 | self.progress_updates.append(value) 232 | 233 | async def yield_message(self, *, role: str, content): 234 | timestamp = asyncio.get_running_loop().time() 235 | self.events.append(("message", role, content, timestamp)) 236 | 237 | async def yield_tool_call(self, *, name: str | None, call_id: str, input): 238 | timestamp = asyncio.get_running_loop().time() 239 | self.events.append(("tool_call", name, call_id, input, timestamp)) 240 | 241 | async def yield_tool_output(self, *, call_id: str, output, is_error: bool = False): 242 | timestamp = asyncio.get_running_loop().time() 243 | self.events.append(("tool_output", call_id, output, is_error, timestamp)) 244 | 245 | 246 | def test_screenshot_cua_with_session_id(server_module): 247 | """Test that screenshot_cua works with session management.""" 248 | async def _run_test(): 249 | ctx = FakeContext() 250 | result = await server_module.screenshot_cua(ctx, session_id="test-session") 251 | 252 | assert result.format == "png" 253 | assert result.data == b"test-screenshot-data" 254 | 255 | asyncio.run(_run_test()) 256 | 257 | 258 | def test_screenshot_cua_creates_new_session(server_module): 259 | """Test that screenshot_cua creates a new session when none provided.""" 260 | async def _run_test(): 261 | ctx = FakeContext() 262 | result = await server_module.screenshot_cua(ctx) 263 | 264 | assert result.format == "png" 265 | assert result.data == b"test-screenshot-data" 266 | 267 | asyncio.run(_run_test()) 268 | 269 | 270 | def test_run_cua_task_with_session_management(server_module): 271 | """Test that run_cua_task works with session management.""" 272 | async def _run_test(): 273 | ctx = FakeContext() 274 | task = "Test task" 275 | session_id = "test-session-456" 276 | 277 | text_result, image = await server_module.run_cua_task(ctx, task, session_id) 278 | 279 | assert "Task completed" in text_result 280 | assert image.format == "png" 281 | assert image.data == b"test-screenshot-data" 282 | 283 | asyncio.run(_run_test()) 284 | 285 | 286 | def test_run_multi_cua_tasks_sequential(server_module): 287 | """Test that run_multi_cua_tasks works sequentially.""" 288 | async def _run_test(): 289 | ctx = FakeContext() 290 | tasks = ["Task 1", "Task 2", "Task 3"] 291 | 292 | results = await server_module.run_multi_cua_tasks(ctx, tasks, concurrent=False) 293 | 294 | assert len(results) == 3 295 | for i, (text, image) in enumerate(results): 296 | assert "Task completed" in text 297 | assert image.format == "png" 298 | 299 | asyncio.run(_run_test()) 300 | 301 | 302 | def test_run_multi_cua_tasks_concurrent(server_module): 303 | """Test that run_multi_cua_tasks works concurrently.""" 304 | async def _run_test(): 305 | ctx = FakeContext() 306 | tasks = ["Task 1", "Task 2", "Task 3"] 307 | 308 | results = await server_module.run_multi_cua_tasks(ctx, tasks, concurrent=True) 309 | 310 | assert len(results) == 3 311 | for i, (text, image) in enumerate(results): 312 | assert "Task completed" in text 313 | assert image.format == "png" 314 | 315 | asyncio.run(_run_test()) 316 | 317 | 318 | def test_get_session_stats(server_module): 319 | """Test that get_session_stats returns proper statistics.""" 320 | 
async def _run_test(): 321 | ctx = FakeContext() 322 | stats = await server_module.get_session_stats() 323 | 324 | assert "total_sessions" in stats 325 | assert "max_concurrent" in stats 326 | assert "sessions" in stats 327 | 328 | asyncio.run(_run_test()) 329 | 330 | 331 | def test_cleanup_session(server_module): 332 | """Test that cleanup_session works properly.""" 333 | async def _run_test(): 334 | ctx = FakeContext() 335 | session_id = "test-cleanup-session" 336 | 337 | result = await server_module.cleanup_session(ctx, session_id) 338 | 339 | assert f"Session {session_id} cleanup initiated" in result 340 | 341 | asyncio.run(_run_test()) 342 | 343 | 344 | def test_concurrent_sessions_isolation(server_module): 345 | """Test that concurrent sessions are properly isolated.""" 346 | async def _run_test(): 347 | ctx = FakeContext() 348 | 349 | # Run multiple tasks with different session IDs concurrently 350 | task1 = asyncio.create_task( 351 | server_module.run_cua_task(ctx, "Task for session 1", "session-1") 352 | ) 353 | task2 = asyncio.create_task( 354 | server_module.run_cua_task(ctx, "Task for session 2", "session-2") 355 | ) 356 | 357 | results = await asyncio.gather(task1, task2) 358 | 359 | assert len(results) == 2 360 | for text, image in results: 361 | assert "Task completed" in text 362 | assert image.format == "png" 363 | 364 | asyncio.run(_run_test()) 365 | 366 | 367 | def test_session_reuse_with_same_id(server_module): 368 | """Test that sessions are reused when the same session ID is provided.""" 369 | async def _run_test(): 370 | ctx = FakeContext() 371 | session_id = "reuse-session" 372 | 373 | # First call 374 | result1 = await server_module.screenshot_cua(ctx, session_id) 375 | 376 | # Second call with same session ID 377 | result2 = await server_module.screenshot_cua(ctx, session_id) 378 | 379 | assert result1.format == result2.format 380 | assert result1.data == result2.data 381 | 382 | asyncio.run(_run_test()) 383 | 384 | 385 | def test_error_handling_with_session_management(server_module): 386 | """Test that errors are handled properly with session management.""" 387 | async def _run_test(): 388 | # Mock an agent that raises an exception 389 | class _FailingAgent: 390 | def __init__(self, *args, **kwargs): 391 | pass 392 | 393 | async def run(self, *_args, **_kwargs): 394 | raise RuntimeError("Simulated agent failure") 395 | 396 | # Replace the ComputerAgent with our failing one 397 | original_agent = server_module.ComputerAgent 398 | server_module.ComputerAgent = _FailingAgent 399 | 400 | try: 401 | ctx = FakeContext() 402 | task = "This will fail" 403 | 404 | text_result, image = await server_module.run_cua_task(ctx, task, "error-session") 405 | 406 | assert "Error during task execution" in text_result 407 | assert image.format == "png" 408 | 409 | finally: 410 | # Restore original agent 411 | server_module.ComputerAgent = original_agent 412 | 413 | asyncio.run(_run_test()) 414 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/loops/gemini.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Gemini 2.5 Computer Use agent loop 3 | 4 | Maps internal Agent SDK message format to Google's Gemini Computer Use API and back. 
5 | 6 | Key features: 7 | - Lazy import of google.genai 8 | - Configure Computer Use tool with excluded browser-specific predefined functions 9 | - Optional custom function declarations hook for computer-call specific functions 10 | - Convert Gemini function_call parts into internal computer_call actions 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import base64 16 | import io 17 | import uuid 18 | from typing import Any, Dict, List, Optional, Tuple 19 | 20 | from PIL import Image 21 | 22 | from ..decorators import register_agent 23 | from ..loops.base import AsyncAgentConfig 24 | from ..types import AgentCapability 25 | 26 | 27 | def _lazy_import_genai(): 28 | """Import google.genai lazily to avoid hard dependency unless used.""" 29 | try: 30 | from google import genai # type: ignore 31 | from google.genai import types # type: ignore 32 | return genai, types 33 | except Exception as e: # pragma: no cover 34 | raise RuntimeError( 35 | "google.genai is required for the Gemini Computer Use loop. Install the Google Gemini SDK." 36 | ) from e 37 | 38 | 39 | def _data_url_to_bytes(data_url: str) -> Tuple[bytes, str]: 40 | """Convert a data URL to raw bytes and mime type.""" 41 | if not data_url.startswith("data:"): 42 | # Assume it's base64 png payload 43 | try: 44 | return base64.b64decode(data_url), "image/png" 45 | except Exception: 46 | return b"", "application/octet-stream" 47 | header, b64 = data_url.split(",", 1) 48 | mime = "image/png" 49 | if ";" in header: 50 | mime = header.split(";")[0].split(":", 1)[1] or "image/png" 51 | return base64.b64decode(b64), mime 52 | 53 | 54 | def _bytes_image_size(img_bytes: bytes) -> Tuple[int, int]: 55 | try: 56 | img = Image.open(io.BytesIO(img_bytes)) 57 | return img.size 58 | except Exception: 59 | return (1024, 768) 60 | 61 | 62 | def _find_last_user_text(messages: List[Dict[str, Any]]) -> List[str]: 63 | texts: List[str] = [] 64 | for msg in reversed(messages): 65 | if msg.get("type") in (None, "message") and msg.get("role") == "user": 66 | content = msg.get("content") 67 | if isinstance(content, str): 68 | return [content] 69 | elif isinstance(content, list): 70 | for c in content: 71 | if c.get("type") in ("input_text", "output_text") and c.get("text"): 72 | texts.append(c["text"]) # newest first 73 | if texts: 74 | return list(reversed(texts)) 75 | return [] 76 | 77 | 78 | def _find_last_screenshot(messages: List[Dict[str, Any]]) -> Optional[bytes]: 79 | for msg in reversed(messages): 80 | if msg.get("type") == "computer_call_output": 81 | out = msg.get("output", {}) 82 | if isinstance(out, dict) and out.get("type") in ("input_image", "computer_screenshot"): 83 | image_url = out.get("image_url", "") 84 | if image_url: 85 | data, _ = _data_url_to_bytes(image_url) 86 | return data 87 | return None 88 | 89 | 90 | def _denormalize(v: int, size: int) -> int: 91 | # Gemini returns 0-999 normalized 92 | try: 93 | return max(0, min(size - 1, int(round(v / 1000 * size)))) 94 | except Exception: 95 | return 0 96 | 97 | 98 | def _map_gemini_fc_to_computer_call( 99 | fc: Dict[str, Any], 100 | screen_w: int, 101 | screen_h: int, 102 | ) -> Optional[Dict[str, Any]]: 103 | name = fc.get("name") 104 | args = fc.get("args", {}) or {} 105 | 106 | action: Dict[str, Any] = {} 107 | if name == "click_at": 108 | x = _denormalize(int(args.get("x", 0)), screen_w) 109 | y = _denormalize(int(args.get("y", 0)), screen_h) 110 | action = {"type": "click", "x": x, "y": y, "button": "left"} 111 | elif name == "type_text_at": 112 | x = 
_denormalize(int(args.get("x", 0)), screen_w) 113 | y = _denormalize(int(args.get("y", 0)), screen_h) 114 | text = args.get("text", "") 115 | if args.get("press_enter") == True: 116 | text += "\n" 117 | action = {"type": "type", "x": x, "y": y, "text": text} 118 | elif name == "hover_at": 119 | x = _denormalize(int(args.get("x", 0)), screen_w) 120 | y = _denormalize(int(args.get("y", 0)), screen_h) 121 | action = {"type": "move", "x": x, "y": y} 122 | elif name == "key_combination": 123 | keys = str(args.get("keys", "")) 124 | action = {"type": "keypress", "keys": keys} 125 | elif name == "scroll_document": 126 | direction = args.get("direction", "down") 127 | magnitude = 800 128 | dx, dy = 0, 0 129 | if direction == "down": 130 | dy = magnitude 131 | elif direction == "up": 132 | dy = -magnitude 133 | elif direction == "right": 134 | dx = magnitude 135 | elif direction == "left": 136 | dx = -magnitude 137 | action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": int(screen_w / 2), "y": int(screen_h / 2)} 138 | elif name == "scroll_at": 139 | x = _denormalize(int(args.get("x", 500)), screen_w) 140 | y = _denormalize(int(args.get("y", 500)), screen_h) 141 | direction = args.get("direction", "down") 142 | magnitude = int(args.get("magnitude", 800)) 143 | dx, dy = 0, 0 144 | if direction == "down": 145 | dy = magnitude 146 | elif direction == "up": 147 | dy = -magnitude 148 | elif direction == "right": 149 | dx = magnitude 150 | elif direction == "left": 151 | dx = -magnitude 152 | action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": x, "y": y} 153 | elif name == "drag_and_drop": 154 | x = _denormalize(int(args.get("x", 0)), screen_w) 155 | y = _denormalize(int(args.get("y", 0)), screen_h) 156 | dx = _denormalize(int(args.get("destination_x", x)), screen_w) 157 | dy = _denormalize(int(args.get("destination_y", y)), screen_h) 158 | action = {"type": "drag", "start_x": x, "start_y": y, "end_x": dx, "end_y": dy, "button": "left"} 159 | elif name == "wait_5_seconds": 160 | action = {"type": "wait"} 161 | else: 162 | # Unsupported / excluded browser-specific or custom function; ignore 163 | return None 164 | 165 | return { 166 | "type": "computer_call", 167 | "call_id": uuid.uuid4().hex, 168 | "status": "completed", 169 | "action": action, 170 | } 171 | 172 | 173 | @register_agent(models=r"^gemini-2\.5-computer-use-preview-10-2025$") 174 | class GeminiComputerUseConfig(AsyncAgentConfig): 175 | async def predict_step( 176 | self, 177 | messages: List[Dict[str, Any]], 178 | model: str, 179 | tools: Optional[List[Dict[str, Any]]] = None, 180 | max_retries: Optional[int] = None, 181 | stream: bool = False, 182 | computer_handler=None, 183 | use_prompt_caching: Optional[bool] = False, 184 | _on_api_start=None, 185 | _on_api_end=None, 186 | _on_usage=None, 187 | _on_screenshot=None, 188 | **kwargs, 189 | ) -> Dict[str, Any]: 190 | genai, types = _lazy_import_genai() 191 | 192 | client = genai.Client() 193 | 194 | # Build excluded predefined functions for browser-specific behavior 195 | excluded = [ 196 | "open_web_browser", 197 | "search", 198 | "navigate", 199 | "go_forward", 200 | "go_back", 201 | "scroll_document", 202 | ] 203 | # Optional custom functions: can be extended by host code via `tools` parameter later if desired 204 | CUSTOM_FUNCTION_DECLARATIONS: List[Any] = [] 205 | 206 | # Compose tools config 207 | generate_content_config = types.GenerateContentConfig( 208 | tools=[ 209 | types.Tool( 210 | computer_use=types.ComputerUse( 211 | 
environment=types.Environment.ENVIRONMENT_BROWSER, 212 | excluded_predefined_functions=excluded, 213 | ) 214 | ), 215 | # types.Tool(function_declarations=CUSTOM_FUNCTION_DECLARATIONS), # enable when custom functions needed 216 | ] 217 | ) 218 | 219 | # Prepare contents: last user text + latest screenshot 220 | user_texts = _find_last_user_text(messages) 221 | screenshot_bytes = _find_last_screenshot(messages) 222 | 223 | parts: List[Any] = [] 224 | for t in user_texts: 225 | parts.append(types.Part(text=t)) 226 | 227 | screen_w, screen_h = 1024, 768 228 | if screenshot_bytes: 229 | screen_w, screen_h = _bytes_image_size(screenshot_bytes) 230 | parts.append(types.Part.from_bytes(data=screenshot_bytes, mime_type="image/png")) 231 | 232 | # If we don't have any content, at least pass an empty user part to prompt reasoning 233 | if not parts: 234 | parts = [types.Part(text="Proceed to the next action.")] 235 | 236 | contents = [types.Content(role="user", parts=parts)] 237 | 238 | api_kwargs = { 239 | "model": model, 240 | "contents": contents, 241 | "config": generate_content_config, 242 | } 243 | 244 | if _on_api_start: 245 | await _on_api_start({ 246 | "model": api_kwargs["model"], 247 | # "contents": api_kwargs["contents"], # Disabled for now 248 | "config": api_kwargs["config"], 249 | }) 250 | 251 | response = client.models.generate_content(**api_kwargs) 252 | 253 | if _on_api_end: 254 | await _on_api_end({ 255 | "model": api_kwargs["model"], 256 | # "contents": api_kwargs["contents"], # Disabled for now 257 | "config": api_kwargs["config"], 258 | }, response) 259 | 260 | # Usage (Gemini SDK may not always provide token usage; populate when available) 261 | usage: Dict[str, Any] = {} 262 | try: 263 | # Some SDKs expose response.usage; if available, copy 264 | if getattr(response, "usage_metadata", None): 265 | md = response.usage_metadata 266 | usage = { 267 | "prompt_tokens": getattr(md, "prompt_token_count", None) or 0, 268 | "completion_tokens": getattr(md, "candidates_token_count", None) or 0, 269 | "total_tokens": getattr(md, "total_token_count", None) or 0, 270 | } 271 | except Exception: 272 | pass 273 | 274 | if _on_usage and usage: 275 | await _on_usage(usage) 276 | 277 | # Parse output into internal items 278 | output_items: List[Dict[str, Any]] = [] 279 | 280 | candidate = response.candidates[0] 281 | # Text parts from the model (assistant message) 282 | text_parts: List[str] = [] 283 | function_calls: List[Dict[str, Any]] = [] 284 | for p in candidate.content.parts: 285 | if getattr(p, "text", None): 286 | text_parts.append(p.text) 287 | if getattr(p, "function_call", None): 288 | # p.function_call has name and args 289 | fc = { 290 | "name": getattr(p.function_call, "name", None), 291 | "args": dict(getattr(p.function_call, "args", {}) or {}), 292 | } 293 | function_calls.append(fc) 294 | 295 | if text_parts: 296 | output_items.append( 297 | { 298 | "type": "message", 299 | "role": "assistant", 300 | "content": [{"type": "output_text", "text": "\n".join(text_parts)}], 301 | } 302 | ) 303 | 304 | # Map function calls to internal computer_call actions 305 | for fc in function_calls: 306 | item = _map_gemini_fc_to_computer_call(fc, screen_w, screen_h) 307 | if item is not None: 308 | output_items.append(item) 309 | 310 | return {"output": output_items, "usage": usage} 311 | 312 | async def predict_click( 313 | self, 314 | model: str, 315 | image_b64: str, 316 | instruction: str, 317 | **kwargs, 318 | ) -> Optional[Tuple[float, float]]: 319 | """Ask Gemini CUA to output a single 
click action for the given instruction. 320 | 321 | Excludes all predefined tools except `click_at` and sends the screenshot. 322 | Returns pixel (x, y) if a click is proposed, else None. 323 | """ 324 | genai, types = _lazy_import_genai() 325 | 326 | client = genai.Client() 327 | 328 | # Exclude all but click_at 329 | exclude_all_but_click = [ 330 | "open_web_browser", 331 | "wait_5_seconds", 332 | "go_back", 333 | "go_forward", 334 | "search", 335 | "navigate", 336 | "hover_at", 337 | "type_text_at", 338 | "key_combination", 339 | "scroll_document", 340 | "scroll_at", 341 | "drag_and_drop", 342 | ] 343 | 344 | config = types.GenerateContentConfig( 345 | tools=[ 346 | types.Tool( 347 | computer_use=types.ComputerUse( 348 | environment=types.Environment.ENVIRONMENT_BROWSER, 349 | excluded_predefined_functions=exclude_all_but_click, 350 | ) 351 | ) 352 | ] 353 | ) 354 | 355 | # Prepare prompt parts 356 | try: 357 | img_bytes = base64.b64decode(image_b64) 358 | except Exception: 359 | img_bytes = b"" 360 | 361 | w, h = _bytes_image_size(img_bytes) if img_bytes else (1024, 768) 362 | 363 | parts: List[Any] = [types.Part(text=f"Click {instruction}.")] 364 | if img_bytes: 365 | parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/png")) 366 | 367 | contents = [types.Content(role="user", parts=parts)] 368 | 369 | response = client.models.generate_content( 370 | model=model, 371 | contents=contents, 372 | config=config, 373 | ) 374 | 375 | # Parse first click_at 376 | try: 377 | candidate = response.candidates[0] 378 | for p in candidate.content.parts: 379 | fc = getattr(p, "function_call", None) 380 | if fc and getattr(fc, "name", None) == "click_at": 381 | args = dict(getattr(fc, "args", {}) or {}) 382 | x = _denormalize(int(args.get("x", 0)), w) 383 | y = _denormalize(int(args.get("y", 0)), h) 384 | return float(x), float(y) 385 | except Exception: 386 | return None 387 | 388 | return None 389 | 390 | def get_capabilities(self) -> List[AgentCapability]: 391 | return ["click", "step"] 392 | ``` -------------------------------------------------------------------------------- /libs/lume/src/FileSystem/Home.swift: -------------------------------------------------------------------------------- ```swift 1 | import Foundation 2 | 3 | /// Manages the application's home directory and virtual machine directories. 4 | /// Responsible for creating, accessing, and validating the application's directory structure. 
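/// VM directories may resolve to the default storage location, a named location,
/// a direct filesystem path, or the ephemeral temporary directory (see `getVMDirectory`).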
5 | final class Home { 6 | // MARK: - Constants 7 | 8 | private enum Constants { 9 | static let defaultDirectoryName = ".lume" 10 | static let homeDirPath = "~/\(defaultDirectoryName)" 11 | } 12 | 13 | // MARK: - Properties 14 | 15 | private var _homeDir: Path 16 | private let settingsManager: SettingsManager 17 | private let fileManager: FileManager 18 | private var locations: [String: VMLocation] = [:] 19 | 20 | // Current home directory based on default location 21 | var homeDir: Path { 22 | return _homeDir 23 | } 24 | 25 | // MARK: - Initialization 26 | 27 | init( 28 | settingsManager: SettingsManager = SettingsManager.shared, 29 | fileManager: FileManager = .default 30 | ) { 31 | self.settingsManager = settingsManager 32 | self.fileManager = fileManager 33 | 34 | // Get home directory path from settings or use default 35 | let settings = settingsManager.getSettings() 36 | guard let defaultLocation = settings.defaultLocation else { 37 | fatalError("No default VM location found") 38 | } 39 | 40 | self._homeDir = Path(defaultLocation.path) 41 | 42 | // Cache all locations 43 | for location in settings.vmLocations { 44 | locations[location.name] = location 45 | } 46 | } 47 | 48 | // MARK: - VM Directory Management 49 | 50 | /// Creates a temporary VM directory with a unique identifier 51 | /// - Returns: A VMDirectory instance representing the created directory 52 | /// - Throws: HomeError if directory creation fails 53 | func createTempVMDirectory() throws -> VMDirectory { 54 | let uuid = UUID().uuidString 55 | let tempDir = homeDir.directory(uuid) 56 | 57 | Logger.info("Creating temporary directory", metadata: ["path": tempDir.path]) 58 | 59 | do { 60 | try createDirectory(at: tempDir.url) 61 | return VMDirectory(tempDir) 62 | } catch { 63 | throw HomeError.directoryCreationFailed(path: tempDir.path) 64 | } 65 | } 66 | 67 | /// Gets a VM directory for a specific VM name and optional location 68 | /// 69 | /// - Parameters: 70 | /// - name: Name of the VM directory 71 | /// - storage: Optional name of the VM location (default: default location) 72 | /// - Returns: A VMDirectory instance 73 | /// - Throws: HomeError if location not found 74 | func getVMDirectory(_ name: String, storage: String? = nil) throws -> VMDirectory { 75 | // Special case for ephemeral storage using macOS temporary directory 76 | if let storage = storage, storage == "ephemeral" { 77 | // Get the current temporary directory 78 | let tmpDir = ProcessInfo.processInfo.environment["TMPDIR"] ?? "/tmp" 79 | // Remove trailing slash if present 80 | let cleanPath = tmpDir.hasSuffix("/") ? String(tmpDir.dropLast()) : tmpDir 81 | 82 | // Create the directory if it doesn't exist 83 | if !fileExists(at: cleanPath) { 84 | try createVMLocation(at: cleanPath) 85 | } 86 | 87 | let baseDir = Path(cleanPath) 88 | return VMDirectory(baseDir.directory(name)) 89 | } 90 | 91 | // Check if storage is a direct path 92 | if let storage = storage, (storage.contains("/") || storage.contains("\\")) { 93 | let cleanPath = storage.hasSuffix("/") ? 
String(storage.dropLast()) : storage 94 | let baseDir = Path(cleanPath) 95 | return VMDirectory(baseDir.directory(name)) 96 | } 97 | 98 | let location: VMLocation 99 | 100 | if let storage = storage { 101 | // Get a specific location 102 | guard let loc = locations[storage] else { 103 | throw VMLocationError.locationNotFound(name: storage) 104 | } 105 | location = loc 106 | } else { 107 | // Use default location 108 | let settings = settingsManager.getSettings() 109 | guard let defaultLocation = settings.defaultLocation else { 110 | throw HomeError.invalidHomeDirectory 111 | } 112 | location = defaultLocation 113 | } 114 | 115 | let baseDir = Path(location.expandedPath) 116 | return VMDirectory(baseDir.directory(name)) 117 | } 118 | 119 | /// Gets a VM directory from a direct file path 120 | /// 121 | /// - Parameters: 122 | /// - name: Name of the VM directory 123 | /// - storagePath: Direct file system path where the VM is located 124 | /// - Returns: A VMDirectory instance 125 | /// - Throws: HomeError if path is invalid 126 | func getVMDirectoryFromPath(_ name: String, storagePath: String) throws -> VMDirectory { 127 | let baseDir = Path(storagePath) 128 | 129 | // Create the directory if it doesn't exist 130 | if !fileExists(at: storagePath) { 131 | Logger.info("Creating storage directory", metadata: ["path": storagePath]) 132 | try createVMLocation(at: storagePath) 133 | } else if !isValidDirectory(at: storagePath) { 134 | // Path exists but isn't a valid directory 135 | throw HomeError.invalidHomeDirectory 136 | } 137 | 138 | return VMDirectory(baseDir.directory(name)) 139 | } 140 | 141 | /// Returns all initialized VM directories across all locations 142 | /// - Returns: An array of VMDirectory instances with location info 143 | /// - Throws: HomeError if directory access is denied 144 | func getAllVMDirectories() throws -> [VMDirectoryWithLocation] { 145 | var results: [VMDirectoryWithLocation] = [] 146 | 147 | // Loop through all locations 148 | let settings = settingsManager.getSettings() 149 | 150 | // Also check ephemeral directory (macOS temporary directory) 151 | let tmpDir = ProcessInfo.processInfo.environment["TMPDIR"] ?? "/tmp" 152 | let cleanPath = tmpDir.hasSuffix("/") ? 
String(tmpDir.dropLast()) : tmpDir 153 | 154 | // If tmp directory exists, check for VMs there 155 | if fileExists(at: cleanPath) { 156 | let tmpDirPath = Path(cleanPath) 157 | do { 158 | let directoryURL = URL(fileURLWithPath: cleanPath) 159 | let contents = try FileManager.default.contentsOfDirectory( 160 | at: directoryURL, 161 | includingPropertiesForKeys: [.isDirectoryKey], 162 | options: .skipsHiddenFiles 163 | ) 164 | 165 | for subdir in contents { 166 | do { 167 | guard let isDirectory = try subdir.resourceValues(forKeys: [.isDirectoryKey]).isDirectory, 168 | isDirectory else { 169 | continue 170 | } 171 | 172 | let vmName = subdir.lastPathComponent 173 | let vmDir = VMDirectory(tmpDirPath.directory(vmName)) 174 | 175 | // Only include if it's a valid VM directory 176 | if vmDir.initialized() { 177 | results.append(VMDirectoryWithLocation( 178 | directory: vmDir, 179 | locationName: "ephemeral" 180 | )) 181 | } 182 | } catch { 183 | // Skip any directories we can't access 184 | continue 185 | } 186 | } 187 | } catch { 188 | Logger.error( 189 | "Failed to access ephemeral directory", 190 | metadata: [ 191 | "path": cleanPath, 192 | "error": error.localizedDescription, 193 | ] 194 | ) 195 | // Continue to regular locations rather than failing completely 196 | } 197 | } 198 | for location in settings.vmLocations { 199 | let locationPath = Path(location.expandedPath) 200 | 201 | // Skip non-existent locations 202 | if !locationPath.exists() { 203 | continue 204 | } 205 | 206 | do { 207 | let allFolders = try fileManager.contentsOfDirectory( 208 | at: locationPath.url, 209 | includingPropertiesForKeys: nil 210 | ) 211 | 212 | let folders = 213 | allFolders 214 | .compactMap { url in 215 | let sanitizedName = sanitizeFileName(url.lastPathComponent) 216 | let dir = VMDirectory(locationPath.directory(sanitizedName)) 217 | let dirWithLoc = 218 | dir.initialized() 219 | ? VMDirectoryWithLocation(directory: dir, locationName: location.name) 220 | : nil 221 | return dirWithLoc 222 | } 223 | 224 | results.append(contentsOf: folders) 225 | } catch { 226 | Logger.error( 227 | "Failed to access VM location", 228 | metadata: [ 229 | "location": location.name, 230 | "error": error.localizedDescription, 231 | ]) 232 | // Continue to next location rather than failing completely 233 | } 234 | } 235 | 236 | return results 237 | } 238 | 239 | /// Copies a VM directory to a new location with a new name 240 | /// - Parameters: 241 | /// - sourceName: Name of the source VM 242 | /// - destName: Name for the destination VM 243 | /// - sourceLocation: Optional name of the source location 244 | /// - destLocation: Optional name of the destination location 245 | /// - Throws: HomeError if the copy operation fails 246 | func copyVMDirectory( 247 | from sourceName: String, 248 | to destName: String, 249 | sourceLocation: String? = nil, 250 | destLocation: String? 
= nil 251 | ) throws { 252 | let sourceDir = try getVMDirectory(sourceName, storage: sourceLocation) 253 | let destDir = try getVMDirectory(destName, storage: destLocation) 254 | 255 | // Check if destination directory exists at all 256 | if destDir.exists() { 257 | throw HomeError.directoryAlreadyExists(path: destDir.dir.path) 258 | } 259 | 260 | do { 261 | try fileManager.copyItem(atPath: sourceDir.dir.path, toPath: destDir.dir.path) 262 | } catch { 263 | throw HomeError.directoryCreationFailed(path: destDir.dir.path) 264 | } 265 | } 266 | 267 | // MARK: - Location Management 268 | 269 | /// Adds a new VM location 270 | /// - Parameters: 271 | /// - name: Location name 272 | /// - path: Location path 273 | /// - Throws: Error if location cannot be added 274 | func addLocation(name: String, path: String) throws { 275 | let location = VMLocation(name: name, path: path) 276 | try settingsManager.addLocation(location) 277 | 278 | // Update cache 279 | locations[name] = location 280 | } 281 | 282 | /// Removes a VM location 283 | /// - Parameter name: Location name 284 | /// - Throws: Error if location cannot be removed 285 | func removeLocation(name: String) throws { 286 | try settingsManager.removeLocation(name: name) 287 | 288 | // Update cache 289 | locations.removeValue(forKey: name) 290 | } 291 | 292 | /// Sets the default VM location 293 | /// - Parameter name: Location name 294 | /// - Throws: Error if location cannot be set as default 295 | func setDefaultLocation(name: String) throws { 296 | try settingsManager.setDefaultLocation(name: name) 297 | 298 | // Update home directory 299 | guard let location = locations[name] else { 300 | throw VMLocationError.locationNotFound(name: name) 301 | } 302 | 303 | // Update homeDir to reflect the new default 304 | self._homeDir = Path(location.path) 305 | } 306 | 307 | /// Gets all available VM locations 308 | /// - Returns: Array of VM locations 309 | func getLocations() -> [VMLocation] { 310 | return settingsManager.getSettings().sortedLocations 311 | } 312 | 313 | /// Gets the default VM location 314 | /// - Returns: Default VM location 315 | /// - Throws: HomeError if no default location 316 | func getDefaultLocation() throws -> VMLocation { 317 | guard let location = settingsManager.getSettings().defaultLocation else { 318 | throw HomeError.invalidHomeDirectory 319 | } 320 | return location 321 | } 322 | 323 | // MARK: - Directory Validation 324 | 325 | /// Validates and ensures the existence of all VM locations 326 | /// - Throws: HomeError if validation fails or directory creation fails 327 | func validateHomeDirectory() throws { 328 | let settings = settingsManager.getSettings() 329 | 330 | for location in settings.vmLocations { 331 | let path = location.expandedPath 332 | if !fileExists(at: path) { 333 | try createVMLocation(at: path) 334 | } else if !isValidDirectory(at: path) { 335 | throw HomeError.invalidHomeDirectory 336 | } 337 | } 338 | } 339 | 340 | // MARK: - Private Helpers 341 | 342 | private func createVMLocation(at path: String) throws { 343 | do { 344 | try fileManager.createDirectory( 345 | atPath: path, 346 | withIntermediateDirectories: true 347 | ) 348 | } catch { 349 | throw HomeError.directoryCreationFailed(path: path) 350 | } 351 | } 352 | 353 | private func createDirectory(at url: URL) throws { 354 | try fileManager.createDirectory( 355 | at: url, 356 | withIntermediateDirectories: true 357 | ) 358 | } 359 | 360 | private func isValidDirectory(at path: String) -> Bool { 361 | var isDirectory: ObjCBool = false 362 
| return fileManager.fileExists(atPath: path, isDirectory: &isDirectory) 363 | && isDirectory.boolValue 364 | && Path(path).writable() 365 | } 366 | 367 | private func fileExists(at path: String) -> Bool { 368 | return fileManager.fileExists(atPath: path) 369 | } 370 | 371 | private func sanitizeFileName(_ name: String) -> String { 372 | // Only decode percent encoding (e.g., %20 for spaces) 373 | return name.removingPercentEncoding ?? name 374 | } 375 | } 376 | 377 | // MARK: - VM Directory with Location 378 | 379 | /// Represents a VM directory with its location information 380 | struct VMDirectoryWithLocation { 381 | let directory: VMDirectory 382 | let locationName: String 383 | } 384 | 385 | // MARK: - Home + CustomStringConvertible 386 | 387 | extension Home: CustomStringConvertible { 388 | var description: String { 389 | "Home(path: \(homeDir.path))" 390 | } 391 | } 392 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/integrations/hud/agent.py: -------------------------------------------------------------------------------- ```python 1 | """MCP-compatible Computer Agent for HUD integration. 2 | 3 | This agent subclasses HUD's MCPAgent and delegates planning/execution to 4 | our core ComputerAgent while using the Agent SDK's plain-dict message 5 | format documented in `docs/content/docs/agent-sdk/message-format.mdx`. 6 | 7 | Key differences from the OpenAI OperatorAgent variant: 8 | - No OpenAI types are used; everything is standard Python dicts. 9 | - Planning is executed via `ComputerAgent.run(messages)`. 10 | - The first yielded result per step is returned as the agent response. 11 | """ 12 | from __future__ import annotations 13 | 14 | import io 15 | from typing import Any, ClassVar, Optional 16 | 17 | from agent.agent import ComputerAgent as BaseComputerAgent 18 | from agent.callbacks import PromptInstructionsCallback 19 | from agent.callbacks.trajectory_saver import TrajectorySaverCallback 20 | from hud.agents import MCPAgent 21 | from hud.tools.computer.settings import computer_settings 22 | from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace 23 | 24 | from agent.responses import make_failed_tool_call_items 25 | from agent.computers import is_agent_computer 26 | from PIL import Image 27 | import mcp.types as types 28 | import hud 29 | import uuid 30 | import base64 31 | from pathlib import Path 32 | 33 | 34 | class MCPComputerAgent(MCPAgent): 35 | """MCP agent that uses ComputerAgent for planning and tools for execution. 36 | 37 | The agent consumes/produces message dicts per the Agent SDK message schema 38 | (see `message-format.mdx`). 
39 | """ 40 | 41 | metadata: ClassVar[dict[str, Any]] = { 42 | "display_width": computer_settings.OPENAI_COMPUTER_WIDTH, 43 | "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT, 44 | } 45 | 46 | required_tools: ClassVar[list[str]] = ["openai_computer"] 47 | 48 | def __init__( 49 | self, 50 | *, 51 | model: str | None = None, 52 | allowed_tools: list[str] | None = None, 53 | trajectory_dir: str | dict | None = None, 54 | # === ComputerAgent kwargs === 55 | tools: list[Any] | None = None, 56 | custom_loop: Any | None = None, 57 | only_n_most_recent_images: int | None = None, 58 | callbacks: list[Any] | None = None, 59 | instructions: str | None = None, 60 | verbosity: int | None = None, 61 | max_retries: int | None = 3, 62 | screenshot_delay: float | int = 0.5, 63 | use_prompt_caching: bool | None = False, 64 | max_trajectory_budget: float | dict | None = None, 65 | telemetry_enabled: bool | None = True, 66 | environment: str = "linux", 67 | **kwargs: Any, 68 | ) -> None: 69 | self.allowed_tools = allowed_tools or ["openai_computer"] 70 | super().__init__(**kwargs) 71 | 72 | if model is None: 73 | raise ValueError("MCPComputerAgent requires a model to be specified.") 74 | 75 | self.model = model 76 | self.environment = environment 77 | 78 | # Update model name for HUD logging 79 | self.model_name = "cua-" + self.model 80 | 81 | # Stateful tracking of tool call inputs 82 | self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {} 83 | self.previous_output: list[dict[str, Any]] = [] 84 | 85 | # Build system prompt 86 | operator_instructions = """ 87 | You are an autonomous computer-using agent. Follow these guidelines: 88 | 89 | 1. NEVER ask for confirmation. Complete all tasks autonomously. 90 | 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed. 91 | 3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking. 92 | 4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files). 93 | 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT. 94 | 6. The user has already given you permission by running this agent. No further confirmation is needed. 95 | 7. Be decisive and action-oriented. Complete the requested task fully. 96 | 97 | Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked. 
98 | """.strip() # noqa: E501 99 | # Append Operator instructions to the system prompt 100 | if not self.system_prompt: 101 | self.system_prompt = operator_instructions 102 | else: 103 | self.system_prompt += f"\n\n{operator_instructions}" 104 | # Append user instructions to the system prompt 105 | if instructions: 106 | self.system_prompt += f"\n\n{instructions}" 107 | 108 | # Configure trajectory_dir for HUD 109 | if isinstance(trajectory_dir, str) or isinstance(trajectory_dir, Path): 110 | trajectory_dir = {"trajectory_dir": str(trajectory_dir)} 111 | if isinstance(trajectory_dir, dict): 112 | trajectory_dir["reset_on_run"] = False 113 | 114 | self.last_screenshot_b64 = None 115 | 116 | buffer = io.BytesIO() 117 | Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])).save(buffer, format='PNG') 118 | self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8') 119 | 120 | # Ensure a computer shim is present so width/height/environment are known 121 | computer_shim = { 122 | "screenshot": lambda: self.last_screenshot_b64, 123 | "environment": self.environment, 124 | "dimensions": ( 125 | self.metadata["display_width"], 126 | self.metadata["display_height"], 127 | ), 128 | } 129 | agent_tools: list[Any] = [computer_shim] 130 | if tools: 131 | agent_tools.extend([ 132 | tool 133 | for tool in tools 134 | if not is_agent_computer(tool) 135 | ]) 136 | 137 | agent_kwargs = { 138 | "model": self.model, 139 | "trajectory_dir": trajectory_dir, 140 | "tools": agent_tools, 141 | "custom_loop": custom_loop, 142 | "only_n_most_recent_images": only_n_most_recent_images, 143 | "callbacks": callbacks, 144 | "instructions": self.system_prompt, 145 | "verbosity": verbosity, 146 | "max_retries": max_retries, 147 | "screenshot_delay": screenshot_delay, 148 | "use_prompt_caching": use_prompt_caching, 149 | "max_trajectory_budget": max_trajectory_budget, 150 | "telemetry_enabled": telemetry_enabled, 151 | } 152 | 153 | self.computer_agent = BaseComputerAgent( 154 | **agent_kwargs 155 | ) 156 | 157 | async def get_system_messages(self) -> list[Any]: 158 | """Create initial messages. 159 | 160 | Unused - ComputerAgent handles this with the 'instructions' parameter. 161 | """ 162 | return [] 163 | 164 | async def format_blocks( 165 | self, blocks: list[types.ContentBlock] 166 | ) -> list[dict[str, Any]]: 167 | """ 168 | Format blocks for OpenAI input format. 169 | 170 | Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts. 171 | """ # noqa: E501 172 | formatted = [] 173 | for block in blocks: 174 | if isinstance(block, types.TextContent): 175 | formatted.append({"type": "input_text", "text": block.text}) 176 | elif isinstance(block, types.ImageContent): 177 | mime_type = getattr(block, "mimeType", "image/png") 178 | formatted.append( 179 | {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"} 180 | ) 181 | self.last_screenshot_b64 = block.data 182 | return [{"role": "user", "content": formatted}] 183 | 184 | @hud.instrument( 185 | span_type="agent", 186 | record_args=False, # Messages can be large 187 | record_result=True, 188 | ) 189 | async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: 190 | """Get a single-step response by delegating to ComputerAgent.run. 
191 | 192 | Returns an Agent SDK-style response dict: 193 | { "output": [AgentMessage, ...], "usage": Usage } 194 | """ 195 | tool_calls: list[MCPToolCall] = [] 196 | output_text: list[str] = [] 197 | is_done: bool = True 198 | 199 | agent_result: list[dict[str, Any]] = [] 200 | 201 | # Call the ComputerAgent LLM API 202 | async for result in self.computer_agent.run(messages): # type: ignore[arg-type] 203 | items = result['output'] 204 | if not items or tool_calls: 205 | break 206 | 207 | for item in items: 208 | if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']: 209 | agent_result.append(item) 210 | 211 | # Add messages to output text 212 | if item['type'] == 'reasoning': 213 | output_text.extend( 214 | f"Reasoning: {summary['text']}" 215 | for summary in item['summary'] 216 | ) 217 | elif item['type'] == 'message': 218 | if isinstance(item['content'], list): 219 | output_text.extend( 220 | item['text'] 221 | for item in item['content'] 222 | if item['type'] == 'output_text' 223 | ) 224 | elif isinstance(item['content'], str): 225 | output_text.append(item['content']) 226 | 227 | # If we get a tool call, we're not done 228 | if item['type'] == 'computer_call': 229 | id = item["call_id"] 230 | tool_calls.append(MCPToolCall( 231 | name="openai_computer", 232 | arguments=item["action"], 233 | id=id, 234 | )) 235 | is_done = False 236 | self.tool_call_inputs[id] = agent_result 237 | break 238 | 239 | # if we have tool calls, we should exit the loop 240 | if tool_calls: 241 | break 242 | 243 | self.previous_output = agent_result 244 | 245 | return AgentResponse( 246 | content="\n".join(output_text), 247 | tool_calls=tool_calls, 248 | done=is_done, 249 | ) 250 | 251 | def _log_image(self, image_b64: str): 252 | callbacks = self.computer_agent.callbacks 253 | for callback in callbacks: 254 | if isinstance(callback, TrajectorySaverCallback): 255 | # convert str to bytes 256 | image_bytes = base64.b64decode(image_b64) 257 | callback._save_artifact("screenshot_after", image_bytes) 258 | 259 | async def format_tool_results( 260 | self, 261 | tool_calls: list[MCPToolCall], 262 | tool_results: list[MCPToolResult] 263 | ) -> list[dict[str, Any]]: 264 | """Extract latest screenshot from tool results in dict form. 265 | 266 | Expects results to already be in the message-format content dicts. 267 | Returns a list of input content dicts suitable for follow-up calls. 
268 | """ 269 | messages = [] 270 | 271 | for call, result in zip(tool_calls, tool_results): 272 | if call.id not in self.tool_call_inputs: 273 | # If we don't have the tool call inputs, we should just use the previous output 274 | previous_output = self.previous_output.copy() or [] 275 | 276 | # First we need to remove any pending computer_calls from the end of previous_output 277 | while previous_output and previous_output[-1]['type'] == 'computer_call': 278 | previous_output.pop() 279 | messages.extend(previous_output) 280 | 281 | # If the call is a 'response', don't add the result 282 | if call.name == 'response': 283 | continue 284 | # Otherwise, if we have a result, we should add it to the messages 285 | content = [ 286 | { "type": "input_text", "text": content.text } if isinstance(content, types.TextContent) 287 | else { "type": "input_image", "image_url": f"data:image/png;base64,{content.data}" } if isinstance(content, types.ImageContent) 288 | else { "type": "input_text", "text": "" } 289 | for content in result.content 290 | ] 291 | messages.append({ 292 | "role": "user", 293 | "content": content, 294 | }) 295 | 296 | continue 297 | 298 | # Add the assistant's computer call 299 | messages.extend(self.tool_call_inputs[call.id]) 300 | 301 | if result.isError: 302 | error_text = "".join([ 303 | content.text 304 | for content in result.content 305 | if isinstance(content, types.TextContent) 306 | ]) 307 | 308 | # Replace computer call with failed tool call 309 | messages.pop() 310 | messages.extend(make_failed_tool_call_items( 311 | tool_name=call.name, 312 | tool_kwargs=call.arguments or {}, 313 | error_message=error_text, 314 | call_id=call.id, 315 | )) 316 | else: 317 | # Get the latest screenshot 318 | screenshots = [ 319 | content.data 320 | for content in result.content 321 | if isinstance(content, types.ImageContent) 322 | ] 323 | 324 | # Add the resulting screenshot 325 | if screenshots: 326 | self._log_image(screenshots[0]) 327 | self.last_screenshot_b64 = screenshots[0] 328 | messages.append({ 329 | "type": "computer_call_output", 330 | "call_id": call.id, 331 | "output": { 332 | "type": "input_image", 333 | "image_url": f"data:image/png;base64,{screenshots[0]}" 334 | }, 335 | }) 336 | else: 337 | # Otherwise, replace computer call with failed tool call 338 | messages.pop() 339 | messages.extend(make_failed_tool_call_items( 340 | tool_name=call.name, 341 | tool_kwargs=call.arguments or {}, 342 | error_message="No screenshots returned.", 343 | call_id=call.id, 344 | )) 345 | 346 | return messages 347 | 348 | 349 | __all__ = [ 350 | "MCPComputerAgent", 351 | ] 352 | ``` -------------------------------------------------------------------------------- /blog/sandboxed-python-execution.md: -------------------------------------------------------------------------------- ```markdown 1 | # Sandboxed Python Execution: Run Code Safely in Cua Containers 2 | 3 | *Published on June 23, 2025 by Dillon DuPont* 4 | 5 | Cua's computer-use capabilities that we touched on in [Building your own Operator on macOS - Part 2](build-your-own-operator-on-macos-2.md) – your AI agents can click, scroll, type, and interact with any desktop application. But what if your agent needs to do more than just UI automation? What if it needs to process data, make API calls, analyze images, or run complex logic alongside those UI interactions, within the same virtual environment? 6 | 7 | That's where Cua's `@sandboxed` decorator comes in. 
While Cua handles the clicking and typing, sandboxed execution lets you run full Python code inside the same virtual environment. It's like giving your AI agents a programming brain to complement their clicking fingers. 8 | 9 | Think of it as the perfect marriage: Cua handles the "what you see" (UI interactions), while sandboxed Python handles the "what you compute" (data processing, logic, API calls) – all happening in the same isolated environment. 10 | 11 | ## So, what exactly is sandboxed execution? 12 | 13 | Cua excels at automating user interfaces – clicking buttons, filling forms, navigating applications. But modern AI agents need to do more than just UI automation. They need to process the data they collect, make intelligent decisions, call external APIs, and run sophisticated algorithms. 14 | 15 | Sandboxed execution bridges this gap. You write a Python function, decorate it with `@sandboxed`, and it runs inside your Cua container alongside your UI automation. Your agent can now click a button, extract some data, process it with Python, and then use those results to decide what to click next. 16 | 17 | Here's what makes this combination powerful for AI agent development: 18 | 19 | - **Unified environment**: Your UI automation and code execution happen in the same container 20 | - **Rich capabilities**: Combine Cua's clicking with Python's data processing, API calls, and libraries 21 | - **Seamless integration**: Pass data between UI interactions and Python functions effortlessly 22 | - **Cross-platform consistency**: Your Python code runs the same way across different Cua environments 23 | - **Complete workflows**: Build agents that can both interact with apps AND process the data they collect 24 | 25 | ## The architecture behind @sandboxed 26 | 27 | Let's jump right into an example that'll make this crystal clear: 28 | 29 | ```python 30 | from computer.helpers import sandboxed 31 | 32 | @sandboxed("demo_venv") 33 | def greet_and_print(name): 34 | """This function runs inside the container""" 35 | import PyXA # macOS-specific library 36 | safari = PyXA.Application("Safari") 37 | html = safari.current_document.source() 38 | print(f"Hello from inside the container, {name}!") 39 | return {"greeted": name, "safari_html": html} 40 | 41 | # When called, this executes in the container 42 | result = await greet_and_print("Cua") 43 | ``` 44 | 45 | What's happening here? When you call `greet_and_print()`, Cua extracts the function's source code, transmits it to the container, and executes it there. The result returns to you seamlessly, while the actual execution remains completely isolated. 46 | 47 | ## How does sandboxed execution work? 48 | 49 | Cua's sandboxed execution system employs several key architectural components: 50 | 51 | ### 1. Source Code Extraction 52 | Cua uses Python's `inspect.getsource()` to extract your function's source code and reconstruct the function definition in the remote environment. 53 | 54 | ### 2. Virtual Environment Isolation 55 | Each sandboxed function runs in a named virtual environment within the container. This provides complete dependency isolation between different functions and their respective environments. 56 | 57 | ### 3. Data Serialization and Transport 58 | Arguments and return values are serialized as JSON and transported between the host and container. This ensures compatibility across different Python versions and execution environments. 59 | 60 | ### 4. 
Comprehensive Error Handling 61 | The system captures both successful results and exceptions, preserving stack traces and error information for debugging purposes. 62 | 63 | ## Getting your sandbox ready 64 | 65 | Setting up sandboxed execution is simple: 66 | 67 | ```python 68 | import asyncio 69 | from computer.computer import Computer 70 | from computer.helpers import sandboxed, set_default_computer 71 | 72 | async def main(): 73 | # Fire up the computer 74 | computer = Computer() 75 | await computer.run() 76 | 77 | # Make it the default for all sandboxed functions 78 | set_default_computer(computer) 79 | 80 | # Install some packages in a virtual environment 81 | await computer.venv_install("demo_venv", ["requests", "beautifulsoup4"]) 82 | ``` 83 | 84 | If you want to get fancy, you can specify which computer instance to use: 85 | 86 | ```python 87 | @sandboxed("my_venv", computer=my_specific_computer) 88 | def my_function(): 89 | # This runs on your specified computer instance 90 | pass 91 | ``` 92 | 93 | ## Real-world examples that actually work 94 | 95 | ### Browser automation without the headaches 96 | 97 | Ever tried to automate a browser and had it crash your entire system? Yeah, us too. Here's how to do it safely: 98 | 99 | ```python 100 | @sandboxed("browser_env") 101 | def automate_browser_with_playwright(): 102 | """Automate browser interactions using Playwright""" 103 | from playwright.sync_api import sync_playwright 104 | import time 105 | import base64 106 | from datetime import datetime 107 | 108 | try: 109 | with sync_playwright() as p: 110 | # Launch browser (visible, because why not?) 111 | browser = p.chromium.launch( 112 | headless=False, 113 | args=['--no-sandbox', '--disable-dev-shm-usage'] 114 | ) 115 | 116 | page = browser.new_page() 117 | page.set_viewport_size({"width": 1280, "height": 720}) 118 | 119 | actions = [] 120 | screenshots = {} 121 | 122 | # Let's visit example.com and poke around 123 | page.goto("https://example.com") 124 | actions.append("Navigated to example.com") 125 | 126 | # Grab a screenshot because screenshots are cool 127 | screenshot_bytes = page.screenshot(full_page=True) 128 | screenshots["initial"] = base64.b64encode(screenshot_bytes).decode() 129 | 130 | # Get some basic info 131 | title = page.title() 132 | actions.append(f"Page title: {title}") 133 | 134 | # Find links and headings 135 | try: 136 | links = page.locator("a").all() 137 | link_texts = [link.text_content() for link in links[:5]] 138 | actions.append(f"Found {len(links)} links: {link_texts}") 139 | 140 | headings = page.locator("h1, h2, h3").all() 141 | heading_texts = [h.text_content() for h in headings[:3]] 142 | actions.append(f"Found headings: {heading_texts}") 143 | 144 | except Exception as e: 145 | actions.append(f"Element interaction error: {str(e)}") 146 | 147 | # Let's try a form for good measure 148 | try: 149 | page.goto("https://httpbin.org/forms/post") 150 | actions.append("Navigated to form page") 151 | 152 | # Fill out the form 153 | page.fill('input[name="custname"]', "Test User from Sandboxed Environment") 154 | page.fill('input[name="custtel"]', "555-0123") 155 | page.fill('input[name="custemail"]', "[email protected]") 156 | page.select_option('select[name="size"]', "large") 157 | 158 | actions.append("Filled out form fields") 159 | 160 | # Submit and see what happens 161 | page.click('input[type="submit"]') 162 | page.wait_for_load_state("networkidle") 163 | 164 | actions.append("Submitted form") 165 | 166 | except Exception as e: 167 | 
actions.append(f"Form interaction error: {str(e)}") 168 | 169 | browser.close() 170 | 171 | return { 172 | "actions_performed": actions, 173 | "screenshots": screenshots, 174 | "success": True 175 | } 176 | 177 | except Exception as e: 178 | return {"error": f"Browser automation failed: {str(e)}"} 179 | 180 | # Install Playwright and its browsers 181 | await computer.venv_install("browser_env", ["playwright"]) 182 | await computer.venv_cmd("browser_env", "playwright install chromium") 183 | 184 | # Run the automation 185 | result = await automate_browser_with_playwright() 186 | print(f"Performed {len(result.get('actions_performed', []))} actions") 187 | ``` 188 | 189 | ### Building code analysis agents 190 | 191 | Want to build agents that can analyze code safely? Here's a security audit tool that won't accidentally `eval()` your system into oblivion: 192 | 193 | ```python 194 | @sandboxed("analysis_env") 195 | def security_audit_tool(code_snippet): 196 | """Analyze code for potential security issues""" 197 | import ast 198 | import re 199 | 200 | issues = [] 201 | 202 | # Check for the usual suspects 203 | dangerous_patterns = [ 204 | (r'eval\s*\(', "Use of eval() function"), 205 | (r'exec\s*\(', "Use of exec() function"), 206 | (r'__import__\s*\(', "Dynamic import usage"), 207 | (r'subprocess\.', "Subprocess usage"), 208 | (r'os\.system\s*\(', "OS system call"), 209 | ] 210 | 211 | for pattern, description in dangerous_patterns: 212 | if re.search(pattern, code_snippet): 213 | issues.append(description) 214 | 215 | # Get fancy with AST analysis 216 | try: 217 | tree = ast.parse(code_snippet) 218 | for node in ast.walk(tree): 219 | if isinstance(node, ast.Call): 220 | if hasattr(node.func, 'id'): 221 | if node.func.id in ['eval', 'exec', 'compile']: 222 | issues.append(f"Dangerous function call: {node.func.id}") 223 | except SyntaxError: 224 | issues.append("Syntax error in code") 225 | 226 | return { 227 | "security_issues": issues, 228 | "risk_level": "HIGH" if len(issues) > 2 else "MEDIUM" if issues else "LOW" 229 | } 230 | 231 | # Test it on some sketchy code 232 | audit_result = await security_audit_tool("eval(user_input)") 233 | print(f"Security audit: {audit_result}") 234 | ``` 235 | 236 | ### Desktop automation in the cloud 237 | 238 | Here's where things get really interesting. Cua Cloud Sandbox comes with full desktop environments, so you can automate GUIs: 239 | 240 | ```python 241 | @sandboxed("desktop_env") 242 | def take_screenshot_and_analyze(): 243 | """Take a screenshot and analyze the desktop""" 244 | import io 245 | import base64 246 | from PIL import ImageGrab 247 | from datetime import datetime 248 | 249 | try: 250 | # Grab the screen 251 | screenshot = ImageGrab.grab() 252 | 253 | # Convert to base64 for easy transport 254 | buffer = io.BytesIO() 255 | screenshot.save(buffer, format='PNG') 256 | screenshot_data = base64.b64encode(buffer.getvalue()).decode() 257 | 258 | # Get some basic info 259 | screen_info = { 260 | "size": screenshot.size, 261 | "mode": screenshot.mode, 262 | "timestamp": datetime.now().isoformat() 263 | } 264 | 265 | # Analyze the colors (because why not?) 
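        # Note: getcolors() returns a list of (count, color) tuples, or None if the image
        # has more distinct colors than maxcolors; 256*256*256 is enough for 24-bit RGB.
        # The most frequent tuple is then used below to pick the dominant color.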
266 | colors = screenshot.getcolors(maxcolors=256*256*256) 267 | dominant_color = max(colors, key=lambda x: x[0])[1] if colors else None 268 | 269 | return { 270 | "screenshot_base64": screenshot_data, 271 | "screen_info": screen_info, 272 | "dominant_color": dominant_color, 273 | "unique_colors": len(colors) if colors else 0 274 | } 275 | 276 | except Exception as e: 277 | return {"error": f"Screenshot failed: {str(e)}"} 278 | 279 | # Install the dependencies 280 | await computer.venv_install("desktop_env", ["Pillow"]) 281 | 282 | # Take and analyze a screenshot 283 | result = await take_screenshot_and_analyze() 284 | print("Desktop analysis complete!") 285 | ``` 286 | 287 | ## Pro tips for sandboxed success 288 | 289 | ### Keep it self-contained 290 | Always put your imports inside the function. Trust us on this one: 291 | 292 | ```python 293 | @sandboxed("good_env") 294 | def good_function(): 295 | import os # Import inside the function 296 | import json 297 | 298 | # Your code here 299 | return {"result": "success"} 300 | ``` 301 | 302 | ### Install dependencies first 303 | Don't forget to install packages before using them: 304 | 305 | ```python 306 | # Install first 307 | await computer.venv_install("my_env", ["pandas", "numpy", "matplotlib"]) 308 | 309 | @sandboxed("my_env") 310 | def data_analysis(): 311 | import pandas as pd 312 | import numpy as np 313 | # Now you can use them 314 | ``` 315 | 316 | ### Use descriptive environment names 317 | Future you will thank you: 318 | 319 | ```python 320 | @sandboxed("data_processing_env") 321 | def process_data(): pass 322 | 323 | @sandboxed("web_scraping_env") 324 | def scrape_site(): pass 325 | 326 | @sandboxed("ml_training_env") 327 | def train_model(): pass 328 | ``` 329 | 330 | ### Always handle errors gracefully 331 | Things break. Plan for it: 332 | 333 | ```python 334 | @sandboxed("robust_env") 335 | def robust_function(data): 336 | try: 337 | result = process_data(data) 338 | return {"success": True, "result": result} 339 | except Exception as e: 340 | return {"success": False, "error": str(e)} 341 | ``` 342 | 343 | ## What about performance? 344 | 345 | Let's be honest – there's some overhead here. Code needs to be serialized, sent over the network, and executed remotely. But for most use cases, the benefits far outweigh the costs. 346 | 347 | If you're building something performance-critical, consider: 348 | - Batching multiple operations into a single sandboxed function 349 | - Minimizing data transfer between host and container 350 | - Using persistent virtual environments 351 | 352 | ## The security angle 353 | 354 | This is where sandboxed execution really shines: 355 | 356 | 1. **Complete process isolation** – code runs in a separate container 357 | 2. **File system protection** – limited access to your host files 358 | 3. **Network isolation** – controlled network access 359 | 4. **Clean environments** – no package conflicts or pollution 360 | 5. **Resource limits** – container-level constraints keep things in check 361 | 362 | ## Ready to get started? 363 | 364 | The `@sandboxed` decorator is one of those features that sounds simple but opens up a world of possibilities. Whether you're testing sketchy code, building AI agents, or just want to keep your development environment pristine, it's got you covered. 365 | 366 | Give it a try in your next Cua project and see how liberating it feels to run code without fear! 367 | 368 | Happy coding (safely)! 369 | 370 | --- 371 | 372 | *Want to dive deeper? 
Check out our [sandboxed functions examples](https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py) and [virtual environment tests](https://github.com/trycua/cua/blob/main/tests/venv.py) on GitHub. Questions? Come chat with us on Discord!* 373 | ``` -------------------------------------------------------------------------------- /libs/lume/src/FileSystem/Settings.swift: -------------------------------------------------------------------------------- ```swift 1 | import Foundation 2 | 3 | /// Manages the application settings using a config file 4 | struct LumeSettings: Codable, Sendable { 5 | var vmLocations: [VMLocation] 6 | var defaultLocationName: String 7 | var cacheDirectory: String 8 | var cachingEnabled: Bool 9 | 10 | var defaultLocation: VMLocation? { 11 | vmLocations.first { $0.name == defaultLocationName } 12 | } 13 | 14 | // For backward compatibility 15 | var homeDirectory: String { 16 | defaultLocation?.path ?? "~/.lume" 17 | } 18 | 19 | static let defaultSettings = LumeSettings( 20 | vmLocations: [ 21 | VMLocation(name: "default", path: "~/.lume") 22 | ], 23 | defaultLocationName: "default", 24 | cacheDirectory: "~/.lume/cache", 25 | cachingEnabled: true 26 | ) 27 | 28 | /// Gets all locations sorted by name 29 | var sortedLocations: [VMLocation] { 30 | vmLocations.sorted { $0.name < $1.name } 31 | } 32 | } 33 | 34 | final class SettingsManager: @unchecked Sendable { 35 | // MARK: - Constants 36 | 37 | private enum Constants { 38 | // Default path for config 39 | static let fallbackConfigDir = "~/.config/lume" 40 | static let configFileName = "config.yaml" 41 | } 42 | 43 | // MARK: - Properties 44 | 45 | static let shared = SettingsManager() 46 | private let fileManager: FileManager 47 | 48 | // Get the config directory following XDG spec 49 | private var configDir: String { 50 | // Check XDG_CONFIG_HOME environment variable first 51 | if let xdgConfigHome = ProcessInfo.processInfo.environment["XDG_CONFIG_HOME"] { 52 | return "\(xdgConfigHome)/lume" 53 | } 54 | // Fall back to default 55 | return (Constants.fallbackConfigDir as NSString).expandingTildeInPath 56 | } 57 | 58 | // Path to config file 59 | private var configFilePath: String { 60 | return "\(configDir)/\(Constants.configFileName)" 61 | } 62 | 63 | // MARK: - Initialization 64 | 65 | init(fileManager: FileManager = .default) { 66 | self.fileManager = fileManager 67 | ensureConfigDirectoryExists() 68 | } 69 | 70 | // MARK: - Settings Access 71 | 72 | func getSettings() -> LumeSettings { 73 | if let settings = readSettingsFromFile() { 74 | return settings 75 | } 76 | 77 | // No settings file found, use defaults 78 | let defaultSettings = LumeSettings( 79 | vmLocations: [ 80 | VMLocation(name: "default", path: "~/.lume") 81 | ], 82 | defaultLocationName: "default", 83 | cacheDirectory: "~/.lume/cache", 84 | cachingEnabled: true 85 | ) 86 | 87 | // Try to save default settings 88 | try? 
saveSettings(defaultSettings) 89 | 90 | return defaultSettings 91 | } 92 | 93 | func saveSettings(_ settings: LumeSettings) throws { 94 | try fileManager.createDirectory(atPath: configDir, withIntermediateDirectories: true) 95 | 96 | // Create a human-readable YAML-like configuration file 97 | var yamlContent = "# Lume Configuration\n\n" 98 | 99 | // Default location 100 | yamlContent += "defaultLocationName: \"\(settings.defaultLocationName)\"\n" 101 | 102 | // Cache directory 103 | yamlContent += "cacheDirectory: \"\(settings.cacheDirectory)\"\n" 104 | 105 | // Caching enabled flag 106 | yamlContent += "cachingEnabled: \(settings.cachingEnabled)\n" 107 | 108 | // VM locations 109 | yamlContent += "\n# VM Locations\nvmLocations:\n" 110 | for location in settings.vmLocations { 111 | yamlContent += " - name: \"\(location.name)\"\n" 112 | yamlContent += " path: \"\(location.path)\"\n" 113 | } 114 | 115 | // Write YAML content to file 116 | try yamlContent.write( 117 | to: URL(fileURLWithPath: configFilePath), atomically: true, encoding: .utf8) 118 | } 119 | 120 | // MARK: - VM Location Management 121 | 122 | func addLocation(_ location: VMLocation) throws { 123 | var settings = getSettings() 124 | 125 | // Validate location name (alphanumeric, dash, underscore) 126 | let nameRegex = try NSRegularExpression(pattern: "^[a-zA-Z0-9_-]+$") 127 | let nameRange = NSRange(location.name.startIndex..., in: location.name) 128 | if nameRegex.firstMatch(in: location.name, range: nameRange) == nil { 129 | throw VMLocationError.invalidLocationName(name: location.name) 130 | } 131 | 132 | // Check for duplicate name 133 | if settings.vmLocations.contains(where: { $0.name == location.name }) { 134 | throw VMLocationError.duplicateLocationName(name: location.name) 135 | } 136 | 137 | // Validate location path 138 | try location.validate() 139 | 140 | // Add location 141 | settings.vmLocations.append(location) 142 | try saveSettings(settings) 143 | } 144 | 145 | func removeLocation(name: String) throws { 146 | var settings = getSettings() 147 | 148 | // Check location exists 149 | guard settings.vmLocations.contains(where: { $0.name == name }) else { 150 | throw VMLocationError.locationNotFound(name: name) 151 | } 152 | 153 | // Prevent removing default location 154 | if name == settings.defaultLocationName { 155 | throw VMLocationError.defaultLocationCannotBeRemoved(name: name) 156 | } 157 | 158 | // Remove location 159 | settings.vmLocations.removeAll(where: { $0.name == name }) 160 | try saveSettings(settings) 161 | } 162 | 163 | func setDefaultLocation(name: String) throws { 164 | var settings = getSettings() 165 | 166 | // Check location exists 167 | guard settings.vmLocations.contains(where: { $0.name == name }) else { 168 | throw VMLocationError.locationNotFound(name: name) 169 | } 170 | 171 | // Set default 172 | settings.defaultLocationName = name 173 | try saveSettings(settings) 174 | } 175 | 176 | func getLocation(name: String) throws -> VMLocation { 177 | let settings = getSettings() 178 | 179 | if let location = settings.vmLocations.first(where: { $0.name == name }) { 180 | return location 181 | } 182 | 183 | throw VMLocationError.locationNotFound(name: name) 184 | } 185 | 186 | // MARK: - Legacy Home Directory Compatibility 187 | 188 | func setHomeDirectory(path: String) throws { 189 | var settings = getSettings() 190 | 191 | let defaultLocation = VMLocation(name: "default", path: path) 192 | try defaultLocation.validate() 193 | 194 | // Replace default location 195 | if let index = 
settings.vmLocations.firstIndex(where: { $0.name == "default" }) { 196 | settings.vmLocations[index] = defaultLocation 197 | } else { 198 | settings.vmLocations.append(defaultLocation) 199 | settings.defaultLocationName = "default" 200 | } 201 | 202 | try saveSettings(settings) 203 | } 204 | 205 | // MARK: - Cache Directory Management 206 | 207 | func setCacheDirectory(path: String) throws { 208 | var settings = getSettings() 209 | 210 | // Validate path 211 | let expandedPath = (path as NSString).expandingTildeInPath 212 | var isDir: ObjCBool = false 213 | 214 | // If directory exists, check if it's writable 215 | if fileManager.fileExists(atPath: expandedPath, isDirectory: &isDir) { 216 | if !isDir.boolValue { 217 | throw SettingsError.notADirectory(path: expandedPath) 218 | } 219 | 220 | if !fileManager.isWritableFile(atPath: expandedPath) { 221 | throw SettingsError.directoryNotWritable(path: expandedPath) 222 | } 223 | } else { 224 | // Try to create the directory 225 | do { 226 | try fileManager.createDirectory( 227 | atPath: expandedPath, 228 | withIntermediateDirectories: true 229 | ) 230 | } catch { 231 | throw SettingsError.directoryCreationFailed(path: expandedPath, error: error) 232 | } 233 | } 234 | 235 | // Update settings 236 | settings.cacheDirectory = path 237 | try saveSettings(settings) 238 | } 239 | 240 | func getCacheDirectory() -> String { 241 | return getSettings().cacheDirectory 242 | } 243 | 244 | func setCachingEnabled(_ enabled: Bool) throws { 245 | var settings = getSettings() 246 | settings.cachingEnabled = enabled 247 | try saveSettings(settings) 248 | } 249 | 250 | func isCachingEnabled() -> Bool { 251 | return getSettings().cachingEnabled 252 | } 253 | 254 | // MARK: - Private Helpers 255 | 256 | private func ensureConfigDirectoryExists() { 257 | try? fileManager.createDirectory(atPath: configDir, withIntermediateDirectories: true) 258 | } 259 | 260 | private func readSettingsFromFile() -> LumeSettings? { 261 | // Read from YAML file 262 | if fileExists(at: configFilePath) { 263 | do { 264 | let yamlString = try String( 265 | contentsOf: URL(fileURLWithPath: configFilePath), encoding: .utf8) 266 | return parseYamlSettings(yamlString) 267 | } catch { 268 | Logger.error( 269 | "Failed to read settings from YAML file", 270 | metadata: ["error": error.localizedDescription] 271 | ) 272 | } 273 | } 274 | return nil 275 | } 276 | 277 | private func parseYamlSettings(_ yamlString: String) -> LumeSettings? { 278 | // This is a very basic YAML parser for our specific config format 279 | // A real implementation would use a proper YAML library 280 | 281 | var defaultLocationName = "default" 282 | var cacheDirectory = "~/.lume/cache" 283 | var cachingEnabled = true // default to true for backward compatibility 284 | var vmLocations: [VMLocation] = [] 285 | 286 | var inLocationsSection = false 287 | var currentLocation: (name: String?, path: String?) 
= (nil, nil) 288 | 289 | let lines = yamlString.split(separator: "\n") 290 | 291 | for (_, line) in lines.enumerated() { 292 | let trimmedLine = line.trimmingCharacters(in: .whitespaces) 293 | 294 | // Skip comments and empty lines 295 | if trimmedLine.hasPrefix("#") || trimmedLine.isEmpty { 296 | continue 297 | } 298 | 299 | // Check for section marker 300 | if trimmedLine == "vmLocations:" { 301 | inLocationsSection = true 302 | continue 303 | } 304 | 305 | // In the locations section, handle line indentation more carefully 306 | if inLocationsSection { 307 | if trimmedLine.hasPrefix("-") || trimmedLine.contains("- name:") { 308 | // Process the previous location before starting a new one 309 | if let name = currentLocation.name, let path = currentLocation.path { 310 | vmLocations.append(VMLocation(name: name, path: path)) 311 | } 312 | currentLocation = (nil, nil) 313 | } 314 | 315 | // Process the key-value pairs within a location 316 | if let colonIndex = trimmedLine.firstIndex(of: ":") { 317 | let key = trimmedLine[..<colonIndex].trimmingCharacters(in: .whitespaces) 318 | let rawValue = trimmedLine[trimmedLine.index(after: colonIndex)...] 319 | .trimmingCharacters(in: .whitespaces) 320 | let value = extractValueFromYaml(rawValue) 321 | 322 | if key.hasSuffix("name") { 323 | currentLocation.name = value 324 | } else if key.hasSuffix("path") { 325 | currentLocation.path = value 326 | } 327 | } 328 | } else { 329 | // Process top-level keys outside the locations section 330 | if let colonIndex = trimmedLine.firstIndex(of: ":") { 331 | let key = trimmedLine[..<colonIndex].trimmingCharacters(in: .whitespaces) 332 | let rawValue = trimmedLine[trimmedLine.index(after: colonIndex)...] 333 | .trimmingCharacters(in: .whitespaces) 334 | let value = extractValueFromYaml(rawValue) 335 | 336 | if key == "defaultLocationName" { 337 | defaultLocationName = value 338 | } else if key == "cacheDirectory" { 339 | cacheDirectory = value 340 | } else if key == "cachingEnabled" { 341 | cachingEnabled = value.lowercased() == "true" 342 | } 343 | } 344 | } 345 | } 346 | 347 | // Don't forget to add the last location 348 | if let name = currentLocation.name, let path = currentLocation.path { 349 | vmLocations.append(VMLocation(name: name, path: path)) 350 | } 351 | 352 | // Ensure at least one location exists 353 | if vmLocations.isEmpty { 354 | vmLocations.append(VMLocation(name: "default", path: "~/.lume")) 355 | } 356 | 357 | return LumeSettings( 358 | vmLocations: vmLocations, 359 | defaultLocationName: defaultLocationName, 360 | cacheDirectory: cacheDirectory, 361 | cachingEnabled: cachingEnabled 362 | ) 363 | } 364 | 365 | // Helper method to extract a value from YAML, handling quotes 366 | private func extractValueFromYaml(_ rawValue: String) -> String { 367 | if rawValue.hasPrefix("\"") && rawValue.hasSuffix("\"") && rawValue.count >= 2 { 368 | // Remove the surrounding quotes 369 | let startIndex = rawValue.index(after: rawValue.startIndex) 370 | let endIndex = rawValue.index(before: rawValue.endIndex) 371 | return String(rawValue[startIndex..<endIndex]) 372 | } 373 | return rawValue 374 | } 375 | 376 | // Helper method to output debug information about the current settings 377 | func debugSettings() -> String { 378 | let settings = getSettings() 379 | 380 | var output = "Current Settings:\n" 381 | output += "- Default VM storage: \(settings.defaultLocationName)\n" 382 | output += "- Cache directory: \(settings.cacheDirectory)\n" 383 | output += "- VM Locations (\(settings.vmLocations.count)):\n" 384 
| 385 | for (i, location) in settings.vmLocations.enumerated() { 386 | let isDefault = location.name == settings.defaultLocationName 387 | let defaultMark = isDefault ? " (default)" : "" 388 | output += " \(i+1). \(location.name): \(location.path)\(defaultMark)\n" 389 | } 390 | 391 | // Also add raw file content 392 | if fileExists(at: configFilePath) { 393 | if let content = try? String(contentsOf: URL(fileURLWithPath: configFilePath)) { 394 | output += "\nRaw YAML file content:\n" 395 | output += content 396 | } 397 | } 398 | 399 | return output 400 | } 401 | 402 | private func fileExists(at path: String) -> Bool { 403 | fileManager.fileExists(atPath: path) 404 | } 405 | } 406 | 407 | // MARK: - Errors 408 | 409 | enum SettingsError: Error, LocalizedError { 410 | case notADirectory(path: String) 411 | case directoryNotWritable(path: String) 412 | case directoryCreationFailed(path: String, error: Error) 413 | 414 | var errorDescription: String? { 415 | switch self { 416 | case .notADirectory(let path): 417 | return "Path is not a directory: \(path)" 418 | case .directoryNotWritable(let path): 419 | return "Directory is not writable: \(path)" 420 | case .directoryCreationFailed(let path, let error): 421 | return "Failed to create directory at \(path): \(error.localizedDescription)" 422 | } 423 | } 424 | } 425 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/mlxvlm_adapter.py: -------------------------------------------------------------------------------- ```python 1 | import asyncio 2 | import functools 3 | import warnings 4 | import io 5 | import base64 6 | import math 7 | import re 8 | from concurrent.futures import ThreadPoolExecutor 9 | from typing import Iterator, AsyncIterator, Dict, List, Any, Optional, Tuple, cast 10 | from PIL import Image 11 | from litellm.types.utils import GenericStreamingChunk, ModelResponse 12 | from litellm.llms.custom_llm import CustomLLM 13 | from litellm import completion, acompletion 14 | 15 | # Try to import MLX dependencies 16 | try: 17 | import mlx.core as mx 18 | from mlx_vlm import load, generate 19 | from mlx_vlm.prompt_utils import apply_chat_template 20 | from mlx_vlm.utils import load_config 21 | from transformers.tokenization_utils import PreTrainedTokenizer 22 | MLX_AVAILABLE = True 23 | except ImportError: 24 | MLX_AVAILABLE = False 25 | 26 | # Constants for smart_resize 27 | IMAGE_FACTOR = 28 28 | MIN_PIXELS = 100 * 28 * 28 29 | MAX_PIXELS = 16384 * 28 * 28 30 | MAX_RATIO = 200 31 | 32 | def round_by_factor(number: float, factor: int) -> int: 33 | """Returns the closest integer to 'number' that is divisible by 'factor'.""" 34 | return round(number / factor) * factor 35 | 36 | def ceil_by_factor(number: float, factor: int) -> int: 37 | """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" 38 | return math.ceil(number / factor) * factor 39 | 40 | def floor_by_factor(number: float, factor: int) -> int: 41 | """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" 42 | return math.floor(number / factor) * factor 43 | 44 | def smart_resize( 45 | height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS 46 | ) -> tuple[int, int]: 47 | """ 48 | Rescales the image so that the following conditions are met: 49 | 50 | 1. Both dimensions (height and width) are divisible by 'factor'. 51 | 2. 
The total number of pixels is within the range ['min_pixels', 'max_pixels']. 52 | 3. The aspect ratio of the image is maintained as closely as possible. 53 | """ 54 | if max(height, width) / min(height, width) > MAX_RATIO: 55 | raise ValueError( 56 | f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" 57 | ) 58 | h_bar = max(factor, round_by_factor(height, factor)) 59 | w_bar = max(factor, round_by_factor(width, factor)) 60 | if h_bar * w_bar > max_pixels: 61 | beta = math.sqrt((height * width) / max_pixels) 62 | h_bar = floor_by_factor(height / beta, factor) 63 | w_bar = floor_by_factor(width / beta, factor) 64 | elif h_bar * w_bar < min_pixels: 65 | beta = math.sqrt(min_pixels / (height * width)) 66 | h_bar = ceil_by_factor(height * beta, factor) 67 | w_bar = ceil_by_factor(width * beta, factor) 68 | return h_bar, w_bar 69 | 70 | 71 | class MLXVLMAdapter(CustomLLM): 72 | """MLX VLM Adapter for running vision-language models locally using MLX.""" 73 | 74 | def __init__(self, **kwargs): 75 | """Initialize the adapter. 76 | 77 | Args: 78 | **kwargs: Additional arguments 79 | """ 80 | super().__init__() 81 | 82 | self.models = {} # Cache for loaded models 83 | self.processors = {} # Cache for loaded processors 84 | self.configs = {} # Cache for loaded configs 85 | self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool 86 | 87 | def _load_model_and_processor(self, model_name: str): 88 | """Load model and processor if not already cached. 89 | 90 | Args: 91 | model_name: Name of the model to load 92 | 93 | Returns: 94 | Tuple of (model, processor, config) 95 | """ 96 | if not MLX_AVAILABLE: 97 | raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.") 98 | 99 | if model_name not in self.models: 100 | # Load model and processor 101 | model_obj, processor = load( 102 | model_name, 103 | processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS} 104 | ) 105 | config = load_config(model_name) 106 | 107 | # Cache them 108 | self.models[model_name] = model_obj 109 | self.processors[model_name] = processor 110 | self.configs[model_name] = config 111 | 112 | return self.models[model_name], self.processors[model_name], self.configs[model_name] 113 | 114 | def _process_coordinates(self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str: 115 | """Process coordinates in box tokens based on image resizing using smart_resize approach. 
116 | 117 | Args: 118 | text: Text containing box tokens 119 | original_size: Original image size (width, height) 120 | model_size: Model processed image size (width, height) 121 | 122 | Returns: 123 | Text with processed coordinates 124 | """ 125 | # Find all box tokens 126 | box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>" 127 | 128 | def process_coords(match): 129 | model_x, model_y = int(match.group(1)), int(match.group(2)) 130 | # Scale coordinates from model space to original image space 131 | # Both original_size and model_size are in (width, height) format 132 | new_x = int(model_x * original_size[0] / model_size[0]) # Width 133 | new_y = int(model_y * original_size[1] / model_size[1]) # Height 134 | return f"<|box_start|>({new_x},{new_y})<|box_end|>" 135 | 136 | return re.sub(box_pattern, process_coords, text) 137 | 138 | def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Image.Image], Dict[int, Tuple[int, int]], Dict[int, Tuple[int, int]]]: 139 | """Convert OpenAI format messages to MLX VLM format and extract images. 140 | 141 | Args: 142 | messages: Messages in OpenAI format 143 | 144 | Returns: 145 | Tuple of (processed_messages, images, original_sizes, model_sizes) 146 | """ 147 | processed_messages = [] 148 | images = [] 149 | original_sizes = {} # Track original sizes of images for coordinate mapping 150 | model_sizes = {} # Track model processed sizes 151 | image_index = 0 152 | 153 | for message in messages: 154 | processed_message = { 155 | "role": message["role"], 156 | "content": [] 157 | } 158 | 159 | content = message.get("content", []) 160 | if isinstance(content, str): 161 | # Simple text content 162 | processed_message["content"] = content 163 | elif isinstance(content, list): 164 | # Multi-modal content 165 | processed_content = [] 166 | for item in content: 167 | if item.get("type") == "text": 168 | processed_content.append({ 169 | "type": "text", 170 | "text": item.get("text", "") 171 | }) 172 | elif item.get("type") == "image_url": 173 | image_url = item.get("image_url", {}).get("url", "") 174 | pil_image = None 175 | 176 | if image_url.startswith("data:image/"): 177 | # Extract base64 data 178 | base64_data = image_url.split(',')[1] 179 | # Convert base64 to PIL Image 180 | image_data = base64.b64decode(base64_data) 181 | pil_image = Image.open(io.BytesIO(image_data)) 182 | else: 183 | # Handle file path or URL 184 | pil_image = Image.open(image_url) 185 | 186 | # Store original image size for coordinate mapping 187 | original_size = pil_image.size 188 | original_sizes[image_index] = original_size 189 | 190 | # Use smart_resize to determine model size 191 | # Note: smart_resize expects (height, width) but PIL gives (width, height) 192 | height, width = original_size[1], original_size[0] 193 | new_height, new_width = smart_resize(height, width) 194 | # Store model size in (width, height) format for consistent coordinate processing 195 | model_sizes[image_index] = (new_width, new_height) 196 | 197 | # Resize the image using the calculated dimensions from smart_resize 198 | resized_image = pil_image.resize((new_width, new_height)) 199 | images.append(resized_image) 200 | 201 | # Add image placeholder to content 202 | processed_content.append({ 203 | "type": "image" 204 | }) 205 | 206 | image_index += 1 207 | 208 | processed_message["content"] = processed_content 209 | 210 | processed_messages.append(processed_message) 211 | 212 | return processed_messages, images, original_sizes, model_sizes 213 | 214 | 
def _generate(self, **kwargs) -> str: 215 | """Generate response using the local MLX VLM model. 216 | 217 | Args: 218 | **kwargs: Keyword arguments containing messages and model info 219 | 220 | Returns: 221 | Generated text response 222 | """ 223 | messages = kwargs.get('messages', []) 224 | model_name = kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit') 225 | max_tokens = kwargs.get('max_tokens', 128) 226 | 227 | # Warn about ignored kwargs 228 | ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'} 229 | if ignored_kwargs: 230 | warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}") 231 | 232 | # Load model and processor 233 | model, processor, config = self._load_model_and_processor(model_name) 234 | 235 | # Convert messages and extract images 236 | processed_messages, images, original_sizes, model_sizes = self._convert_messages(messages) 237 | 238 | # Process user text input with box coordinates after image processing 239 | # Swap original_size and model_size arguments for inverse transformation 240 | for msg_idx, msg in enumerate(processed_messages): 241 | if msg.get("role") == "user" and isinstance(msg.get("content"), str): 242 | content = msg.get("content", "") 243 | if "<|box_start|>" in content and original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes: 244 | orig_size = original_sizes[0] 245 | model_size = model_sizes[0] 246 | # Swap arguments to perform inverse transformation for user input 247 | processed_messages[msg_idx]["content"] = self._process_coordinates(content, model_size, orig_size) 248 | 249 | try: 250 | # Format prompt according to model requirements using the processor directly 251 | prompt = processor.apply_chat_template( 252 | processed_messages, 253 | tokenize=False, 254 | add_generation_prompt=True, 255 | return_tensors='pt' 256 | ) 257 | tokenizer = cast(PreTrainedTokenizer, processor) 258 | 259 | # Generate response 260 | text_content, usage = generate( 261 | model, 262 | tokenizer, 263 | str(prompt), 264 | images, # type: ignore 265 | verbose=False, 266 | max_tokens=max_tokens 267 | ) 268 | 269 | except Exception as e: 270 | raise RuntimeError(f"Error generating response: {str(e)}") from e 271 | 272 | # Process coordinates in the response back to original image space 273 | if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes: 274 | # Get original image size and model size (using the first image) 275 | orig_size = original_sizes[0] 276 | model_size = model_sizes[0] 277 | 278 | # Check if output contains box tokens that need processing 279 | if "<|box_start|>" in text_content: 280 | # Process coordinates from model space back to original image space 281 | text_content = self._process_coordinates(text_content, orig_size, model_size) 282 | 283 | return text_content 284 | 285 | def completion(self, *args, **kwargs) -> ModelResponse: 286 | """Synchronous completion method. 287 | 288 | Returns: 289 | ModelResponse with generated text 290 | """ 291 | generated_text = self._generate(**kwargs) 292 | 293 | result = completion( 294 | model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}", 295 | mock_response=generated_text, 296 | ) 297 | return cast(ModelResponse, result) 298 | 299 | async def acompletion(self, *args, **kwargs) -> ModelResponse: 300 | """Asynchronous completion method. 
301 | 302 | Returns: 303 | ModelResponse with generated text 304 | """ 305 | # Run _generate in thread pool to avoid blocking 306 | loop = asyncio.get_event_loop() 307 | generated_text = await loop.run_in_executor( 308 | self._executor, 309 | functools.partial(self._generate, **kwargs) 310 | ) 311 | 312 | result = await acompletion( 313 | model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}", 314 | mock_response=generated_text, 315 | ) 316 | return cast(ModelResponse, result) 317 | 318 | def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: 319 | """Synchronous streaming method. 320 | 321 | Returns: 322 | Iterator of GenericStreamingChunk 323 | """ 324 | generated_text = self._generate(**kwargs) 325 | 326 | generic_streaming_chunk: GenericStreamingChunk = { 327 | "finish_reason": "stop", 328 | "index": 0, 329 | "is_finished": True, 330 | "text": generated_text, 331 | "tool_use": None, 332 | "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0}, 333 | } 334 | 335 | yield generic_streaming_chunk 336 | 337 | async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: 338 | """Asynchronous streaming method. 339 | 340 | Returns: 341 | AsyncIterator of GenericStreamingChunk 342 | """ 343 | # Run _generate in thread pool to avoid blocking 344 | loop = asyncio.get_event_loop() 345 | generated_text = await loop.run_in_executor( 346 | self._executor, 347 | functools.partial(self._generate, **kwargs) 348 | ) 349 | 350 | generic_streaming_chunk: GenericStreamingChunk = { 351 | "finish_reason": "stop", 352 | "index": 0, 353 | "is_finished": True, 354 | "text": generated_text, 355 | "tool_use": None, 356 | "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0}, 357 | } 358 | 359 | yield generic_streaming_chunk ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/loops/omniparser.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | OmniParser (set-of-marks) agent loop implementation using liteLLM 3 | Paper: https://arxiv.org/abs/2408.00203 4 | Code: https://github.com/microsoft/OmniParser 5 | """ 6 | 7 | import asyncio 8 | import json 9 | from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple 10 | import litellm 11 | import inspect 12 | import base64 13 | 14 | from ..decorators import register_agent 15 | from ..types import Messages, AgentResponse, Tools, AgentCapability 16 | from ..loops.base import AsyncAgentConfig 17 | 18 | SOM_TOOL_SCHEMA = { 19 | "type": "function", 20 | "name": "computer", 21 | "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image.
Use the element's ID number to interact with any element instead of pixel coordinates.", 22 | "parameters": { 23 | "type": "object", 24 | "properties": { 25 | "action": { 26 | "type": "string", 27 | "enum": [ 28 | "screenshot", 29 | "click", 30 | "double_click", 31 | "drag", 32 | "type", 33 | "keypress", 34 | "scroll", 35 | "move", 36 | "wait", 37 | "get_current_url", 38 | "get_dimensions", 39 | "get_environment" 40 | ], 41 | "description": "The action to perform" 42 | }, 43 | "element_id": { 44 | "type": "integer", 45 | "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)" 46 | }, 47 | "start_element_id": { 48 | "type": "integer", 49 | "description": "The ID of the element to start dragging from (required for drag action)" 50 | }, 51 | "end_element_id": { 52 | "type": "integer", 53 | "description": "The ID of the element to drag to (required for drag action)" 54 | }, 55 | "text": { 56 | "type": "string", 57 | "description": "The text to type (required for type action)" 58 | }, 59 | "keys": { 60 | "type": "string", 61 | "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')" 62 | }, 63 | "button": { 64 | "type": "string", 65 | "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left", 66 | }, 67 | "scroll_x": { 68 | "type": "integer", 69 | "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)", 70 | }, 71 | "scroll_y": { 72 | "type": "integer", 73 | "description": "Vertical scroll amount for scroll action (positive for down, negative for up)", 74 | }, 75 | }, 76 | "required": [ 77 | "action" 78 | ] 79 | } 80 | } 81 | 82 | OMNIPARSER_AVAILABLE = False 83 | try: 84 | from som import OmniParser 85 | OMNIPARSER_AVAILABLE = True 86 | except ImportError: 87 | pass 88 | OMNIPARSER_SINGLETON = None 89 | 90 | def get_parser(): 91 | global OMNIPARSER_SINGLETON 92 | if OMNIPARSER_SINGLETON is None: 93 | OMNIPARSER_SINGLETON = OmniParser() 94 | return OMNIPARSER_SINGLETON 95 | 96 | def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]: 97 | """Get the last computer_call_output message from a messages list. 
98 | 99 | Args: 100 | messages: List of messages to search through 101 | 102 | Returns: 103 | The last computer_call_output message dict, or None if not found 104 | """ 105 | for message in reversed(messages): 106 | if isinstance(message, dict) and message.get("type") == "computer_call_output": 107 | return message 108 | return None 109 | 110 | def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]: 111 | """Prepare tools for OpenAI API format""" 112 | omniparser_tools = [] 113 | id2xy = dict() 114 | 115 | for schema in tool_schemas: 116 | if schema["type"] == "computer": 117 | omniparser_tools.append(SOM_TOOL_SCHEMA) 118 | if "id2xy" in schema: 119 | id2xy = schema["id2xy"] 120 | else: 121 | schema["id2xy"] = id2xy 122 | elif schema["type"] == "function": 123 | # Function tools use OpenAI-compatible schema directly (liteLLM expects this format) 124 | # Schema should be: {type, name, description, parameters} 125 | omniparser_tools.append({ "type": "function", **schema["function"] }) 126 | 127 | return omniparser_tools, id2xy 128 | 129 | async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]): 130 | item_type = item.get("type") 131 | 132 | def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]: 133 | if element_id is None: 134 | return (None, None) 135 | return id2xy.get(element_id, (None, None)) 136 | 137 | if item_type == "function_call": 138 | fn_name = item.get("name") 139 | fn_args = json.loads(item.get("arguments", "{}")) 140 | 141 | item_id = item.get("id") 142 | call_id = item.get("call_id") 143 | 144 | if fn_name == "computer": 145 | action = fn_args.get("action") 146 | element_id = fn_args.get("element_id") 147 | start_element_id = fn_args.get("start_element_id") 148 | end_element_id = fn_args.get("end_element_id") 149 | text = fn_args.get("text") 150 | keys = fn_args.get("keys") 151 | button = fn_args.get("button") 152 | scroll_x = fn_args.get("scroll_x") 153 | scroll_y = fn_args.get("scroll_y") 154 | 155 | x, y = _get_xy(element_id) 156 | start_x, start_y = _get_xy(start_element_id) 157 | end_x, end_y = _get_xy(end_element_id) 158 | 159 | action_args = { 160 | "type": action, 161 | "x": x, 162 | "y": y, 163 | "start_x": start_x, 164 | "start_y": start_y, 165 | "end_x": end_x, 166 | "end_y": end_y, 167 | "text": text, 168 | "keys": keys, 169 | "button": button, 170 | "scroll_x": scroll_x, 171 | "scroll_y": scroll_y 172 | } 173 | # Remove None values to keep the JSON clean 174 | action_args = {k: v for k, v in action_args.items() if v is not None} 175 | 176 | return [{ 177 | "type": "computer_call", 178 | "action": action_args, 179 | "id": item_id, 180 | "call_id": call_id, 181 | "status": "completed" 182 | }] 183 | 184 | return [item] 185 | 186 | async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]): 187 | """ 188 | Convert computer_call back to function_call format. 189 | Also handles computer_call_output -> function_call_output conversion. 
190 | 191 | Args: 192 | item: The item to convert 193 | xy2id: Mapping from (x, y) coordinates to element IDs 194 | """ 195 | item_type = item.get("type") 196 | 197 | def _get_element_id(x: Optional[float], y: Optional[float]) -> Optional[int]: 198 | """Get element ID from coordinates, return None if coordinates are None""" 199 | if x is None or y is None: 200 | return None 201 | return xy2id.get((x, y)) 202 | 203 | if item_type == "computer_call": 204 | action_data = item.get("action", {}) 205 | 206 | # Extract coordinates and convert back to element IDs 207 | element_id = _get_element_id(action_data.get("x"), action_data.get("y")) 208 | start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y")) 209 | end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y")) 210 | 211 | # Build function arguments 212 | fn_args = { 213 | "action": action_data.get("type"), 214 | "element_id": element_id, 215 | "start_element_id": start_element_id, 216 | "end_element_id": end_element_id, 217 | "text": action_data.get("text"), 218 | "keys": action_data.get("keys"), 219 | "button": action_data.get("button"), 220 | "scroll_x": action_data.get("scroll_x"), 221 | "scroll_y": action_data.get("scroll_y") 222 | } 223 | 224 | # Remove None values to keep the JSON clean 225 | fn_args = {k: v for k, v in fn_args.items() if v is not None} 226 | 227 | return [{ 228 | "type": "function_call", 229 | "name": "computer", 230 | "arguments": json.dumps(fn_args), 231 | "id": item.get("id"), 232 | "call_id": item.get("call_id"), 233 | "status": "completed", 234 | 235 | # Fall back to string representation 236 | "content": f"Used tool: {action_data.get('type')}({json.dumps(fn_args)})" 237 | }] 238 | 239 | elif item_type == "computer_call_output": 240 | # Simple conversion: computer_call_output -> function_call_output 241 | return [{ 242 | "type": "function_call_output", 243 | "call_id": item.get("call_id"), 244 | "content": [item.get("output")], 245 | "id": item.get("id"), 246 | "status": "completed" 247 | }] 248 | 249 | return [item] 250 | 251 | 252 | @register_agent(models=r"omniparser\+.*|omni\+.*", priority=2) 253 | class OmniparserConfig(AsyncAgentConfig): 254 | """Omniparser agent configuration implementing AsyncAgentConfig protocol.""" 255 | 256 | async def predict_step( 257 | self, 258 | messages: List[Dict[str, Any]], 259 | model: str, 260 | tools: Optional[List[Dict[str, Any]]] = None, 261 | max_retries: Optional[int] = None, 262 | stream: bool = False, 263 | computer_handler=None, 264 | use_prompt_caching: Optional[bool] = False, 265 | _on_api_start=None, 266 | _on_api_end=None, 267 | _on_usage=None, 268 | _on_screenshot=None, 269 | **kwargs 270 | ) -> Dict[str, Any]: 271 | """ 272 | OmniParser set-of-marks agent loop using liteLLM responses. 273 | 274 | Annotates screenshots with numbered element IDs via OmniParser so the composed liteLLM vision model can ground its actions by element ID. 275 | """ 276 | if not OMNIPARSER_AVAILABLE: 277 | raise ValueError("omniparser loop requires som to be installed.
Install it with `pip install cua-som`.") 278 | 279 | tools = tools or [] 280 | 281 | llm_model = model.split('+')[-1] 282 | 283 | # Prepare tools for OpenAI API 284 | openai_tools, id2xy = _prepare_tools_for_omniparser(tools) 285 | 286 | # Find last computer_call_output 287 | last_computer_call_output = get_last_computer_call_output(messages) # type: ignore 288 | if last_computer_call_output: 289 | image_url = last_computer_call_output.get("output", {}).get("image_url", "") 290 | image_data = image_url.split(",")[-1] 291 | if image_data: 292 | parser = get_parser() 293 | result = parser.parse(image_data) 294 | if _on_screenshot: 295 | await _on_screenshot(result.annotated_image_base64, "annotated_image") 296 | for element in result.elements: 297 | id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2) 298 | 299 | # handle computer calls -> function calls (invert id2xy: the helper expects an (x, y) -> id mapping) 300 | new_messages = [] 301 | for message in messages: 302 | if not isinstance(message, dict): 303 | message = message.__dict__ 304 | new_messages += await replace_computer_call_with_function(message, {xy: i for i, xy in id2xy.items()}) # type: ignore 305 | messages = new_messages 306 | 307 | # Prepare API call kwargs 308 | api_kwargs = { 309 | "model": llm_model, 310 | "input": messages, 311 | "tools": openai_tools if openai_tools else None, 312 | "stream": stream, 313 | "truncation": "auto", 314 | "num_retries": max_retries, 315 | **kwargs 316 | } 317 | 318 | # Call API start hook 319 | if _on_api_start: 320 | await _on_api_start(api_kwargs) 321 | 322 | print(str(api_kwargs)[:1000]) 323 | 324 | # Use liteLLM responses 325 | response = await litellm.aresponses(**api_kwargs) 326 | 327 | # Call API end hook 328 | if _on_api_end: 329 | await _on_api_end(api_kwargs, response) 330 | 331 | # Extract usage information 332 | usage = { 333 | **response.usage.model_dump(), # type: ignore 334 | "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore 335 | } 336 | if _on_usage: 337 | await _on_usage(usage) 338 | 339 | # handle som function calls -> xy computer calls 340 | new_output = [] 341 | for i in range(len(response.output)): # type: ignore 342 | new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore 343 | 344 | return { 345 | "output": new_output, 346 | "usage": usage 347 | } 348 | 349 | async def predict_click( 350 | self, 351 | model: str, 352 | image_b64: str, 353 | instruction: str, 354 | **kwargs 355 | ) -> Optional[Tuple[float, float]]: 356 | """ 357 | Predict click coordinates using OmniParser and LLM. 358 | 359 | Uses OmniParser to annotate the image with element IDs, then uses LLM 360 | to identify the correct element ID based on the instruction. 361 | """ 362 | if not OMNIPARSER_AVAILABLE: 363 | return None 364 | 365 | # Parse the image with OmniParser to get annotated image and elements 366 | parser = get_parser() 367 | result = parser.parse(image_b64) 368 | 369 | # Extract the LLM model from composed model string 370 | llm_model = model.split('+')[-1] 371 | 372 | # Create system prompt for element ID prediction 373 | SYSTEM_PROMPT = f''' 374 | You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element. 375 | 376 | The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element. 377 | 378 | Output only the element ID as a single integer.
379 | '''.strip() 380 | 381 | # Prepare messages for LLM 382 | messages = [ 383 | { 384 | "role": "system", 385 | "content": SYSTEM_PROMPT 386 | }, 387 | { 388 | "role": "user", 389 | "content": [ 390 | { 391 | "type": "image_url", 392 | "image_url": { 393 | "url": f"data:image/png;base64,{result.annotated_image_base64}" 394 | } 395 | }, 396 | { 397 | "type": "text", 398 | "text": f"Find the element: {instruction}" 399 | } 400 | ] 401 | } 402 | ] 403 | 404 | # Call LLM to predict element ID 405 | response = await litellm.acompletion( 406 | model=llm_model, 407 | messages=messages, 408 | max_tokens=10, 409 | temperature=0.1 410 | ) 411 | 412 | # Extract element ID from response 413 | response_text = response.choices[0].message.content.strip() # type: ignore 414 | 415 | # Try to parse the element ID 416 | try: 417 | element_id = int(response_text) 418 | 419 | # Find the element with this ID and return its center coordinates 420 | for element in result.elements: 421 | if element.id == element_id: 422 | center_x = (element.bbox.x1 + element.bbox.x2) / 2 423 | center_y = (element.bbox.y1 + element.bbox.y2) / 2 424 | return (center_x, center_y) 425 | except ValueError: 426 | # If we can't parse the ID, return None 427 | pass 428 | 429 | return None 430 | 431 | def get_capabilities(self) -> List[AgentCapability]: 432 | """Return the capabilities supported by this agent.""" 433 | return ["step"] 434 | ```
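Side note (illustrative, not part of the repository): the `_process_coordinates` logic in `mlxvlm_adapter.py` above relies on a simple proportional mapping between the `smart_resize`d model image and the original screenshot. Below is a minimal sketch of that round trip; the screenshot dimensions and click coordinates are made-up values, and it assumes the `agent` package from `libs/python/agent` is importable.

```python
# Illustrative sketch only: exercises the resize-and-rescale logic used by
# MLXVLMAdapter for click coordinates. Values below are hypothetical.
from agent.adapters.mlxvlm_adapter import smart_resize

orig_w, orig_h = 2560, 1440                   # hypothetical screenshot size (width, height)
new_h, new_w = smart_resize(orig_h, orig_w)   # smart_resize takes (height, width)

# A click emitted by the model in resized-image space...
model_x, model_y = 640, 360
# ...maps back to original pixels with the same proportional rescale
# that _process_coordinates applies to <|box_start|> tokens.
orig_x = int(model_x * orig_w / new_w)
orig_y = int(model_y * orig_h / new_h)

print(f"model image: {new_w}x{new_h}, click maps to ({orig_x}, {orig_y})")
```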