This is page 11 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .all-contributorsrc
├── .cursorignore
├── .devcontainer
│   ├── devcontainer.json
│   ├── post-install.sh
│   └── README.md
├── .dockerignore
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── ci-lume.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-pylume.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       └── test-validation-script.yml
├── .gitignore
├── .vscode
│   ├── docs.code-workspace
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   ├── py.code-workspace
│   └── settings.json
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── composite-agents.md
│   ├── cua-hackathon.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .gitignore
│   ├── .prettierrc
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   └── meta.json
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── computer-sdk
│   │       │   ├── cloud-vm-management.mdx
│   │       │   ├── commands.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── meta.json
│   │       │   └── sandboxed-python.mdx
│   │       ├── index.mdx
│   │       ├── libraries
│   │       │   ├── agent
│   │       │   │   └── index.mdx
│   │       │   ├── computer
│   │       │   │   └── index.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── core
│   │       │   │   └── index.mdx
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   └── som
│   │       │       ├── configuration.mdx
│   │       │       └── index.mdx
│   │       ├── meta.json
│   │       ├── quickstart-cli.mdx
│   │       ├── quickstart-devs.mdx
│   │       └── telemetry.mdx
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   └── llms.txt
│   │   │       └── route.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── iou.tsx
│   │   │   └── mermaid.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   └── mdx-components.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── cloud_api_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── .prettierrc
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── gemini.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   └── uitars.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── types.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   └── test_connection.py
│   │   ├── core
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── mcp-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── CONCURRENT_SESSIONS.md
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── server.py
│   │   │   │   └── session_manager.py
│   │   │   ├── pdm.lock
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── scripts
│   │   │       ├── install_mcp_server.sh
│   │   │       └── start_mcp_server.sh
│   │   ├── pylume
│   │   │   ├── __init__.py
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── pylume
│   │   │   │   ├── __init__.py
│   │   │   │   ├── client.py
│   │   │   │   ├── exceptions.py
│   │   │   │   ├── lume
│   │   │   │   ├── models.py
│   │   │   │   ├── pylume.py
│   │   │   │   └── server.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   └── som
│   │       ├── .bumpversion.cfg
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           └── test_omniparser.py
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── biome.json
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Dockerfile
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── pylume_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── pdm.lock
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── samples
│   └── community
│       ├── global-online
│       │   └── README.md
│       └── hack-the-north
│           └── README.md
├── scripts
│   ├── build-uv.sh
│   ├── build.ps1
│   ├── build.sh
│   ├── cleanup.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   └── run-docker-dev.sh
└── tests
    ├── pytest.ini
    ├── shell_cmd.py
    ├── test_files.py
    ├── test_mcp_server_session_management.py
    ├── test_mcp_server_streaming.py
    ├── test_shell_bash.py
    ├── test_telemetry.py
    ├── test_venv.py
    └── test_watchdog.py
```

# Files

--------------------------------------------------------------------------------
/libs/python/agent/agent/adapters/human_adapter.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import asyncio
  3 | import requests
  4 | from typing import List, Dict, Any, Iterator, AsyncIterator
  5 | from litellm.types.utils import GenericStreamingChunk, ModelResponse
  6 | from litellm.llms.custom_llm import CustomLLM
  7 | from litellm import completion, acompletion
  8 | 
  9 | 
 10 | class HumanAdapter(CustomLLM):
 11 |     """Human Adapter for human-in-the-loop completions.
 12 |     
 13 |     This adapter sends completion requests to a human completion server
 14 |     where humans can review and respond to AI requests.
 15 |     """
 16 |     
 17 |     def __init__(self, base_url: str | None = None, timeout: float = 300.0, **kwargs):
 18 |         """Initialize the human adapter.
 19 |         
 20 |         Args:
 21 |             base_url: Base URL for the human completion server.
 22 |                      Defaults to HUMAN_BASE_URL environment variable or http://localhost:8002
 23 |             timeout: Timeout in seconds for waiting for human response
 24 |             **kwargs: Additional arguments
 25 |         """
 26 |         super().__init__()
 27 |         self.base_url = base_url or os.getenv('HUMAN_BASE_URL', 'http://localhost:8002')
 28 |         self.timeout = timeout
 29 |         
 30 |         # Ensure base_url doesn't end with slash
 31 |         self.base_url = self.base_url.rstrip('/')
 32 |     
 33 |     def _queue_completion(self, messages: List[Dict[str, Any]], model: str) -> str:
 34 |         """Queue a completion request and return the call ID.
 35 |         
 36 |         Args:
 37 |             messages: Messages in OpenAI format
 38 |             model: Model name
 39 |             
 40 |         Returns:
 41 |             Call ID for tracking the request
 42 |             
 43 |         Raises:
 44 |             Exception: If queueing fails
 45 |         """
 46 |         try:
 47 |             response = requests.post(
 48 |                 f"{self.base_url}/queue",
 49 |                 json={"messages": messages, "model": model},
 50 |                 timeout=10
 51 |             )
 52 |             response.raise_for_status()
 53 |             return response.json()["id"]
 54 |         except requests.RequestException as e:
 55 |             raise Exception(f"Failed to queue completion request: {e}")
 56 |     
 57 |     def _wait_for_completion(self, call_id: str) -> Dict[str, Any]:
 58 |         """Wait for human to complete the call.
 59 |         
 60 |         Args:
 61 |             call_id: ID of the queued completion call
 62 |             
 63 |         Returns:
 64 |             Dict containing response and/or tool_calls
 65 |             
 66 |         Raises:
 67 |             TimeoutError: If timeout is exceeded
 68 |             Exception: If completion fails
 69 |         """
 70 |         import time
 71 |         
 72 |         start_time = time.time()
 73 |         
 74 |         while True:
 75 |             try:
 76 |                 # Check status
 77 |                 status_response = requests.get(f"{self.base_url}/status/{call_id}")
 78 |                 status_response.raise_for_status()
 79 |                 status_data = status_response.json()
 80 |                 
 81 |                 if status_data["status"] == "completed":
 82 |                     result = {}
 83 |                     if "response" in status_data and status_data["response"]:
 84 |                         result["response"] = status_data["response"]
 85 |                     if "tool_calls" in status_data and status_data["tool_calls"]:
 86 |                         result["tool_calls"] = status_data["tool_calls"]
 87 |                     return result
 88 |                 elif status_data["status"] == "failed":
 89 |                     error_msg = status_data.get("error", "Unknown error")
 90 |                     raise Exception(f"Completion failed: {error_msg}")
 91 |                 
 92 |                 # Check timeout
 93 |                 if time.time() - start_time > self.timeout:
 94 |                     raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds")
 95 |                 
 96 |                 # Wait before checking again
 97 |                 time.sleep(1.0)
 98 |                 
 99 |             except requests.RequestException as e:
100 |                 if time.time() - start_time > self.timeout:
101 |                     raise TimeoutError(f"Timeout waiting for human response: {e}")
102 |                 # Continue trying if we haven't timed out
103 |                 time.sleep(1.0)
104 |     
105 |     async def _async_wait_for_completion(self, call_id: str) -> Dict[str, Any]:
106 |         """Async version of wait_for_completion.
107 |         
108 |         Args:
109 |             call_id: ID of the queued completion call
110 |             
111 |         Returns:
112 |             Dict containing response and/or tool_calls
113 |             
114 |         Raises:
115 |             TimeoutError: If timeout is exceeded
116 |             Exception: If completion fails
117 |         """
118 |         import aiohttp
119 |         import time
120 |         
121 |         start_time = time.time()
122 |         
123 |         async with aiohttp.ClientSession() as session:
124 |             while True:
125 |                 try:
126 |                     # Check status
127 |                     async with session.get(f"{self.base_url}/status/{call_id}") as response:
128 |                         response.raise_for_status()
129 |                         status_data = await response.json()
130 |                     
131 |                     if status_data["status"] == "completed":
132 |                         result = {}
133 |                         if "response" in status_data and status_data["response"]:
134 |                             result["response"] = status_data["response"]
135 |                         if "tool_calls" in status_data and status_data["tool_calls"]:
136 |                             result["tool_calls"] = status_data["tool_calls"]
137 |                         return result
138 |                     elif status_data["status"] == "failed":
139 |                         error_msg = status_data.get("error", "Unknown error")
140 |                         raise Exception(f"Completion failed: {error_msg}")
141 |                     
142 |                     # Check timeout
143 |                     if time.time() - start_time > self.timeout:
144 |                         raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds")
145 |                     
146 |                     # Wait before checking again
147 |                     await asyncio.sleep(1.0)
148 |                     
149 |                 except aiohttp.ClientError as e:
150 |                     if time.time() - start_time > self.timeout:
151 |                         raise TimeoutError(f"Timeout waiting for human response: {e}")
152 |                     # Continue trying if we haven't timed out
153 |                     await asyncio.sleep(1.0)
154 |     
155 |     def _generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
156 |         """Generate a human response for the given messages.
157 |         
158 |         Args:
159 |             messages: Messages in OpenAI format
160 |             model: Model name
161 |             
162 |         Returns:
163 |             Dict containing response and/or tool_calls
164 |         """
165 |         # Queue the completion request
166 |         call_id = self._queue_completion(messages, model)
167 |         
168 |         # Wait for human response
169 |         response = self._wait_for_completion(call_id)
170 |         
171 |         return response
172 |     
173 |     async def _async_generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
174 |         """Async version of _generate_response.
175 |         
176 |         Args:
177 |             messages: Messages in OpenAI format
178 |             model: Model name
179 |             
180 |         Returns:
181 |             Dict containing response and/or tool_calls
182 |         """
183 |         # Queue the completion request (sync operation)
184 |         call_id = self._queue_completion(messages, model)
185 |         
186 |         # Wait for human response (async)
187 |         response = await self._async_wait_for_completion(call_id)
188 |         
189 |         return response
190 |     
191 |     def completion(self, *args, **kwargs) -> ModelResponse:
192 |         """Synchronous completion method.
193 |         
194 |         Returns:
195 |             ModelResponse with human-generated text or tool calls
196 |         """
197 |         messages = kwargs.get('messages', [])
198 |         model = kwargs.get('model', 'human')
199 |         
200 |         # Generate human response
201 |         human_response_data = self._generate_response(messages, model)
202 |         
203 |         # Create ModelResponse with proper structure
204 |         from litellm.types.utils import ModelResponse, Choices, Message
205 |         import uuid
206 |         import time
207 |         
208 |         # Create message content based on response type
209 |         if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
210 |             # Tool calls response
211 |             message = Message(
212 |                 role="assistant",
213 |                 content=human_response_data.get("response", ""),
214 |                 tool_calls=human_response_data["tool_calls"]
215 |             )
216 |         else:
217 |             # Text response
218 |             message = Message(
219 |                 role="assistant",
220 |                 content=human_response_data.get("response", "")
221 |             )
222 |         
223 |         choice = Choices(
224 |             finish_reason="stop",
225 |             index=0,
226 |             message=message
227 |         )
228 |         
229 |         result = ModelResponse(
230 |             id=f"human-{uuid.uuid4()}",
231 |             choices=[choice],
232 |             created=int(time.time()),
233 |             model=f"human/{model}",
234 |             object="chat.completion"
235 |         )
236 |         
237 |         return result
238 |     
239 |     async def acompletion(self, *args, **kwargs) -> ModelResponse:
240 |         """Asynchronous completion method.
241 |         
242 |         Returns:
243 |             ModelResponse with human-generated text or tool calls
244 |         """
245 |         messages = kwargs.get('messages', [])
246 |         model = kwargs.get('model', 'human')
247 |         
248 |         # Generate human response
249 |         human_response_data = await self._async_generate_response(messages, model)
250 |         
251 |         # Create ModelResponse with proper structure
252 |         from litellm.types.utils import ModelResponse, Choices, Message
253 |         import uuid
254 |         import time
255 |         
256 |         # Create message content based on response type
257 |         if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
258 |             # Tool calls response
259 |             message = Message(
260 |                 role="assistant",
261 |                 content=human_response_data.get("response", ""),
262 |                 tool_calls=human_response_data["tool_calls"]
263 |             )
264 |         else:
265 |             # Text response
266 |             message = Message(
267 |                 role="assistant",
268 |                 content=human_response_data.get("response", "")
269 |             )
270 |         
271 |         choice = Choices(
272 |             finish_reason="stop",
273 |             index=0,
274 |             message=message
275 |         )
276 |         
277 |         result = ModelResponse(
278 |             id=f"human-{uuid.uuid4()}",
279 |             choices=[choice],
280 |             created=int(time.time()),
281 |             model=f"human/{model}",
282 |             object="chat.completion"
283 |         )
284 |         
285 |         return result
286 |     
287 |     def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
288 |         """Synchronous streaming method.
289 |         
290 |         Yields:
291 |             Streaming chunks with human-generated text or tool calls
292 |         """
293 |         messages = kwargs.get('messages', [])
294 |         model = kwargs.get('model', 'human')
295 |         
296 |         # Generate human response
297 |         human_response_data = self._generate_response(messages, model)
298 |         
299 |         import time
300 |         
301 |         # Handle tool calls vs text response
302 |         if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
303 |             # Stream tool calls as a single chunk
304 |             generic_chunk: GenericStreamingChunk = {
305 |                 "finish_reason": "tool_calls",
306 |                 "index": 0,
307 |                 "is_finished": True,
308 |                 "text": human_response_data.get("response", ""),
309 |                 "tool_use": human_response_data["tool_calls"],
310 |                 "usage": {"completion_tokens": 1, "prompt_tokens": 0, "total_tokens": 1},
311 |             }
312 |             yield generic_chunk
313 |         else:
314 |             # Stream text response
315 |             response_text = human_response_data.get("response", "")
316 |             generic_chunk: GenericStreamingChunk = {
317 |                 "finish_reason": "stop",
318 |                 "index": 0,
319 |                 "is_finished": True,
320 |                 "text": response_text,
321 |                 "tool_use": None,
322 |                 "usage": {"completion_tokens": len(response_text.split()), "prompt_tokens": 0, "total_tokens": len(response_text.split())},
323 |             }
324 |             yield generic_chunk
325 |     
326 |     async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
327 |         """Asynchronous streaming method.
328 |         
329 |         Yields:
330 |             Streaming chunks with human-generated text or tool calls
331 |         """
332 |         messages = kwargs.get('messages', [])
333 |         model = kwargs.get('model', 'human')
334 |         
335 |         # Generate human response
336 |         human_response_data = await self._async_generate_response(messages, model)
337 |         response_text = human_response_data.get("response", "")
338 |         
339 |         # Return as a single streaming chunk, including any tool calls
340 |         generic_streaming_chunk: GenericStreamingChunk = {
341 |             "finish_reason": "tool_calls" if human_response_data.get("tool_calls") else "stop",
342 |             "index": 0,
343 |             "is_finished": True,
344 |             "text": response_text,
345 |             "tool_use": human_response_data.get("tool_calls"),
346 |             "usage": {"completion_tokens": len(response_text.split()), "prompt_tokens": 0, "total_tokens": len(response_text.split())},
347 |         }
348 |         yield generic_streaming_chunk
```
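
The `HumanAdapter` above implements litellm's `CustomLLM` interface. As a quick orientation, here is a minimal usage sketch (not part of the repository) showing how such an adapter could be registered through litellm's `custom_provider_map` and invoked via `completion`; the `"human"` provider prefix and the human completion server at the default `http://localhost:8002` are assumptions for illustration.

```python
# Hypothetical usage sketch (not part of the repository): register HumanAdapter
# as a litellm custom provider and request a human-in-the-loop completion.
# Assumes a human completion server is reachable at the default localhost:8002
# and that "human" is the provider prefix you want to route through it.
import litellm
from litellm import completion

from agent.adapters.human_adapter import HumanAdapter

human_adapter = HumanAdapter()  # reads HUMAN_BASE_URL or falls back to http://localhost:8002

litellm.custom_provider_map = [
    {"provider": "human", "custom_handler": human_adapter}
]

response = completion(
    model="human/reviewer",  # anything after "human/" is passed through as the model name
    messages=[{"role": "user", "content": "Please review and answer this request."}],
)
print(response.choices[0].message.content)
```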

--------------------------------------------------------------------------------
/blog/bringing-computer-use-to-the-web.md:
--------------------------------------------------------------------------------

```markdown
  1 | # Bringing Computer-Use to the Web
  2 | 
  3 | *Published on August 5, 2025 by Morgan Dean*
  4 | 
  5 | In one of our original posts, we explored building Computer-Use Operators on macOS - first with a [manual implementation](build-your-own-operator-on-macos-1.md) using OpenAI's `computer-use-preview` model, then with our [cua-agent framework](build-your-own-operator-on-macos-2.md) for Python developers. While these tutorials have been incredibly popular, we've received consistent feedback from our community: **"Can we use C/ua with JavaScript and TypeScript?"**
  6 | 
  7 | Today, we're excited to announce the release of the **`@trycua/computer` Web SDK** - a new library that allows you to control your C/ua cloud containers from any JavaScript or TypeScript project. With this library, you can click, type, and grab screenshots from your cloud containers - no extra servers required.
  8 | 
  9 | With this new SDK, you can easily develop CUA experiences like the one below, which we will release soon as open source.
 10 | 
 11 | <div align="center">
 12 |   <video src="https://github.com/user-attachments/assets/e213d6c3-73b6-48dd-a7d9-ed761ed74f89" width="600" controls></video>
 13 | </div>
 14 | 
 15 | Let’s see how it works.
 16 | 
 17 | ## What You'll Learn
 18 | 
 19 | By the end of this tutorial, you'll be able to:
 20 | 
 21 | - Set up the `@trycua/computer` npm library in any JavaScript/TypeScript project
 22 | - Connect OpenAI's computer-use model to C/ua cloud containers from web applications
 23 | - Build computer-use agents that work in Node.js, React, Vue, or any web framework
 24 | - Handle different types of computer actions (clicking, typing, scrolling) from web code
 25 | - Implement the complete computer-use loop in JavaScript/TypeScript
 26 | - Integrate AI automation into existing web applications and workflows
 27 | 
 28 | **Prerequisites:**
 29 | 
 30 | - Node.js 16+ and npm/yarn/pnpm
 31 | - Basic JavaScript or TypeScript knowledge
 32 | - OpenAI API access (Tier 3+ for computer-use-preview)
 33 | - C/ua cloud container credits ([get started here](https://trycua.com/pricing))
 34 | 
 35 | **Estimated Time:** 45-60 minutes
 36 | 
 37 | ## Access Requirements
 38 | 
 39 | ### OpenAI Model Availability
 40 | 
 41 | At the time of writing, the **computer-use-preview** model has limited availability:
 42 | 
 43 | - Only accessible to OpenAI tier 3+ users
 44 | - Additional application process may be required even for eligible users
 45 | - Cannot be used in the OpenAI Playground
 46 | - Outside of ChatGPT Operator, usage is restricted to the new **Responses API**
 47 | 
 48 | Luckily, the `@trycua/computer` library can be used in conjunction with other models, like [Anthropic’s Computer Use](https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool) or [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B). You’ll just have to write your own handler to parse the model output for interfacing with the container.
 49 | 
 50 | ### C/ua Cloud Containers
 51 | 
 52 | To follow this guide, you’ll need access to a C/ua cloud container.
 53 | 
 54 | Getting access is simple: purchase credits from our [pricing page](https://trycua.com/pricing), then create and provision a new container instance from the [dashboard](https://trycua.com/dashboard/containers). With your container running, you'll be ready to leverage the web SDK and bring automation to your JavaScript or TypeScript applications.
 55 | 
 56 | ## Understanding the Flow
 57 | 
 58 | ### OpenAI API Overview
 59 | 
 60 | Let's start with the basics. In our case, we'll use OpenAI's API to communicate with their computer-use model.
 61 | 
 62 | Think of it like this:
 63 | 
 64 | 1. We send the model a screenshot of our container and tell it what we want it to do
 65 | 2. The model looks at the screenshot and decides what actions to take
 66 | 3. It sends back instructions (like "click here" or "type this")
 67 | 4. We execute those instructions in our container.
 68 | 
 69 | ### Model Setup
 70 | 
 71 | Here's how we set up the computer-use model for web development:
 72 | 
 73 | ```javascript
 74 | const res = await openai.responses.create({
 75 |   model: 'computer-use-preview',
 76 |   tools: [
 77 |     {
 78 |       type: 'computer_use_preview',
 79 |       display_width: 1024,
 80 |       display_height: 768,
 81 |       environment: 'linux', // we're using a linux container
 82 |     },
 83 |   ],
 84 |   input: [
 85 |     {
 86 |       role: 'user',
 87 |       content: [
 88 |         // what we want the ai to do
 89 |         { type: 'input_text', text: 'Open firefox and go to trycua.com' },
 90 |         // first screenshot of the vm
 91 |         {
 92 |           type: 'input_image',
 93 |           image_url: `data:image/png;base64,${screenshotBase64}`,
 94 |           detail: 'auto',
 95 |         },
 96 |       ],
 97 |     },
 98 |   ],
 99 |   truncation: 'auto'
100 | });
101 | ```
102 | 
103 | ### Understanding the Response
104 | 
105 | When we send a request, the API sends back a response that looks like this:
106 | 
107 | ```json
108 | "output": [
109 |     {
110 |         "type": "reasoning",           // The AI explains what it's thinking
111 |         "id": "rs_67cc...",
112 |         "summary": [
113 |             {
114 |                 "type": "summary_text",
115 |                 "text": "Clicking on the browser address bar."
116 |             }
117 |         ]
118 |     },
119 |     {
120 |         "type": "computer_call",       // The actual action to perform
121 |         "id": "cu_67cc...",
122 |         "call_id": "call_zw3...",      // Used to track previous calls
123 |         "action": {
124 |             "type": "click",           // What kind of action (click, type, etc.)
125 |             "button": "left",          // Which mouse button to use
126 |             "x": 156,                  // Where to click (coordinates)
127 |             "y": 50
128 |         },
129 |         "pending_safety_checks": [],   // Any safety warnings to consider
130 |         "status": "completed"          // Whether the action was successful
131 |     }
132 | ]
133 | 
134 | ```
135 | 
136 | Each response contains:
137 | 
138 | 1. **Reasoning**: The AI's explanation of what it's doing
139 | 2. **Action**: The specific computer action to perform
140 | 3. **Safety Checks**: Any potential risks to review
141 | 4. **Status**: Whether everything worked as planned
142 | 
143 | ## Implementation Guide
144 | 
145 | ### Provision a C/ua Cloud Container
146 | 
147 |   1. Visit [trycua.com](https://trycua.com), sign up, purchase [credits](https://trycua.com/pricing), and create a new container instance from the [dashboard](https://trycua.com/dashboard).
148 |   2. Create an API key from the dashboard — be sure to save it in a secure location before continuing.
149 |   3. Start the cloud container from the dashboard.
150 | 
151 | ### Environment Setup
152 | 
153 |   1. Install required packages with your preferred package manager:
154 | 
155 |       ```bash
156 |       npm install --save @trycua/computer # or yarn, pnpm, bun
157 |       npm install --save openai # or yarn, pnpm, bun
158 |       ```
159 | 
160 |       Works with any JavaScript/TypeScript project setup - whether you're using Create React App, Next.js, Vue, Angular, or plain JavaScript.
161 | 
162 |   2. Save your OpenAI API key, C/ua API key, and container name to a `.env` file:
163 | 
164 |       ```bash
165 |       OPENAI_API_KEY=openai-api-key
166 |       CUA_API_KEY=cua-api-key
167 |       CUA_CONTAINER_NAME=cua-cloud-container-name
168 |       ```
169 | 
170 |       These environment variables work the same whether you're using vanilla JavaScript, TypeScript, or any web framework.
171 | 
172 | ## Building the Agent
173 | 
174 | ### Mapping API Actions to `@trycua/computer` Interface Methods
175 | 
176 | This helper function handles a `computer_call` action from the OpenAI API — converting the action into an equivalent action from the `@trycua/computer` interface. These actions will execute on the initialized `Computer` instance. For example, `await computer.interface.leftClick()` sends a mouse left click to the current cursor position.
177 | 
178 | Whether you're using JavaScript or TypeScript, the interface remains the same:
179 | 
180 | ```typescript
181 | export async function executeAction(
182 |   computer: Computer,
183 |   action: OpenAI.Responses.ResponseComputerToolCall['action']
184 | ) {
185 |   switch (action.type) {
186 |     case 'click':
187 |       const { x, y, button } = action;
188 |       console.log(`Executing click at (${x}, ${y}) with button '${button}'.`);
189 |       await computer.interface.moveCursor(x, y);
190 |       if (button === 'right') await computer.interface.rightClick();
191 |       else await computer.interface.leftClick();
192 |       break;
193 |     case 'type':
194 |       const { text } = action;
195 |       console.log(`Typing text: ${text}`);
196 |       await computer.interface.typeText(text);
197 |       break;
198 |     case 'scroll':
199 |       const { x: locX, y: locY, scroll_x, scroll_y } = action;
200 |       console.log(
201 |         `Scrolling at (${locX}, ${locY}) with offsets (scroll_x=${scroll_x}, scroll_y=${scroll_y}).`
202 |       );
203 |       await computer.interface.moveCursor(locX, locY);
204 |       await computer.interface.scroll(scroll_x, scroll_y);
205 |       break;
206 |     case 'keypress':
207 |       const { keys } = action;
208 |       for (const key of keys) {
209 |         console.log(`Pressing key: ${key}.`);
210 |         // Map common key names to CUA equivalents
211 |         if (key.toLowerCase() === 'enter') {
212 |           await computer.interface.pressKey('return');
213 |         } else if (key.toLowerCase() === 'space') {
214 |           await computer.interface.pressKey('space');
215 |         } else {
216 |           await computer.interface.pressKey(key);
217 |         }
218 |       }
219 |       break;
220 |     case 'wait':
221 |       console.log(`Waiting for 3 seconds.`);
222 |       await new Promise((resolve) => setTimeout(resolve, 3 * 1000));
223 |       break;
224 |     case 'screenshot':
225 |       console.log('Taking screenshot.');
226 |       // This is handled automatically in the main loop, but we can take an extra one if requested
227 |       const screenshot = await computer.interface.screenshot();
228 |       return screenshot;
229 |     default:
230 |       console.log(`Unrecognized action: ${action.type}`);
231 |       break;
232 |   }
233 | }
234 | ```
235 | 
236 | ### Implementing the Computer-Use Loop
237 | 
238 | This section defines a loop that:
239 | 
240 | 1. Initializes the `Computer` instance (connecting to a Linux cloud container).
241 | 2. Captures a screenshot of the current state.
242 | 3. Sends the screenshot (with a user prompt) to the OpenAI Responses API using the `computer-use-preview` model.
243 | 4. Processes the returned `computer_call` action and executes it using our helper function.
244 | 5. Captures an updated screenshot after the action.
245 | 6. Sends the updated screenshot back as a `computer_call_output` and loops until no more actions are returned.
246 | 
247 | ```typescript
248 | const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
249 | 
250 | // Initialize the Computer Connection
251 | const computer = new Computer({
252 |   apiKey: process.env.CUA_API_KEY!,
253 |   name: process.env.CUA_CONTAINER_NAME!,
254 |   osType: OSType.LINUX,
255 | });
256 | 
257 | await computer.run();
258 | // Take the initial screenshot
259 | const screenshot = await computer.interface.screenshot();
260 | const screenshotBase64 = screenshot.toString('base64');
261 | 
262 | // Setup openai config for computer use
263 | const computerUseConfig: OpenAI.Responses.ResponseCreateParamsNonStreaming = {
264 |   model: 'computer-use-preview',
265 |   tools: [
266 |     {
267 |       type: 'computer_use_preview',
268 |       display_width: 1024,
269 |       display_height: 768,
270 |       environment: 'linux', // we're using a linux vm
271 |     },
272 |   ],
273 |   truncation: 'auto',
274 | };
275 | 
276 | // Send initial screenshot to the openai computer use model
277 | let res = await openai.responses.create({
278 |   ...computerUseConfig,
279 |   input: [
280 |     {
281 |       role: 'user',
282 |       content: [
283 |         // what we want the ai to do
284 |         { type: 'input_text', text: 'open firefox and go to trycua.com' },
285 |         // current screenshot of the vm
286 |         {
287 |           type: 'input_image',
288 |           image_url: `data:image/png;base64,${screenshotBase64}`,
289 |           detail: 'auto',
290 |         },
291 |       ],
292 |     },
293 |   ],
294 | });
295 | 
296 | // Loop until there are no more computer use actions.
297 | while (true) {
298 |   const computerCalls = res.output.filter((o) => o.type === 'computer_call');
299 |   if (computerCalls.length < 1) {
300 |     console.log('No more computer calls. Loop complete.');
301 |     break;
302 |   }
303 |   // Get the first call
304 |   const call = computerCalls[0];
305 |   const action = call.action;
306 |   console.log('Received action from OpenAI Responses API:', action);
307 |   let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] =
308 |     [];
309 |   if (call.pending_safety_checks.length > 0) {
310 |     console.log('Safety checks pending:', call.pending_safety_checks);
311 |     // In a real implementation, you would want to get user confirmation here.
312 |     ackChecks = call.pending_safety_checks;
313 |   }
314 | 
315 |   // Execute the action in the container
316 |   await executeAction(computer, action);
317 |   // Wait for changes to process within the container (1sec)
318 |   await new Promise((resolve) => setTimeout(resolve, 1000));
319 | 
320 |   // Capture new screenshot
321 |   const newScreenshot = await computer.interface.screenshot();
322 |   const newScreenshotBase64 = newScreenshot.toString('base64');
323 | 
324 |   // Send the new screenshot back as a computer_call_output
325 | 
326 |   res = await openai.responses.create({
327 |     ...computerUseConfig,
328 |     previous_response_id: res.id,
329 |     input: [
330 |       {
331 |         type: 'computer_call_output',
332 |         call_id: call.call_id,
333 |         acknowledged_safety_checks: ackChecks,
334 |         output: {
335 |           type: 'computer_screenshot',
336 |           image_url: `data:image/png;base64,${newScreenshotBase64}`,
337 |         },
338 |       },
339 |     ],
340 |   });
341 | }
342 | ```
343 | 
344 | You can find the full example on [GitHub](https://github.com/trycua/cua/tree/main/examples/computer-example-ts).
345 | 
346 | ## What's Next?
347 | 
348 | The `@trycua/computer` Web SDK opens up some interesting possibilities. You could build browser-based testing tools, create interactive demos for your products, or automate repetitive workflows directly from your web apps.
349 | 
350 | We're working on more examples and better documentation - if you build something cool with this SDK, we'd love to see it. Drop by our [Discord](https://discord.gg/cua-ai) and share what you're working on.
351 | 
352 | Happy automating on the web!
353 | 
```
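
The post above points out that `@trycua/computer` can also be driven by models other than OpenAI's `computer-use-preview` (for example Anthropic's Computer Use or UI-TARS) if you parse the model output yourself. The sketch below is a rough, hypothetical illustration of such a handler: it maps an Anthropic-style tool call onto the same interface methods used in the post. The `AnthropicComputerAction` shape and the key-name handling are assumptions, not part of the SDK.

```typescript
// Hypothetical handler sketch: translate an Anthropic-style computer-use tool
// call into @trycua/computer interface calls. The input shape below is an
// assumption modeled on Anthropic tool_use blocks; adapt it to the model you use.
import { Computer } from '@trycua/computer';

interface AnthropicComputerAction {
  action: string;                // e.g. 'left_click', 'mouse_move', 'type', 'key', 'screenshot'
  coordinate?: [number, number]; // pointer target for click/move actions
  text?: string;                 // payload for 'type' and 'key' actions
}

export async function executeAnthropicAction(
  computer: Computer,
  input: AnthropicComputerAction
) {
  switch (input.action) {
    case 'mouse_move':
    case 'left_click': {
      const [x, y] = input.coordinate ?? [0, 0];
      await computer.interface.moveCursor(x, y);
      if (input.action === 'left_click') await computer.interface.leftClick();
      break;
    }
    case 'type':
      await computer.interface.typeText(input.text ?? '');
      break;
    case 'key':
      // Map model-specific key names (e.g. 'Return') to what the container expects.
      await computer.interface.pressKey((input.text ?? '').toLowerCase());
      break;
    case 'screenshot':
      return await computer.interface.screenshot();
    default:
      console.log(`Unhandled action: ${input.action}`);
  }
}
```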

--------------------------------------------------------------------------------
/tests/test_mcp_server_session_management.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | Tests for MCP Server Session Management functionality.
  3 | 
  4 | This module tests the new concurrent session management and resource lifecycle features.
  5 | """
  6 | 
  7 | import asyncio
  8 | import importlib.util
  9 | import sys
 10 | import types
 11 | import time
 12 | from pathlib import Path
 13 | 
 14 | import pytest
 15 | 
 16 | 
 17 | def _install_stub_module(name: str, module: types.ModuleType, registry: dict[str, types.ModuleType | None]) -> None:
 18 |     registry[name] = sys.modules.get(name)
 19 |     sys.modules[name] = module
 20 | 
 21 | 
 22 | @pytest.fixture
 23 | def server_module():
 24 |     """Create a server module with stubbed dependencies for testing."""
 25 |     stubbed_modules: dict[str, types.ModuleType | None] = {}
 26 | 
 27 |     # Stub MCP Context primitives
 28 |     mcp_module = types.ModuleType("mcp")
 29 |     mcp_module.__path__ = []  # mark as package
 30 | 
 31 |     mcp_server_module = types.ModuleType("mcp.server")
 32 |     mcp_server_module.__path__ = []
 33 | 
 34 |     fastmcp_module = types.ModuleType("mcp.server.fastmcp")
 35 | 
 36 |     class _StubContext:
 37 |         async def yield_message(self, *args, **kwargs):
 38 |             return None
 39 | 
 40 |         async def yield_tool_call(self, *args, **kwargs):
 41 |             return None
 42 | 
 43 |         async def yield_tool_output(self, *args, **kwargs):
 44 |             return None
 45 | 
 46 |         def report_progress(self, *_args, **_kwargs):
 47 |             return None
 48 | 
 49 |         def info(self, *_args, **_kwargs):
 50 |             return None
 51 | 
 52 |         def error(self, *_args, **_kwargs):
 53 |             return None
 54 | 
 55 |     class _StubImage:
 56 |         def __init__(self, format: str, data: bytes):
 57 |             self.format = format
 58 |             self.data = data
 59 | 
 60 |     class _StubFastMCP:
 61 |         def __init__(self, name: str):
 62 |             self.name = name
 63 |             self._tools: dict[str, types.FunctionType] = {}
 64 | 
 65 |         def tool(self, *args, **kwargs):
 66 |             def decorator(func):
 67 |                 self._tools[func.__name__] = func
 68 |                 return func
 69 | 
 70 |             return decorator
 71 | 
 72 |         def run(self):
 73 |             return None
 74 | 
 75 |     fastmcp_module.Context = _StubContext
 76 |     fastmcp_module.FastMCP = _StubFastMCP
 77 |     fastmcp_module.Image = _StubImage
 78 | 
 79 |     _install_stub_module("mcp", mcp_module, stubbed_modules)
 80 |     _install_stub_module("mcp.server", mcp_server_module, stubbed_modules)
 81 |     _install_stub_module("mcp.server.fastmcp", fastmcp_module, stubbed_modules)
 82 | 
 83 |     # Stub Computer module
 84 |     computer_module = types.ModuleType("computer")
 85 | 
 86 |     class _StubInterface:
 87 |         async def screenshot(self) -> bytes:
 88 |             return b"test-screenshot-data"
 89 | 
 90 |     class _StubComputer:
 91 |         def __init__(self, *args, **kwargs):
 92 |             self.interface = _StubInterface()
 93 | 
 94 |         async def run(self):
 95 |             return None
 96 | 
 97 |     computer_module.Computer = _StubComputer
 98 | 
 99 |     _install_stub_module("computer", computer_module, stubbed_modules)
100 | 
101 |     # Stub agent module
102 |     agent_module = types.ModuleType("agent")
103 | 
104 |     class _StubComputerAgent:
105 |         def __init__(self, *args, **kwargs):
106 |             pass
107 | 
108 |         async def run(self, *_args, **_kwargs):
109 |             # Simulate agent execution with streaming
110 |             yield {
111 |                 "output": [
112 |                     {
113 |                         "type": "message",
114 |                         "role": "assistant",
115 |                         "content": [{"type": "output_text", "text": "Task completed"}]
116 |                     }
117 |                 ]
118 |             }
119 | 
120 |     agent_module.ComputerAgent = _StubComputerAgent
121 | 
122 |     _install_stub_module("agent", agent_module, stubbed_modules)
123 | 
124 |     # Stub session manager module
125 |     session_manager_module = types.ModuleType("mcp_server.session_manager")
126 | 
127 |     class _StubSessionInfo:
128 |         def __init__(self, session_id: str, computer, created_at: float, last_activity: float):
129 |             self.session_id = session_id
130 |             self.computer = computer
131 |             self.created_at = created_at
132 |             self.last_activity = last_activity
133 |             self.active_tasks = set()
134 |             self.is_shutting_down = False
135 | 
136 |     class _StubSessionManager:
137 |         def __init__(self):
138 |             self._sessions = {}
139 |             self._session_lock = asyncio.Lock()
140 | 
141 |         async def get_session(self, session_id=None):
142 |             """Return the session for the given id, creating it on first use."""
143 |             if session_id is None:
144 |                 session_id = "test-session-123"
145 |             
146 |             async with self._session_lock:
147 |                 if session_id not in self._sessions:
148 |                     computer = _StubComputer()
149 |                     session = _StubSessionInfo(
150 |                         session_id=session_id,
151 |                         computer=computer,
152 |                         created_at=time.time(),
153 |                         last_activity=time.time()
154 |                     )
155 |                     self._sessions[session_id] = session
156 |                 
157 |                 return self._sessions[session_id]
158 | 
159 |         async def register_task(self, session_id: str, task_id: str):
160 |             pass
161 | 
162 |         async def unregister_task(self, session_id: str, task_id: str):
163 |             pass
164 | 
165 |         async def cleanup_session(self, session_id: str):
166 |             async with self._session_lock:
167 |                 self._sessions.pop(session_id, None)
168 | 
169 |         def get_session_stats(self):
170 |             return {
171 |                 "total_sessions": len(self._sessions),
172 |                 "max_concurrent": 10,
173 |                 "sessions": {sid: {"active_tasks": 0} for sid in self._sessions}
174 |             }
175 | 
176 |     _stub_session_manager = _StubSessionManager()
177 | 
178 |     def get_session_manager():
179 |         return _stub_session_manager
180 | 
181 |     async def initialize_session_manager():
182 |         return _stub_session_manager
183 | 
184 |     async def shutdown_session_manager():
185 |         pass
186 | 
187 |     session_manager_module.get_session_manager = get_session_manager
188 |     session_manager_module.initialize_session_manager = initialize_session_manager
189 |     session_manager_module.shutdown_session_manager = shutdown_session_manager
190 | 
191 |     _install_stub_module("mcp_server.session_manager", session_manager_module, stubbed_modules)
192 | 
193 |     # Load the actual server module
194 |     module_name = "mcp_server_server_under_test"
195 |     module_path = Path("libs/python/mcp-server/mcp_server/server.py").resolve()
196 |     spec = importlib.util.spec_from_file_location(module_name, module_path)
197 |     assert spec and spec.loader
198 |     server_module = importlib.util.module_from_spec(spec)
199 |     spec.loader.exec_module(server_module)
200 | 
201 |     server_instance = getattr(server_module, "server", None)
202 |     if server_instance is not None and hasattr(server_instance, "_tools"):
203 |         for name, func in server_instance._tools.items():
204 |             setattr(server_module, name, func)
205 | 
206 |     try:
207 |         yield server_module
208 |     finally:
209 |         sys.modules.pop(module_name, None)
210 |         for name, original in stubbed_modules.items():
211 |             if original is None:
212 |                 sys.modules.pop(name, None)
213 |             else:
214 |                 sys.modules[name] = original
215 | 
216 | 
217 | class FakeContext:
218 |     """Fake context for testing."""
219 |     
220 |     def __init__(self) -> None:
221 |         self.events: list[tuple] = []
222 |         self.progress_updates: list[float] = []
223 | 
224 |     def info(self, message: str) -> None:
225 |         self.events.append(("info", message))
226 | 
227 |     def error(self, message: str) -> None:
228 |         self.events.append(("error", message))
229 | 
230 |     def report_progress(self, value: float) -> None:
231 |         self.progress_updates.append(value)
232 | 
233 |     async def yield_message(self, *, role: str, content):
234 |         timestamp = asyncio.get_running_loop().time()
235 |         self.events.append(("message", role, content, timestamp))
236 | 
237 |     async def yield_tool_call(self, *, name: str | None, call_id: str, input):
238 |         timestamp = asyncio.get_running_loop().time()
239 |         self.events.append(("tool_call", name, call_id, input, timestamp))
240 | 
241 |     async def yield_tool_output(self, *, call_id: str, output, is_error: bool = False):
242 |         timestamp = asyncio.get_running_loop().time()
243 |         self.events.append(("tool_output", call_id, output, is_error, timestamp))
244 | 
245 | 
246 | def test_screenshot_cua_with_session_id(server_module):
247 |     """Test that screenshot_cua works with session management."""
248 |     async def _run_test():
249 |         ctx = FakeContext()
250 |         result = await server_module.screenshot_cua(ctx, session_id="test-session")
251 |         
252 |         assert result.format == "png"
253 |         assert result.data == b"test-screenshot-data"
254 | 
255 |     asyncio.run(_run_test())
256 | 
257 | 
258 | def test_screenshot_cua_creates_new_session(server_module):
259 |     """Test that screenshot_cua creates a new session when none provided."""
260 |     async def _run_test():
261 |         ctx = FakeContext()
262 |         result = await server_module.screenshot_cua(ctx)
263 |         
264 |         assert result.format == "png"
265 |         assert result.data == b"test-screenshot-data"
266 | 
267 |     asyncio.run(_run_test())
268 | 
269 | 
270 | def test_run_cua_task_with_session_management(server_module):
271 |     """Test that run_cua_task works with session management."""
272 |     async def _run_test():
273 |         ctx = FakeContext()
274 |         task = "Test task"
275 |         session_id = "test-session-456"
276 |         
277 |         text_result, image = await server_module.run_cua_task(ctx, task, session_id)
278 |         
279 |         assert "Task completed" in text_result
280 |         assert image.format == "png"
281 |         assert image.data == b"test-screenshot-data"
282 | 
283 |     asyncio.run(_run_test())
284 | 
285 | 
286 | def test_run_multi_cua_tasks_sequential(server_module):
287 |     """Test that run_multi_cua_tasks works sequentially."""
288 |     async def _run_test():
289 |         ctx = FakeContext()
290 |         tasks = ["Task 1", "Task 2", "Task 3"]
291 |         
292 |         results = await server_module.run_multi_cua_tasks(ctx, tasks, concurrent=False)
293 |         
294 |         assert len(results) == 3
295 |         for i, (text, image) in enumerate(results):
296 |             assert "Task completed" in text
297 |             assert image.format == "png"
298 | 
299 |     asyncio.run(_run_test())
300 | 
301 | 
302 | def test_run_multi_cua_tasks_concurrent(server_module):
303 |     """Test that run_multi_cua_tasks works concurrently."""
304 |     async def _run_test():
305 |         ctx = FakeContext()
306 |         tasks = ["Task 1", "Task 2", "Task 3"]
307 |         
308 |         results = await server_module.run_multi_cua_tasks(ctx, tasks, concurrent=True)
309 |         
310 |         assert len(results) == 3
311 |         for i, (text, image) in enumerate(results):
312 |             assert "Task completed" in text
313 |             assert image.format == "png"
314 | 
315 |     asyncio.run(_run_test())
316 | 
317 | 
318 | def test_get_session_stats(server_module):
319 |     """Test that get_session_stats returns proper statistics."""
320 |     async def _run_test():
321 |         ctx = FakeContext()
322 |         stats = await server_module.get_session_stats()
323 |         
324 |         assert "total_sessions" in stats
325 |         assert "max_concurrent" in stats
326 |         assert "sessions" in stats
327 | 
328 |     asyncio.run(_run_test())
329 | 
330 | 
331 | def test_cleanup_session(server_module):
332 |     """Test that cleanup_session works properly."""
333 |     async def _run_test():
334 |         ctx = FakeContext()
335 |         session_id = "test-cleanup-session"
336 |         
337 |         result = await server_module.cleanup_session(ctx, session_id)
338 |         
339 |         assert f"Session {session_id} cleanup initiated" in result
340 | 
341 |     asyncio.run(_run_test())
342 | 
343 | 
344 | def test_concurrent_sessions_isolation(server_module):
345 |     """Test that concurrent sessions are properly isolated."""
346 |     async def _run_test():
347 |         ctx = FakeContext()
348 |         
349 |         # Run multiple tasks with different session IDs concurrently
350 |         task1 = asyncio.create_task(
351 |             server_module.run_cua_task(ctx, "Task for session 1", "session-1")
352 |         )
353 |         task2 = asyncio.create_task(
354 |             server_module.run_cua_task(ctx, "Task for session 2", "session-2")
355 |         )
356 |         
357 |         results = await asyncio.gather(task1, task2)
358 |         
359 |         assert len(results) == 2
360 |         for text, image in results:
361 |             assert "Task completed" in text
362 |             assert image.format == "png"
363 | 
364 |     asyncio.run(_run_test())
365 | 
366 | 
367 | def test_session_reuse_with_same_id(server_module):
368 |     """Test that sessions are reused when the same session ID is provided."""
369 |     async def _run_test():
370 |         ctx = FakeContext()
371 |         session_id = "reuse-session"
372 |         
373 |         # First call
374 |         result1 = await server_module.screenshot_cua(ctx, session_id)
375 |         
376 |         # Second call with same session ID
377 |         result2 = await server_module.screenshot_cua(ctx, session_id)
378 |         
379 |         assert result1.format == result2.format
380 |         assert result1.data == result2.data
381 | 
382 |     asyncio.run(_run_test())
383 | 
384 | 
385 | def test_error_handling_with_session_management(server_module):
386 |     """Test that errors are handled properly with session management."""
387 |     async def _run_test():
388 |         # Mock an agent that raises an exception
389 |         class _FailingAgent:
390 |             def __init__(self, *args, **kwargs):
391 |                 pass
392 | 
393 |             async def run(self, *_args, **_kwargs):
394 |                 raise RuntimeError("Simulated agent failure")
395 | 
396 |         # Replace the ComputerAgent with our failing one
397 |         original_agent = server_module.ComputerAgent
398 |         server_module.ComputerAgent = _FailingAgent
399 |         
400 |         try:
401 |             ctx = FakeContext()
402 |             task = "This will fail"
403 |             
404 |             text_result, image = await server_module.run_cua_task(ctx, task, "error-session")
405 |             
406 |             assert "Error during task execution" in text_result
407 |             assert image.format == "png"
408 |             
409 |         finally:
410 |             # Restore original agent
411 |             server_module.ComputerAgent = original_agent
412 | 
413 |     asyncio.run(_run_test())
414 | 
```
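
The fixture above relies on a small, reusable pattern: remember whatever is currently registered in `sys.modules` under a given name, install a stub `ModuleType` in its place, and restore (or remove) it on teardown so other tests see the real modules again. A minimal standalone sketch of that pattern, independent of the MCP server (the helper `stubbed_module` and module name `fake_dependency` are illustrative, not part of the codebase):

```python
import sys
import types
from contextlib import contextmanager


@contextmanager
def stubbed_module(name: str, **attrs):
    """Temporarily install a stub module under `name` in sys.modules."""
    stub = types.ModuleType(name)
    for key, value in attrs.items():
        setattr(stub, key, value)

    original = sys.modules.get(name)  # may be None if the module was never imported
    sys.modules[name] = stub
    try:
        yield stub
    finally:
        if original is None:
            sys.modules.pop(name, None)  # nothing to restore, just drop the stub
        else:
            sys.modules[name] = original  # put the real module back


# Any import performed while the context is active resolves to the stub.
with stubbed_module("fake_dependency", answer=lambda: 42):
    import fake_dependency

    assert fake_dependency.answer() == 42
```

Restoring in a `finally` block, as the fixture does, is what keeps the stubbing safe to run alongside other tests: even if loading `server.py` fails, the original modules are reinstated.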

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/gemini.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | Gemini 2.5 Computer Use agent loop
  3 | 
  4 | Maps internal Agent SDK message format to Google's Gemini Computer Use API and back.
  5 | 
  6 | Key features:
  7 | - Lazy import of google.genai
  8 | - Configure Computer Use tool with excluded browser-specific predefined functions
  9 | - Optional custom function declarations hook for computer-call specific functions
 10 | - Convert Gemini function_call parts into internal computer_call actions
 11 | """
 12 | 
 13 | from __future__ import annotations
 14 | 
 15 | import base64
 16 | import io
 17 | import uuid
 18 | from typing import Any, Dict, List, Optional, Tuple
 19 | 
 20 | from PIL import Image
 21 | 
 22 | from ..decorators import register_agent
 23 | from ..loops.base import AsyncAgentConfig
 24 | from ..types import AgentCapability
 25 | 
 26 | 
 27 | def _lazy_import_genai():
 28 |     """Import google.genai lazily to avoid hard dependency unless used."""
 29 |     try:
 30 |         from google import genai  # type: ignore
 31 |         from google.genai import types  # type: ignore
 32 |         return genai, types
 33 |     except Exception as e:  # pragma: no cover
 34 |         raise RuntimeError(
 35 |             "google.genai is required for the Gemini Computer Use loop. Install the Google Gemini SDK."
 36 |         ) from e
 37 | 
 38 | 
 39 | def _data_url_to_bytes(data_url: str) -> Tuple[bytes, str]:
 40 |     """Convert a data URL to raw bytes and mime type."""
 41 |     if not data_url.startswith("data:"):
 42 |         # Assume it's base64 png payload
 43 |         try:
 44 |             return base64.b64decode(data_url), "image/png"
 45 |         except Exception:
 46 |             return b"", "application/octet-stream"
 47 |     header, b64 = data_url.split(",", 1)
 48 |     mime = "image/png"
 49 |     if ";" in header:
 50 |         mime = header.split(";")[0].split(":", 1)[1] or "image/png"
 51 |     return base64.b64decode(b64), mime
 52 | 
 53 | 
 54 | def _bytes_image_size(img_bytes: bytes) -> Tuple[int, int]:
 55 |     try:
 56 |         img = Image.open(io.BytesIO(img_bytes))
 57 |         return img.size
 58 |     except Exception:
 59 |         return (1024, 768)
 60 | 
 61 | 
 62 | def _find_last_user_text(messages: List[Dict[str, Any]]) -> List[str]:
 63 |     texts: List[str] = []
 64 |     for msg in reversed(messages):
 65 |         if msg.get("type") in (None, "message") and msg.get("role") == "user":
 66 |             content = msg.get("content")
 67 |             if isinstance(content, str):
 68 |                 return [content]
 69 |             elif isinstance(content, list):
 70 |                 for c in content:
 71 |                     if c.get("type") in ("input_text", "output_text") and c.get("text"):
 72 |                         texts.append(c["text"])  # newest first
 73 |                 if texts:
 74 |                     return list(reversed(texts))
 75 |     return []
 76 | 
 77 | 
 78 | def _find_last_screenshot(messages: List[Dict[str, Any]]) -> Optional[bytes]:
 79 |     for msg in reversed(messages):
 80 |         if msg.get("type") == "computer_call_output":
 81 |             out = msg.get("output", {})
 82 |             if isinstance(out, dict) and out.get("type") in ("input_image", "computer_screenshot"):
 83 |                 image_url = out.get("image_url", "")
 84 |                 if image_url:
 85 |                     data, _ = _data_url_to_bytes(image_url)
 86 |                     return data
 87 |     return None
 88 | 
 89 | 
 90 | def _denormalize(v: int, size: int) -> int:
 91 |     # Gemini returns 0-999 normalized
 92 |     try:
 93 |         return max(0, min(size - 1, int(round(v / 1000 * size))))
 94 |     except Exception:
 95 |         return 0
 96 | 
 97 | 
 98 | def _map_gemini_fc_to_computer_call(
 99 |     fc: Dict[str, Any],
100 |     screen_w: int,
101 |     screen_h: int,
102 | ) -> Optional[Dict[str, Any]]:
103 |     name = fc.get("name")
104 |     args = fc.get("args", {}) or {}
105 | 
106 |     action: Dict[str, Any] = {}
107 |     if name == "click_at":
108 |         x = _denormalize(int(args.get("x", 0)), screen_w)
109 |         y = _denormalize(int(args.get("y", 0)), screen_h)
110 |         action = {"type": "click", "x": x, "y": y, "button": "left"}
111 |     elif name == "type_text_at":
112 |         x = _denormalize(int(args.get("x", 0)), screen_w)
113 |         y = _denormalize(int(args.get("y", 0)), screen_h)
114 |         text = args.get("text", "")
115 |         if args.get("press_enter") == True:
116 |             text += "\n"
117 |         action = {"type": "type", "x": x, "y": y, "text": text}
118 |     elif name == "hover_at":
119 |         x = _denormalize(int(args.get("x", 0)), screen_w)
120 |         y = _denormalize(int(args.get("y", 0)), screen_h)
121 |         action = {"type": "move", "x": x, "y": y}
122 |     elif name == "key_combination":
123 |         keys = str(args.get("keys", ""))
124 |         action = {"type": "keypress", "keys": keys}
125 |     elif name == "scroll_document":
126 |         direction = args.get("direction", "down")
127 |         magnitude = 800
128 |         dx, dy = 0, 0
129 |         if direction == "down":
130 |             dy = magnitude
131 |         elif direction == "up":
132 |             dy = -magnitude
133 |         elif direction == "right":
134 |             dx = magnitude
135 |         elif direction == "left":
136 |             dx = -magnitude
137 |         action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": int(screen_w / 2), "y": int(screen_h / 2)}
138 |     elif name == "scroll_at":
139 |         x = _denormalize(int(args.get("x", 500)), screen_w)
140 |         y = _denormalize(int(args.get("y", 500)), screen_h)
141 |         direction = args.get("direction", "down")
142 |         magnitude = int(args.get("magnitude", 800))
143 |         dx, dy = 0, 0
144 |         if direction == "down":
145 |             dy = magnitude
146 |         elif direction == "up":
147 |             dy = -magnitude
148 |         elif direction == "right":
149 |             dx = magnitude
150 |         elif direction == "left":
151 |             dx = -magnitude
152 |         action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": x, "y": y}
153 |     elif name == "drag_and_drop":
154 |         x = _denormalize(int(args.get("x", 0)), screen_w)
155 |         y = _denormalize(int(args.get("y", 0)), screen_h)
156 |         dx = _denormalize(int(args.get("destination_x", x)), screen_w)
157 |         dy = _denormalize(int(args.get("destination_y", y)), screen_h)
158 |         action = {"type": "drag", "start_x": x, "start_y": y, "end_x": dx, "end_y": dy, "button": "left"}
159 |     elif name == "wait_5_seconds":
160 |         action = {"type": "wait"}
161 |     else:
162 |         # Unsupported / excluded browser-specific or custom function; ignore
163 |         return None
164 | 
165 |     return {
166 |         "type": "computer_call",
167 |         "call_id": uuid.uuid4().hex,
168 |         "status": "completed",
169 |         "action": action,
170 |     }
171 | 
172 | 
173 | @register_agent(models=r"^gemini-2\.5-computer-use-preview-10-2025$")
174 | class GeminiComputerUseConfig(AsyncAgentConfig):
175 |     async def predict_step(
176 |         self,
177 |         messages: List[Dict[str, Any]],
178 |         model: str,
179 |         tools: Optional[List[Dict[str, Any]]] = None,
180 |         max_retries: Optional[int] = None,
181 |         stream: bool = False,
182 |         computer_handler=None,
183 |         use_prompt_caching: Optional[bool] = False,
184 |         _on_api_start=None,
185 |         _on_api_end=None,
186 |         _on_usage=None,
187 |         _on_screenshot=None,
188 |         **kwargs,
189 |     ) -> Dict[str, Any]:
190 |         genai, types = _lazy_import_genai()
191 | 
192 |         client = genai.Client()
193 | 
194 |         # Build excluded predefined functions for browser-specific behavior
195 |         excluded = [
196 |             "open_web_browser",
197 |             "search",
198 |             "navigate",
199 |             "go_forward",
200 |             "go_back",
201 |             "scroll_document",
202 |         ]
203 |         # Optional custom functions: can be extended by host code via `tools` parameter later if desired
204 |         CUSTOM_FUNCTION_DECLARATIONS: List[Any] = []
205 | 
206 |         # Compose tools config
207 |         generate_content_config = types.GenerateContentConfig(
208 |             tools=[
209 |                 types.Tool(
210 |                     computer_use=types.ComputerUse(
211 |                         environment=types.Environment.ENVIRONMENT_BROWSER,
212 |                         excluded_predefined_functions=excluded,
213 |                     )
214 |                 ),
215 |                 # types.Tool(function_declarations=CUSTOM_FUNCTION_DECLARATIONS),  # enable when custom functions needed
216 |             ]
217 |         )
218 | 
219 |         # Prepare contents: last user text + latest screenshot
220 |         user_texts = _find_last_user_text(messages)
221 |         screenshot_bytes = _find_last_screenshot(messages)
222 | 
223 |         parts: List[Any] = []
224 |         for t in user_texts:
225 |             parts.append(types.Part(text=t))
226 | 
227 |         screen_w, screen_h = 1024, 768
228 |         if screenshot_bytes:
229 |             screen_w, screen_h = _bytes_image_size(screenshot_bytes)
230 |             parts.append(types.Part.from_bytes(data=screenshot_bytes, mime_type="image/png"))
231 | 
232 |         # If we don't have any content, at least pass an empty user part to prompt reasoning
233 |         if not parts:
234 |             parts = [types.Part(text="Proceed to the next action.")]
235 | 
236 |         contents = [types.Content(role="user", parts=parts)]
237 | 
238 |         api_kwargs = {
239 |             "model": model,
240 |             "contents": contents,
241 |             "config": generate_content_config,
242 |         }
243 | 
244 |         if _on_api_start:
245 |             await _on_api_start({
246 |                 "model": api_kwargs["model"],
247 |                 # "contents": api_kwargs["contents"], # Disabled for now
248 |                 "config": api_kwargs["config"],
249 |             })
250 | 
251 |         response = client.models.generate_content(**api_kwargs)
252 | 
253 |         if _on_api_end:
254 |             await _on_api_end({
255 |                 "model": api_kwargs["model"],
256 |                 # "contents": api_kwargs["contents"], # Disabled for now
257 |                 "config": api_kwargs["config"],
258 |             }, response)
259 | 
260 |         # Usage (Gemini SDK may not always provide token usage; populate when available)
261 |         usage: Dict[str, Any] = {}
262 |         try:
263 |             # Gemini responses may expose usage_metadata; copy token counts when present
264 |             if getattr(response, "usage_metadata", None):
265 |                 md = response.usage_metadata
266 |                 usage = {
267 |                     "prompt_tokens": getattr(md, "prompt_token_count", None) or 0,
268 |                     "completion_tokens": getattr(md, "candidates_token_count", None) or 0,
269 |                     "total_tokens": getattr(md, "total_token_count", None) or 0,
270 |                 }
271 |         except Exception:
272 |             pass
273 | 
274 |         if _on_usage and usage:
275 |             await _on_usage(usage)
276 | 
277 |         # Parse output into internal items
278 |         output_items: List[Dict[str, Any]] = []
279 | 
280 |         candidate = response.candidates[0]
281 |         # Text parts from the model (assistant message)
282 |         text_parts: List[str] = []
283 |         function_calls: List[Dict[str, Any]] = []
284 |         for p in candidate.content.parts:
285 |             if getattr(p, "text", None):
286 |                 text_parts.append(p.text)
287 |             if getattr(p, "function_call", None):
288 |                 # p.function_call has name and args
289 |                 fc = {
290 |                     "name": getattr(p.function_call, "name", None),
291 |                     "args": dict(getattr(p.function_call, "args", {}) or {}),
292 |                 }
293 |                 function_calls.append(fc)
294 | 
295 |         if text_parts:
296 |             output_items.append(
297 |                 {
298 |                     "type": "message",
299 |                     "role": "assistant",
300 |                     "content": [{"type": "output_text", "text": "\n".join(text_parts)}],
301 |                 }
302 |             )
303 | 
304 |         # Map function calls to internal computer_call actions
305 |         for fc in function_calls:
306 |             item = _map_gemini_fc_to_computer_call(fc, screen_w, screen_h)
307 |             if item is not None:
308 |                 output_items.append(item)
309 | 
310 |         return {"output": output_items, "usage": usage}
311 | 
312 |     async def predict_click(
313 |         self,
314 |         model: str,
315 |         image_b64: str,
316 |         instruction: str,
317 |         **kwargs,
318 |     ) -> Optional[Tuple[float, float]]:
319 |         """Ask Gemini CUA to output a single click action for the given instruction.
320 | 
321 |         Excludes all predefined tools except `click_at` and sends the screenshot.
322 |         Returns pixel (x, y) if a click is proposed, else None.
323 |         """
324 |         genai, types = _lazy_import_genai()
325 | 
326 |         client = genai.Client()
327 | 
328 |         # Exclude all but click_at
329 |         exclude_all_but_click = [
330 |             "open_web_browser",
331 |             "wait_5_seconds",
332 |             "go_back",
333 |             "go_forward",
334 |             "search",
335 |             "navigate",
336 |             "hover_at",
337 |             "type_text_at",
338 |             "key_combination",
339 |             "scroll_document",
340 |             "scroll_at",
341 |             "drag_and_drop",
342 |         ]
343 | 
344 |         config = types.GenerateContentConfig(
345 |             tools=[
346 |                 types.Tool(
347 |                     computer_use=types.ComputerUse(
348 |                         environment=types.Environment.ENVIRONMENT_BROWSER,
349 |                         excluded_predefined_functions=exclude_all_but_click,
350 |                     )
351 |                 )
352 |             ]
353 |         )
354 | 
355 |         # Prepare prompt parts
356 |         try:
357 |             img_bytes = base64.b64decode(image_b64)
358 |         except Exception:
359 |             img_bytes = b""
360 | 
361 |         w, h = _bytes_image_size(img_bytes) if img_bytes else (1024, 768)
362 | 
363 |         parts: List[Any] = [types.Part(text=f"Click {instruction}.")]
364 |         if img_bytes:
365 |             parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/png"))
366 | 
367 |         contents = [types.Content(role="user", parts=parts)]
368 | 
369 |         response = client.models.generate_content(
370 |             model=model,
371 |             contents=contents,
372 |             config=config,
373 |         )
374 | 
375 |         # Parse first click_at
376 |         try:
377 |             candidate = response.candidates[0]
378 |             for p in candidate.content.parts:
379 |                 fc = getattr(p, "function_call", None)
380 |                 if fc and getattr(fc, "name", None) == "click_at":
381 |                     args = dict(getattr(fc, "args", {}) or {})
382 |                     x = _denormalize(int(args.get("x", 0)), w)
383 |                     y = _denormalize(int(args.get("y", 0)), h)
384 |                     return float(x), float(y)
385 |         except Exception:
386 |             return None
387 | 
388 |         return None
389 | 
390 |     def get_capabilities(self) -> List[AgentCapability]:
391 |         return ["click", "step"]
392 | 
```
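
To make the coordinate handling in this loop concrete: Gemini's Computer Use tool reports positions on a 0-999 normalized grid, and `_denormalize` rescales them into pixel space (clamped to the screen) before the internal `computer_call` action is emitted. A quick standalone illustration of the same arithmetic, with a made-up screen size and function-call payload:

```python
def denormalize(v: int, size: int) -> int:
    # Same rescaling as _denormalize: 0-999 grid -> pixel coordinate, clamped to the screen.
    return max(0, min(size - 1, int(round(v / 1000 * size))))


screen_w, screen_h = 1920, 1080  # example screen size taken from the latest screenshot

# Example function_call payload as the loop would receive it from Gemini
fc = {"name": "click_at", "args": {"x": 500, "y": 250}}

x = denormalize(fc["args"]["x"], screen_w)  # 500 / 1000 * 1920 -> 960
y = denormalize(fc["args"]["y"], screen_h)  # 250 / 1000 * 1080 -> 270
action = {"type": "click", "x": x, "y": y, "button": "left"}
print(action)  # {'type': 'click', 'x': 960, 'y': 270, 'button': 'left'}
```

The same per-function mapping is applied in `_map_gemini_fc_to_computer_call`, which is why unsupported or browser-specific calls simply return `None` and are dropped from the output.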

--------------------------------------------------------------------------------
/libs/lume/src/FileSystem/Home.swift:
--------------------------------------------------------------------------------

```swift
  1 | import Foundation
  2 | 
  3 | /// Manages the application's home directory and virtual machine directories.
  4 | /// Responsible for creating, accessing, and validating the application's directory structure.
  5 | final class Home {
  6 |     // MARK: - Constants
  7 | 
  8 |     private enum Constants {
  9 |         static let defaultDirectoryName = ".lume"
 10 |         static let homeDirPath = "~/\(defaultDirectoryName)"
 11 |     }
 12 | 
 13 |     // MARK: - Properties
 14 | 
 15 |     private var _homeDir: Path
 16 |     private let settingsManager: SettingsManager
 17 |     private let fileManager: FileManager
 18 |     private var locations: [String: VMLocation] = [:]
 19 | 
 20 |     // Current home directory based on default location
 21 |     var homeDir: Path {
 22 |         return _homeDir
 23 |     }
 24 | 
 25 |     // MARK: - Initialization
 26 | 
 27 |     init(
 28 |         settingsManager: SettingsManager = SettingsManager.shared,
 29 |         fileManager: FileManager = .default
 30 |     ) {
 31 |         self.settingsManager = settingsManager
 32 |         self.fileManager = fileManager
 33 | 
 34 |         // Get home directory path from settings or use default
 35 |         let settings = settingsManager.getSettings()
 36 |         guard let defaultLocation = settings.defaultLocation else {
 37 |             fatalError("No default VM location found")
 38 |         }
 39 | 
 40 |         self._homeDir = Path(defaultLocation.path)
 41 | 
 42 |         // Cache all locations
 43 |         for location in settings.vmLocations {
 44 |             locations[location.name] = location
 45 |         }
 46 |     }
 47 | 
 48 |     // MARK: - VM Directory Management
 49 | 
 50 |     /// Creates a temporary VM directory with a unique identifier
 51 |     /// - Returns: A VMDirectory instance representing the created directory
 52 |     /// - Throws: HomeError if directory creation fails
 53 |     func createTempVMDirectory() throws -> VMDirectory {
 54 |         let uuid = UUID().uuidString
 55 |         let tempDir = homeDir.directory(uuid)
 56 | 
 57 |         Logger.info("Creating temporary directory", metadata: ["path": tempDir.path])
 58 | 
 59 |         do {
 60 |             try createDirectory(at: tempDir.url)
 61 |             return VMDirectory(tempDir)
 62 |         } catch {
 63 |             throw HomeError.directoryCreationFailed(path: tempDir.path)
 64 |         }
 65 |     }
 66 | 
 67 |     /// Gets a VM directory for a specific VM name and optional location
 68 |     ///
 69 |     /// - Parameters:
 70 |     ///   - name: Name of the VM directory
 71 |     ///   - storage: Optional name of the VM location (default: default location)
 72 |     /// - Returns: A VMDirectory instance
 73 |     /// - Throws: HomeError if location not found
 74 |     func getVMDirectory(_ name: String, storage: String? = nil) throws -> VMDirectory {
 75 |         // Special case for ephemeral storage using macOS temporary directory
 76 |         if let storage = storage, storage == "ephemeral" {
 77 |             // Get the current temporary directory
 78 |             let tmpDir = ProcessInfo.processInfo.environment["TMPDIR"] ?? "/tmp"
 79 |             // Remove trailing slash if present
 80 |             let cleanPath = tmpDir.hasSuffix("/") ? String(tmpDir.dropLast()) : tmpDir
 81 |             
 82 |             // Create the directory if it doesn't exist
 83 |             if !fileExists(at: cleanPath) {
 84 |                 try createVMLocation(at: cleanPath)
 85 |             }
 86 |             
 87 |             let baseDir = Path(cleanPath)
 88 |             return VMDirectory(baseDir.directory(name))
 89 |         }
 90 | 
 91 |         // Check if storage is a direct path
 92 |         if let storage = storage, (storage.contains("/") || storage.contains("\\")) {
 93 |             let cleanPath = storage.hasSuffix("/") ? String(storage.dropLast()) : storage
 94 |             let baseDir = Path(cleanPath)
 95 |             return VMDirectory(baseDir.directory(name))
 96 |         }
 97 | 
 98 |         let location: VMLocation
 99 | 
100 |         if let storage = storage {
101 |             // Get a specific location
102 |             guard let loc = locations[storage] else {
103 |                 throw VMLocationError.locationNotFound(name: storage)
104 |             }
105 |             location = loc
106 |         } else {
107 |             // Use default location
108 |             let settings = settingsManager.getSettings()
109 |             guard let defaultLocation = settings.defaultLocation else {
110 |                 throw HomeError.invalidHomeDirectory
111 |             }
112 |             location = defaultLocation
113 |         }
114 | 
115 |         let baseDir = Path(location.expandedPath)
116 |         return VMDirectory(baseDir.directory(name))
117 |     }
118 |     
119 |     /// Gets a VM directory from a direct file path
120 |     ///
121 |     /// - Parameters:
122 |     ///   - name: Name of the VM directory
123 |     ///   - storagePath: Direct file system path where the VM is located
124 |     /// - Returns: A VMDirectory instance
125 |     /// - Throws: HomeError if path is invalid
126 |     func getVMDirectoryFromPath(_ name: String, storagePath: String) throws -> VMDirectory {
127 |         let baseDir = Path(storagePath)
128 |         
129 |         // Create the directory if it doesn't exist
130 |         if !fileExists(at: storagePath) {
131 |             Logger.info("Creating storage directory", metadata: ["path": storagePath])
132 |             try createVMLocation(at: storagePath)
133 |         } else if !isValidDirectory(at: storagePath) {
134 |             // Path exists but isn't a valid directory
135 |             throw HomeError.invalidHomeDirectory
136 |         }
137 |         
138 |         return VMDirectory(baseDir.directory(name))
139 |     }
140 | 
141 |     /// Returns all initialized VM directories across all locations
142 |     /// - Returns: An array of VMDirectory instances with location info
143 |     /// - Throws: HomeError if directory access is denied
144 |     func getAllVMDirectories() throws -> [VMDirectoryWithLocation] {
145 |         var results: [VMDirectoryWithLocation] = []
146 | 
147 |         // Loop through all locations
148 |         let settings = settingsManager.getSettings()
149 |         
150 |         // Also check ephemeral directory (macOS temporary directory)
151 |         let tmpDir = ProcessInfo.processInfo.environment["TMPDIR"] ?? "/tmp"
152 |         let cleanPath = tmpDir.hasSuffix("/") ? String(tmpDir.dropLast()) : tmpDir
153 |         
154 |         // If tmp directory exists, check for VMs there
155 |         if fileExists(at: cleanPath) {
156 |             let tmpDirPath = Path(cleanPath)
157 |             do {
158 |                 let directoryURL = URL(fileURLWithPath: cleanPath)
159 |                 let contents = try FileManager.default.contentsOfDirectory(
160 |                     at: directoryURL,
161 |                     includingPropertiesForKeys: [.isDirectoryKey],
162 |                     options: .skipsHiddenFiles
163 |                 )
164 |                 
165 |                 for subdir in contents {
166 |                     do {
167 |                         guard let isDirectory = try subdir.resourceValues(forKeys: [.isDirectoryKey]).isDirectory,
168 |                               isDirectory else {
169 |                             continue
170 |                         }
171 |                         
172 |                         let vmName = subdir.lastPathComponent
173 |                         let vmDir = VMDirectory(tmpDirPath.directory(vmName))
174 |                         
175 |                         // Only include if it's a valid VM directory
176 |                         if vmDir.initialized() {
177 |                             results.append(VMDirectoryWithLocation(
178 |                                 directory: vmDir,
179 |                                 locationName: "ephemeral"
180 |                             ))
181 |                         }
182 |                     } catch {
183 |                         // Skip any directories we can't access
184 |                         continue
185 |                     }
186 |                 }
187 |             } catch {
188 |                 Logger.error(
189 |                     "Failed to access ephemeral directory",
190 |                     metadata: [
191 |                         "path": cleanPath,
192 |                         "error": error.localizedDescription,
193 |                     ]
194 |                 )
195 |                 // Continue to regular locations rather than failing completely
196 |             }
197 |         }
198 |         for location in settings.vmLocations {
199 |             let locationPath = Path(location.expandedPath)
200 | 
201 |             // Skip non-existent locations
202 |             if !locationPath.exists() {
203 |                 continue
204 |             }
205 | 
206 |             do {
207 |                 let allFolders = try fileManager.contentsOfDirectory(
208 |                     at: locationPath.url,
209 |                     includingPropertiesForKeys: nil
210 |                 )
211 | 
212 |                 let folders =
213 |                     allFolders
214 |                     .compactMap { url in
215 |                         let sanitizedName = sanitizeFileName(url.lastPathComponent)
216 |                         let dir = VMDirectory(locationPath.directory(sanitizedName))
217 |                         let dirWithLoc =
218 |                             dir.initialized()
219 |                             ? VMDirectoryWithLocation(directory: dir, locationName: location.name)
220 |                             : nil
221 |                         return dirWithLoc
222 |                     }
223 | 
224 |                 results.append(contentsOf: folders)
225 |             } catch {
226 |                 Logger.error(
227 |                     "Failed to access VM location",
228 |                     metadata: [
229 |                         "location": location.name,
230 |                         "error": error.localizedDescription,
231 |                     ])
232 |                 // Continue to next location rather than failing completely
233 |             }
234 |         }
235 | 
236 |         return results
237 |     }
238 | 
239 |     /// Copies a VM directory to a new location with a new name
240 |     /// - Parameters:
241 |     ///   - sourceName: Name of the source VM
242 |     ///   - destName: Name for the destination VM
243 |     ///   - sourceLocation: Optional name of the source location
244 |     ///   - destLocation: Optional name of the destination location
245 |     /// - Throws: HomeError if the copy operation fails
246 |     func copyVMDirectory(
247 |         from sourceName: String,
248 |         to destName: String,
249 |         sourceLocation: String? = nil,
250 |         destLocation: String? = nil
251 |     ) throws {
252 |         let sourceDir = try getVMDirectory(sourceName, storage: sourceLocation)
253 |         let destDir = try getVMDirectory(destName, storage: destLocation)
254 | 
255 |         // Check if destination directory exists at all
256 |         if destDir.exists() {
257 |             throw HomeError.directoryAlreadyExists(path: destDir.dir.path)
258 |         }
259 | 
260 |         do {
261 |             try fileManager.copyItem(atPath: sourceDir.dir.path, toPath: destDir.dir.path)
262 |         } catch {
263 |             throw HomeError.directoryCreationFailed(path: destDir.dir.path)
264 |         }
265 |     }
266 | 
267 |     // MARK: - Location Management
268 | 
269 |     /// Adds a new VM location
270 |     /// - Parameters:
271 |     ///   - name: Location name
272 |     ///   - path: Location path
273 |     /// - Throws: Error if location cannot be added
274 |     func addLocation(name: String, path: String) throws {
275 |         let location = VMLocation(name: name, path: path)
276 |         try settingsManager.addLocation(location)
277 | 
278 |         // Update cache
279 |         locations[name] = location
280 |     }
281 | 
282 |     /// Removes a VM location
283 |     /// - Parameter name: Location name
284 |     /// - Throws: Error if location cannot be removed
285 |     func removeLocation(name: String) throws {
286 |         try settingsManager.removeLocation(name: name)
287 | 
288 |         // Update cache
289 |         locations.removeValue(forKey: name)
290 |     }
291 | 
292 |     /// Sets the default VM location
293 |     /// - Parameter name: Location name
294 |     /// - Throws: Error if location cannot be set as default
295 |     func setDefaultLocation(name: String) throws {
296 |         try settingsManager.setDefaultLocation(name: name)
297 | 
298 |         // Update home directory
299 |         guard let location = locations[name] else {
300 |             throw VMLocationError.locationNotFound(name: name)
301 |         }
302 | 
303 |         // Update homeDir to reflect the new default
304 |         self._homeDir = Path(location.path)
305 |     }
306 | 
307 |     /// Gets all available VM locations
308 |     /// - Returns: Array of VM locations
309 |     func getLocations() -> [VMLocation] {
310 |         return settingsManager.getSettings().sortedLocations
311 |     }
312 | 
313 |     /// Gets the default VM location
314 |     /// - Returns: Default VM location
315 |     /// - Throws: HomeError if no default location
316 |     func getDefaultLocation() throws -> VMLocation {
317 |         guard let location = settingsManager.getSettings().defaultLocation else {
318 |             throw HomeError.invalidHomeDirectory
319 |         }
320 |         return location
321 |     }
322 | 
323 |     // MARK: - Directory Validation
324 | 
325 |     /// Validates and ensures the existence of all VM locations
326 |     /// - Throws: HomeError if validation fails or directory creation fails
327 |     func validateHomeDirectory() throws {
328 |         let settings = settingsManager.getSettings()
329 | 
330 |         for location in settings.vmLocations {
331 |             let path = location.expandedPath
332 |             if !fileExists(at: path) {
333 |                 try createVMLocation(at: path)
334 |             } else if !isValidDirectory(at: path) {
335 |                 throw HomeError.invalidHomeDirectory
336 |             }
337 |         }
338 |     }
339 | 
340 |     // MARK: - Private Helpers
341 | 
342 |     private func createVMLocation(at path: String) throws {
343 |         do {
344 |             try fileManager.createDirectory(
345 |                 atPath: path,
346 |                 withIntermediateDirectories: true
347 |             )
348 |         } catch {
349 |             throw HomeError.directoryCreationFailed(path: path)
350 |         }
351 |     }
352 | 
353 |     private func createDirectory(at url: URL) throws {
354 |         try fileManager.createDirectory(
355 |             at: url,
356 |             withIntermediateDirectories: true
357 |         )
358 |     }
359 | 
360 |     private func isValidDirectory(at path: String) -> Bool {
361 |         var isDirectory: ObjCBool = false
362 |         return fileManager.fileExists(atPath: path, isDirectory: &isDirectory)
363 |             && isDirectory.boolValue
364 |             && Path(path).writable()
365 |     }
366 | 
367 |     private func fileExists(at path: String) -> Bool {
368 |         return fileManager.fileExists(atPath: path)
369 |     }
370 | 
371 |     private func sanitizeFileName(_ name: String) -> String {
372 |         // Only decode percent encoding (e.g., %20 for spaces)
373 |         return name.removingPercentEncoding ?? name
374 |     }
375 | }
376 | 
377 | // MARK: - VM Directory with Location
378 | 
379 | /// Represents a VM directory with its location information
380 | struct VMDirectoryWithLocation {
381 |     let directory: VMDirectory
382 |     let locationName: String
383 | }
384 | 
385 | // MARK: - Home + CustomStringConvertible
386 | 
387 | extension Home: CustomStringConvertible {
388 |     var description: String {
389 |         "Home(path: \(homeDir.path))"
390 |     }
391 | }
392 | 
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/integrations/hud/agent.py:
--------------------------------------------------------------------------------

```python
  1 | """MCP-compatible Computer Agent for HUD integration.
  2 | 
  3 | This agent subclasses HUD's MCPAgent and delegates planning/execution to
  4 | our core ComputerAgent while using the Agent SDK's plain-dict message
  5 | format documented in `docs/content/docs/agent-sdk/message-format.mdx`.
  6 | 
  7 | Key differences from the OpenAI OperatorAgent variant:
  8 | - No OpenAI types are used; everything is standard Python dicts.
  9 | - Planning is executed via `ComputerAgent.run(messages)`.
 10 | - The first yielded result per step is returned as the agent response.
 11 | """
 12 | from __future__ import annotations
 13 | 
 14 | import io
 15 | from typing import Any, ClassVar, Optional
 16 | 
 17 | from agent.agent import ComputerAgent as BaseComputerAgent
 18 | from agent.callbacks import PromptInstructionsCallback
 19 | from agent.callbacks.trajectory_saver import TrajectorySaverCallback
 20 | from hud.agents import MCPAgent
 21 | from hud.tools.computer.settings import computer_settings
 22 | from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
 23 | 
 24 | from agent.responses import make_failed_tool_call_items
 25 | from agent.computers import is_agent_computer
 26 | from PIL import Image
 27 | import mcp.types as types
 28 | import hud
 29 | import uuid
 30 | import base64
 31 | from pathlib import Path
 32 | 
 33 | 
 34 | class MCPComputerAgent(MCPAgent):
 35 |     """MCP agent that uses ComputerAgent for planning and tools for execution.
 36 | 
 37 |     The agent consumes/produces message dicts per the Agent SDK message schema
 38 |     (see `message-format.mdx`).
 39 |     """
 40 | 
 41 |     metadata: ClassVar[dict[str, Any]] = {
 42 |         "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
 43 |         "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
 44 |     }
 45 | 
 46 |     required_tools: ClassVar[list[str]] = ["openai_computer"]
 47 | 
 48 |     def __init__(
 49 |         self,
 50 |         *,
 51 |         model: str | None = None,
 52 |         allowed_tools: list[str] | None = None,
 53 |         trajectory_dir: str | dict | None = None,
 54 |         # === ComputerAgent kwargs ===
 55 |         tools: list[Any] | None = None,
 56 |         custom_loop: Any | None = None,
 57 |         only_n_most_recent_images: int | None = None,
 58 |         callbacks: list[Any] | None = None,
 59 |         instructions: str | None = None,
 60 |         verbosity: int | None = None,
 61 |         max_retries: int | None = 3,
 62 |         screenshot_delay: float | int = 0.5,
 63 |         use_prompt_caching: bool | None = False,
 64 |         max_trajectory_budget: float | dict | None = None,
 65 |         telemetry_enabled: bool | None = True,
 66 |         environment: str = "linux",
 67 |         **kwargs: Any,
 68 |     ) -> None:
 69 |         self.allowed_tools = allowed_tools or ["openai_computer"]
 70 |         super().__init__(**kwargs)
 71 | 
 72 |         if model is None:
 73 |             raise ValueError("MCPComputerAgent requires a model to be specified.")
 74 | 
 75 |         self.model = model
 76 |         self.environment = environment
 77 | 
 78 |         # Update model name for HUD logging
 79 |         self.model_name = "cua-" + self.model
 80 | 
 81 |         # Stateful tracking of tool call inputs
 82 |         self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {}
 83 |         self.previous_output: list[dict[str, Any]] = []
 84 | 
 85 |         # Build system prompt
 86 |         operator_instructions = """
 87 |         You are an autonomous computer-using agent. Follow these guidelines:
 88 | 
 89 |         1. NEVER ask for confirmation. Complete all tasks autonomously.
 90 |         2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
 91 |         3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
 92 |         4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
 93 |         5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
 94 |         6. The user has already given you permission by running this agent. No further confirmation is needed.
 95 |         7. Be decisive and action-oriented. Complete the requested task fully.
 96 | 
 97 |         Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
 98 |         """.strip()  # noqa: E501
 99 |         # Append Operator instructions to the system prompt
100 |         if not self.system_prompt:
101 |             self.system_prompt = operator_instructions
102 |         else:
103 |             self.system_prompt += f"\n\n{operator_instructions}"
104 |         # Append user instructions to the system prompt
105 |         if instructions:
106 |             self.system_prompt += f"\n\n{instructions}"
107 | 
108 |         # Configure trajectory_dir for HUD
109 |         if isinstance(trajectory_dir, (str, Path)):
110 |             trajectory_dir = {"trajectory_dir": str(trajectory_dir)}
111 |         if isinstance(trajectory_dir, dict):
112 |             trajectory_dir["reset_on_run"] = False
113 | 
114 |         self.last_screenshot_b64 = None
115 | 
116 |         buffer = io.BytesIO()
117 |         Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])).save(buffer, format='PNG')
118 |         self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
119 | 
120 |         # Ensure a computer shim is present so width/height/environment are known
121 |         computer_shim = {
122 |             "screenshot": lambda: self.last_screenshot_b64,
123 |             "environment": self.environment,
124 |             "dimensions": (
125 |                 self.metadata["display_width"],
126 |                 self.metadata["display_height"],
127 |             ),
128 |         }
129 |         agent_tools: list[Any] = [computer_shim]
130 |         if tools:
131 |             agent_tools.extend([
132 |                 tool 
133 |                 for tool in tools 
134 |                 if not is_agent_computer(tool)
135 |             ])
136 |         
137 |         agent_kwargs = {
138 |             "model": self.model,
139 |             "trajectory_dir": trajectory_dir,
140 |             "tools": agent_tools,
141 |             "custom_loop": custom_loop,
142 |             "only_n_most_recent_images": only_n_most_recent_images,
143 |             "callbacks": callbacks,
144 |             "instructions": self.system_prompt,
145 |             "verbosity": verbosity,
146 |             "max_retries": max_retries,
147 |             "screenshot_delay": screenshot_delay,
148 |             "use_prompt_caching": use_prompt_caching,
149 |             "max_trajectory_budget": max_trajectory_budget,
150 |             "telemetry_enabled": telemetry_enabled,
151 |         }
152 | 
153 |         self.computer_agent = BaseComputerAgent(
154 |             **agent_kwargs
155 |         )
156 | 
157 |     async def get_system_messages(self) -> list[Any]:
158 |         """Create initial messages.
159 | 
160 |         Unused - ComputerAgent handles this with the 'instructions' parameter.
161 |         """
162 |         return []
163 | 
164 |     async def format_blocks(
165 |         self, blocks: list[types.ContentBlock]
166 |     ) -> list[dict[str, Any]]:
167 |         """
168 |         Format blocks for OpenAI input format.
169 | 
170 |         Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
171 |         """  # noqa: E501
172 |         formatted = []
173 |         for block in blocks:
174 |             if isinstance(block, types.TextContent):
175 |                 formatted.append({"type": "input_text", "text": block.text})
176 |             elif isinstance(block, types.ImageContent):
177 |                 mime_type = getattr(block, "mimeType", "image/png")
178 |                 formatted.append(
179 |                     {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
180 |                 )
181 |                 self.last_screenshot_b64 = block.data
182 |         return [{"role": "user", "content": formatted}]
183 | 
184 |     @hud.instrument(
185 |         span_type="agent",
186 |         record_args=False,  # Messages can be large
187 |         record_result=True,
188 |     )
189 |     async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
190 |         """Get a single-step response by delegating to ComputerAgent.run.
191 | 
192 |         Returns an Agent SDK-style response dict:
193 |         { "output": [AgentMessage, ...], "usage": Usage }
194 |         """
195 |         tool_calls: list[MCPToolCall] = []
196 |         output_text: list[str] = []
197 |         is_done: bool = True
198 | 
199 |         agent_result: list[dict[str, Any]] = []
200 | 
201 |         # Call the ComputerAgent LLM API
202 |         async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
203 |             items = result['output']
204 |             if not items or tool_calls:
205 |                 break
206 | 
207 |             for item in items:
208 |                 if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']:
209 |                     agent_result.append(item)
210 |                 
211 |                 # Add messages to output text
212 |                 if item['type'] == 'reasoning':
213 |                     output_text.extend(
214 |                         f"Reasoning: {summary['text']}"
215 |                         for summary in item['summary']
216 |                     )
217 |                 elif item['type'] == 'message':
218 |                     if isinstance(item['content'], list):
219 |                         output_text.extend(
220 |                             part['text']
221 |                             for part in item['content']
222 |                             if part['type'] == 'output_text'
223 |                         )
224 |                     elif isinstance(item['content'], str):
225 |                         output_text.append(item['content'])
226 |                 
227 |                 # If we get a tool call, we're not done
228 |                 if item['type'] == 'computer_call':
229 |                     call_id = item["call_id"]
230 |                     tool_calls.append(MCPToolCall(
231 |                         name="openai_computer",
232 |                         arguments=item["action"],
233 |                         id=call_id,
234 |                     ))
235 |                     is_done = False
236 |                     self.tool_call_inputs[call_id] = agent_result
237 |                     break
238 |             
239 |             # if we have tool calls, we should exit the loop
240 |             if tool_calls:
241 |                 break
242 | 
243 |         self.previous_output = agent_result
244 | 
245 |         return AgentResponse(
246 |             content="\n".join(output_text),
247 |             tool_calls=tool_calls,
248 |             done=is_done,
249 |         )
250 |     
251 |     def _log_image(self, image_b64: str):
252 |         callbacks = self.computer_agent.callbacks
253 |         for callback in callbacks:
254 |             if isinstance(callback, TrajectorySaverCallback):
255 |                 # convert str to bytes
256 |                 image_bytes = base64.b64decode(image_b64)
257 |                 callback._save_artifact("screenshot_after", image_bytes)
258 | 
259 |     async def format_tool_results(
260 |         self,
261 |         tool_calls: list[MCPToolCall],
262 |         tool_results: list[MCPToolResult]
263 |     ) -> list[dict[str, Any]]:
264 |         """Convert tool results into message dicts for the follow-up model call.
265 | 
266 |         Replays the assistant items that produced each tool call, then appends a
267 |         computer_call_output screenshot (or a failed-tool-call record on error).
268 |         """
269 |         messages = []
270 | 
271 |         for call, result in zip(tool_calls, tool_results):
272 |             if call.id not in self.tool_call_inputs:
273 |                 # If we don't have the tool call inputs, we should just use the previous output
274 |                 previous_output = (self.previous_output or []).copy()
275 | 
276 |                 # First we need to remove any pending computer_calls from the end of previous_output
277 |                 while previous_output and previous_output[-1]['type'] == 'computer_call':
278 |                     previous_output.pop()
279 |                 messages.extend(previous_output)
280 | 
281 |                 # If the call is a 'response', don't add the result
282 |                 if call.name == 'response':
283 |                     continue
284 |                 # Otherwise, if we have a result, we should add it to the messages
285 |                 content = [
286 |                     { "type": "input_text", "text": part.text } if isinstance(part, types.TextContent)
287 |                     else { "type": "input_image", "image_url": f"data:image/png;base64,{part.data}" } if isinstance(part, types.ImageContent)
288 |                     else { "type": "input_text", "text": "" }
289 |                     for part in result.content
290 |                 ]
291 |                 messages.append({
292 |                     "role": "user",
293 |                     "content": content,
294 |                 })
295 | 
296 |                 continue
297 |                 
298 |             # Add the assistant's computer call
299 |             messages.extend(self.tool_call_inputs[call.id])
300 |             
301 |             if result.isError:
302 |                 error_text = "".join([
303 |                     content.text
304 |                     for content in result.content
305 |                     if isinstance(content, types.TextContent)
306 |                 ])
307 | 
308 |                 # Replace computer call with failed tool call
309 |                 messages.pop()
310 |                 messages.extend(make_failed_tool_call_items(
311 |                     tool_name=call.name,
312 |                     tool_kwargs=call.arguments or {},
313 |                     error_message=error_text,
314 |                     call_id=call.id,
315 |                 ))
316 |             else:
317 |                 # Get the latest screenshot
318 |                 screenshots = [
319 |                     content.data
320 |                     for content in result.content
321 |                     if isinstance(content, types.ImageContent)
322 |                 ]
323 | 
324 |                 # Add the resulting screenshot
325 |                 if screenshots:
326 |                     self._log_image(screenshots[0])
327 |                     self.last_screenshot_b64 = screenshots[0]
328 |                     messages.append({
329 |                         "type": "computer_call_output",
330 |                         "call_id": call.id,
331 |                         "output": {
332 |                             "type": "input_image",
333 |                             "image_url": f"data:image/png;base64,{screenshots[0]}"
334 |                         },
335 |                     })
336 |                 else:
337 |                     # Otherwise, replace computer call with failed tool call
338 |                     messages.pop()
339 |                     messages.extend(make_failed_tool_call_items(
340 |                         tool_name=call.name,
341 |                         tool_kwargs=call.arguments or {},
342 |                         error_message="No screenshots returned.",
343 |                         call_id=call.id,
344 |                     ))
345 | 
346 |         return messages
347 | 
348 | 
349 | __all__ = [
350 |     "MCPComputerAgent",
351 | ]
352 | 
```

--------------------------------------------------------------------------------
/blog/sandboxed-python-execution.md:
--------------------------------------------------------------------------------

```markdown
  1 | # Sandboxed Python Execution: Run Code Safely in Cua Containers
  2 | 
  3 | *Published on June 23, 2025 by Dillon DuPont*
  4 | 
  5 | We touched on Cua's computer-use capabilities in [Building your own Operator on macOS - Part 2](build-your-own-operator-on-macos-2.md) – your AI agents can click, scroll, type, and interact with any desktop application. But what if your agent needs to do more than just UI automation? What if it needs to process data, make API calls, analyze images, or run complex logic alongside those UI interactions, within the same virtual environment?
  6 | 
  7 | That's where Cua's `@sandboxed` decorator comes in. While Cua handles the clicking and typing, sandboxed execution lets you run full Python code inside the same virtual environment. It's like giving your AI agents a programming brain to complement their clicking fingers.
  8 | 
  9 | Think of it as the perfect marriage: Cua handles the "what you see" (UI interactions), while sandboxed Python handles the "what you compute" (data processing, logic, API calls) – all happening in the same isolated environment.
 10 | 
 11 | ## So, what exactly is sandboxed execution?
 12 | 
 13 | Cua excels at automating user interfaces – clicking buttons, filling forms, navigating applications. But modern AI agents need to do more than just UI automation. They need to process the data they collect, make intelligent decisions, call external APIs, and run sophisticated algorithms.
 14 | 
 15 | Sandboxed execution bridges this gap. You write a Python function, decorate it with `@sandboxed`, and it runs inside your Cua container alongside your UI automation. Your agent can now click a button, extract some data, process it with Python, and then use those results to decide what to click next.
 16 | 
 17 | Here's what makes this combination powerful for AI agent development:
 18 | 
 19 | - **Unified environment**: Your UI automation and code execution happen in the same container
 20 | - **Rich capabilities**: Combine Cua's clicking with Python's data processing, API calls, and libraries
 21 | - **Seamless integration**: Pass data between UI interactions and Python functions effortlessly
 22 | - **Cross-platform consistency**: Your Python code runs the same way across different Cua environments
 23 | - **Complete workflows**: Build agents that can both interact with apps AND process the data they collect
 24 | 
 25 | ## The architecture behind @sandboxed
 26 | 
 27 | Let's jump right into an example that'll make this crystal clear:
 28 | 
 29 | ```python
 30 | from computer.helpers import sandboxed
 31 | 
 32 | @sandboxed("demo_venv")
 33 | def greet_and_print(name):
 34 |     """This function runs inside the container"""
 35 |     import PyXA  # macOS-specific library
 36 |     safari = PyXA.Application("Safari")
 37 |     html = safari.current_document.source()
 38 |     print(f"Hello from inside the container, {name}!")
 39 |     return {"greeted": name, "safari_html": html}
 40 | 
 41 | # When called, this executes in the container
 42 | result = await greet_and_print("Cua")
 43 | ```
 44 | 
 45 | What's happening here? When you call `greet_and_print()`, Cua extracts the function's source code, transmits it to the container, and executes it there. The result returns to you seamlessly, while the actual execution remains completely isolated.
 46 | 
 47 | ## How does sandboxed execution work?
 48 | 
 49 | Cua's sandboxed execution system employs several key architectural components:
 50 | 
 51 | ### 1. Source Code Extraction
 52 | Cua uses Python's `inspect.getsource()` to extract your function's source code and reconstruct the function definition in the remote environment.
 53 | 
 54 | ### 2. Virtual Environment Isolation
 55 | Each sandboxed function runs in a named virtual environment within the container. This provides complete dependency isolation between different functions and their respective environments.
 56 | 
 57 | ### 3. Data Serialization and Transport
 58 | Arguments and return values are serialized as JSON and transported between the host and container. This ensures compatibility across different Python versions and execution environments.
 59 | 
 60 | ### 4. Comprehensive Error Handling
 61 | The system captures both successful results and exceptions, preserving stack traces and error information for debugging purposes.
 62 | 
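Here's a minimal, self-contained sketch of that flow (purely illustrative – this is not Cua's actual implementation): the host extracts the function's source, ships it with JSON-encoded arguments, and the container rebuilds and runs it, sending back a JSON result or a captured error.

```python
import inspect
import json
import traceback

def build_payload(fn, *args, **kwargs) -> str:
    """Host side: capture the function's source plus JSON-encoded arguments."""
    return json.dumps({
        "source": inspect.getsource(fn),  # 1. source code extraction
        "name": fn.__name__,
        "args": args,                     # 3. arguments must be JSON-serializable
        "kwargs": kwargs,
    })

def run_payload(payload: str) -> str:
    """Container side: rebuild the function, run it, and report the result or error."""
    spec = json.loads(payload)
    namespace: dict = {}
    exec(spec["source"], namespace)       # re-create the function definition
    try:
        result = namespace[spec["name"]](*spec["args"], **spec["kwargs"])
        return json.dumps({"success": True, "result": result})
    except Exception:                     # 4. errors come back with their traceback
        return json.dumps({"success": False, "error": traceback.format_exc()})
```

In the real thing, step 2 comes from where the payload runs: inside a named virtual environment in the container, not in your local process.
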
 63 | ## Getting your sandbox ready
 64 | 
 65 | Setting up sandboxed execution is simple:
 66 | 
 67 | ```python
 68 | import asyncio
 69 | from computer.computer import Computer
 70 | from computer.helpers import sandboxed, set_default_computer
 71 | 
 72 | async def main():
 73 |     # Fire up the computer
 74 |     computer = Computer()
 75 |     await computer.run()
 76 |     
 77 |     # Make it the default for all sandboxed functions
 78 |     set_default_computer(computer)
 79 |     
 80 |     # Install some packages in a virtual environment
 81 |     await computer.venv_install("demo_venv", ["requests", "beautifulsoup4"])
 82 | ```
 83 | 
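To actually run this setup, drive `main()` with asyncio as usual (a small illustrative addition – the snippet above only defines the coroutine):

```python
if __name__ == "__main__":
    asyncio.run(main())
```
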
 84 | If you want to get fancy, you can specify which computer instance to use:
 85 | 
 86 | ```python
 87 | @sandboxed("my_venv", computer=my_specific_computer)
 88 | def my_function():
 89 |     # This runs on your specified computer instance
 90 |     pass
 91 | ```
 92 | 
 93 | ## Real-world examples that actually work
 94 | 
 95 | ### Browser automation without the headaches
 96 | 
 97 | Ever tried to automate a browser and had it crash your entire system? Yeah, us too. Here's how to do it safely:
 98 | 
 99 | ```python
100 | @sandboxed("browser_env")
101 | def automate_browser_with_playwright():
102 |     """Automate browser interactions using Playwright"""
103 |     from playwright.sync_api import sync_playwright
104 |     import time
105 |     import base64
106 |     from datetime import datetime
107 |     
108 |     try:
109 |         with sync_playwright() as p:
110 |             # Launch browser (visible, because why not?)
111 |             browser = p.chromium.launch(
112 |                 headless=False,
113 |                 args=['--no-sandbox', '--disable-dev-shm-usage']
114 |             )
115 |             
116 |             page = browser.new_page()
117 |             page.set_viewport_size({"width": 1280, "height": 720})
118 |             
119 |             actions = []
120 |             screenshots = {}
121 |             
122 |             # Let's visit example.com and poke around
123 |             page.goto("https://example.com")
124 |             actions.append("Navigated to example.com")
125 |             
126 |             # Grab a screenshot because screenshots are cool
127 |             screenshot_bytes = page.screenshot(full_page=True)
128 |             screenshots["initial"] = base64.b64encode(screenshot_bytes).decode()
129 |             
130 |             # Get some basic info
131 |             title = page.title()
132 |             actions.append(f"Page title: {title}")
133 |             
134 |             # Find links and headings
135 |             try:
136 |                 links = page.locator("a").all()
137 |                 link_texts = [link.text_content() for link in links[:5]]
138 |                 actions.append(f"Found {len(links)} links: {link_texts}")
139 |                 
140 |                 headings = page.locator("h1, h2, h3").all()
141 |                 heading_texts = [h.text_content() for h in headings[:3]]
142 |                 actions.append(f"Found headings: {heading_texts}")
143 |                 
144 |             except Exception as e:
145 |                 actions.append(f"Element interaction error: {str(e)}")
146 |             
147 |             # Let's try a form for good measure
148 |             try:
149 |                 page.goto("https://httpbin.org/forms/post")
150 |                 actions.append("Navigated to form page")
151 |                 
152 |                 # Fill out the form
153 |                 page.fill('input[name="custname"]', "Test User from Sandboxed Environment")
154 |                 page.fill('input[name="custtel"]', "555-0123")
155 |                 page.fill('input[name="custemail"]', "test@example.com")
156 |                 page.select_option('select[name="size"]', "large")
157 |                 
158 |                 actions.append("Filled out form fields")
159 |                 
160 |                 # Submit and see what happens
161 |                 page.click('input[type="submit"]')
162 |                 page.wait_for_load_state("networkidle")
163 |                 
164 |                 actions.append("Submitted form")
165 |                 
166 |             except Exception as e:
167 |                 actions.append(f"Form interaction error: {str(e)}")
168 |             
169 |             browser.close()
170 |             
171 |             return {
172 |                 "actions_performed": actions,
173 |                 "screenshots": screenshots,
174 |                 "success": True
175 |             }
176 |             
177 |     except Exception as e:
178 |         return {"error": f"Browser automation failed: {str(e)}"}
179 | 
180 | # Install Playwright and its browsers
181 | await computer.venv_install("browser_env", ["playwright"])
182 | await computer.venv_cmd("browser_env", "playwright install chromium")
183 | 
184 | # Run the automation
185 | result = await automate_browser_with_playwright()
186 | print(f"Performed {len(result.get('actions_performed', []))} actions")
187 | ```
188 | 
189 | ### Building code analysis agents
190 | 
191 | Want to build agents that can analyze code safely? Here's a security audit tool that won't accidentally `eval()` your system into oblivion:
192 | 
193 | ```python
194 | @sandboxed("analysis_env")
195 | def security_audit_tool(code_snippet):
196 |     """Analyze code for potential security issues"""
197 |     import ast
198 |     import re
199 |     
200 |     issues = []
201 |     
202 |     # Check for the usual suspects
203 |     dangerous_patterns = [
204 |         (r'eval\s*\(', "Use of eval() function"),
205 |         (r'exec\s*\(', "Use of exec() function"),
206 |         (r'__import__\s*\(', "Dynamic import usage"),
207 |         (r'subprocess\.', "Subprocess usage"),
208 |         (r'os\.system\s*\(', "OS system call"),
209 |     ]
210 |     
211 |     for pattern, description in dangerous_patterns:
212 |         if re.search(pattern, code_snippet):
213 |             issues.append(description)
214 |     
215 |     # Get fancy with AST analysis
216 |     try:
217 |         tree = ast.parse(code_snippet)
218 |         for node in ast.walk(tree):
219 |             if isinstance(node, ast.Call):
220 |                 if hasattr(node.func, 'id'):
221 |                     if node.func.id in ['eval', 'exec', 'compile']:
222 |                         issues.append(f"Dangerous function call: {node.func.id}")
223 |     except SyntaxError:
224 |         issues.append("Syntax error in code")
225 |     
226 |     return {
227 |         "security_issues": issues,
228 |         "risk_level": "HIGH" if len(issues) > 2 else "MEDIUM" if issues else "LOW"
229 |     }
230 | 
231 | # Test it on some sketchy code
232 | audit_result = await security_audit_tool("eval(user_input)")
233 | print(f"Security audit: {audit_result}")
234 | ```
235 | 
236 | ### Desktop automation in the cloud
237 | 
238 | Here's where things get really interesting. Cua Cloud Sandbox comes with full desktop environments, so you can automate GUIs:
239 | 
240 | ```python
241 | @sandboxed("desktop_env")
242 | def take_screenshot_and_analyze():
243 |     """Take a screenshot and analyze the desktop"""
244 |     import io
245 |     import base64
246 |     from PIL import ImageGrab
247 |     from datetime import datetime
248 |     
249 |     try:
250 |         # Grab the screen
251 |         screenshot = ImageGrab.grab()
252 |         
253 |         # Convert to base64 for easy transport
254 |         buffer = io.BytesIO()
255 |         screenshot.save(buffer, format='PNG')
256 |         screenshot_data = base64.b64encode(buffer.getvalue()).decode()
257 |         
258 |         # Get some basic info
259 |         screen_info = {
260 |             "size": screenshot.size,
261 |             "mode": screenshot.mode,
262 |             "timestamp": datetime.now().isoformat()
263 |         }
264 |         
265 |         # Analyze the colors (because why not?)
266 |         colors = screenshot.getcolors(maxcolors=256*256*256)
267 |         dominant_color = max(colors, key=lambda x: x[0])[1] if colors else None
268 |         
269 |         return {
270 |             "screenshot_base64": screenshot_data,
271 |             "screen_info": screen_info,
272 |             "dominant_color": dominant_color,
273 |             "unique_colors": len(colors) if colors else 0
274 |         }
275 |         
276 |     except Exception as e:
277 |         return {"error": f"Screenshot failed: {str(e)}"}
278 | 
279 | # Install the dependencies
280 | await computer.venv_install("desktop_env", ["Pillow"])
281 | 
282 | # Take and analyze a screenshot
283 | result = await take_screenshot_and_analyze()
284 | print("Desktop analysis complete!")
285 | ```
286 | 
287 | ## Pro tips for sandboxed success
288 | 
289 | ### Keep it self-contained
290 | Always put your imports inside the function. Trust us on this one:
291 | 
292 | ```python
293 | @sandboxed("good_env")
294 | def good_function():
295 |     import os  # Import inside the function
296 |     import json
297 |     
298 |     # Your code here
299 |     return {"result": "success"}
300 | ```
301 | 
302 | ### Install dependencies first
303 | Don't forget to install packages before using them:
304 | 
305 | ```python
306 | # Install first
307 | await computer.venv_install("my_env", ["pandas", "numpy", "matplotlib"])
308 | 
309 | @sandboxed("my_env")
310 | def data_analysis():
311 |     import pandas as pd
312 |     import numpy as np
313 |     # Now you can use them
314 | ```
315 | 
316 | ### Use descriptive environment names
317 | Future you will thank you:
318 | 
319 | ```python
320 | @sandboxed("data_processing_env")
321 | def process_data(): pass
322 | 
323 | @sandboxed("web_scraping_env") 
324 | def scrape_site(): pass
325 | 
326 | @sandboxed("ml_training_env")
327 | def train_model(): pass
328 | ```
329 | 
330 | ### Always handle errors gracefully
331 | Things break. Plan for it:
332 | 
333 | ```python
334 | @sandboxed("robust_env")
335 | def robust_function(data):
336 |     try:
337 |         result = process_data(data)
338 |         return {"success": True, "result": result}
339 |     except Exception as e:
340 |         return {"success": False, "error": str(e)}
341 | ```
342 | 
343 | ## What about performance?
344 | 
345 | Let's be honest – there's some overhead here. Code needs to be serialized, sent over the network, and executed remotely. But for most use cases, the benefits far outweigh the costs.
346 | 
347 | If you're building something performance-critical, consider:
348 | - Batching multiple operations into a single sandboxed function (see the sketch below)
349 | - Minimizing data transfer between host and container
350 | - Using persistent virtual environments
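
For example, here's a hedged sketch of that batching idea – one sandboxed call that does the fetch, parse, and summarize steps in a single round trip (the environment and function names are illustrative):

```python
# Install once, then do everything in one call
await computer.venv_install("batch_env", ["requests", "beautifulsoup4"])

@sandboxed("batch_env")
def fetch_parse_and_summarize(url):
    """Fetch, parse, and summarize in one round trip instead of three."""
    import requests
    from bs4 import BeautifulSoup

    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")
    headings = [h.get_text(strip=True) for h in soup.find_all(["h1", "h2"])]
    return {
        "title": soup.title.string if soup.title else None,
        "heading_count": len(headings),
        "headings": headings[:10],
    }

result = await fetch_parse_and_summarize("https://example.com")
```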
351 | 
352 | ## The security angle
353 | 
354 | This is where sandboxed execution really shines:
355 | 
356 | 1. **Complete process isolation** – code runs in a separate container
357 | 2. **File system protection** – limited access to your host files
358 | 3. **Network isolation** – controlled network access
359 | 4. **Clean environments** – no package conflicts or pollution
360 | 5. **Resource limits** – container-level constraints keep things in check
361 | 
362 | ## Ready to get started?
363 | 
364 | The `@sandboxed` decorator is one of those features that sounds simple but opens up a world of possibilities. Whether you're testing sketchy code, building AI agents, or just want to keep your development environment pristine, it's got you covered.
365 | 
366 | Give it a try in your next Cua project and see how liberating it feels to run code without fear!
367 | 
368 | Happy coding (safely)!
369 | 
370 | ---
371 | 
372 | *Want to dive deeper? Check out our [sandboxed functions examples](https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py) and [virtual environment tests](https://github.com/trycua/cua/blob/main/tests/venv.py) on GitHub. Questions? Come chat with us on Discord!*
373 | 
```

--------------------------------------------------------------------------------
/libs/lume/src/FileSystem/Settings.swift:
--------------------------------------------------------------------------------

```swift
  1 | import Foundation
  2 | 
  3 | /// Manages the application settings using a config file
  4 | struct LumeSettings: Codable, Sendable {
  5 |     var vmLocations: [VMLocation]
  6 |     var defaultLocationName: String
  7 |     var cacheDirectory: String
  8 |     var cachingEnabled: Bool
  9 | 
 10 |     var defaultLocation: VMLocation? {
 11 |         vmLocations.first { $0.name == defaultLocationName }
 12 |     }
 13 | 
 14 |     // For backward compatibility
 15 |     var homeDirectory: String {
 16 |         defaultLocation?.path ?? "~/.lume"
 17 |     }
 18 | 
 19 |     static let defaultSettings = LumeSettings(
 20 |         vmLocations: [
 21 |             VMLocation(name: "default", path: "~/.lume")
 22 |         ],
 23 |         defaultLocationName: "default",
 24 |         cacheDirectory: "~/.lume/cache",
 25 |         cachingEnabled: true
 26 |     )
 27 | 
 28 |     /// Gets all locations sorted by name
 29 |     var sortedLocations: [VMLocation] {
 30 |         vmLocations.sorted { $0.name < $1.name }
 31 |     }
 32 | }
 33 | 
 34 | final class SettingsManager: @unchecked Sendable {
 35 |     // MARK: - Constants
 36 | 
 37 |     private enum Constants {
 38 |         // Default path for config
 39 |         static let fallbackConfigDir = "~/.config/lume"
 40 |         static let configFileName = "config.yaml"
 41 |     }
 42 | 
 43 |     // MARK: - Properties
 44 | 
 45 |     static let shared = SettingsManager()
 46 |     private let fileManager: FileManager
 47 | 
 48 |     // Get the config directory following XDG spec
 49 |     private var configDir: String {
 50 |         // Check XDG_CONFIG_HOME environment variable first
 51 |         if let xdgConfigHome = ProcessInfo.processInfo.environment["XDG_CONFIG_HOME"] {
 52 |             return "\(xdgConfigHome)/lume"
 53 |         }
 54 |         // Fall back to default
 55 |         return (Constants.fallbackConfigDir as NSString).expandingTildeInPath
 56 |     }
 57 | 
 58 |     // Path to config file
 59 |     private var configFilePath: String {
 60 |         return "\(configDir)/\(Constants.configFileName)"
 61 |     }
 62 | 
 63 |     // MARK: - Initialization
 64 | 
 65 |     init(fileManager: FileManager = .default) {
 66 |         self.fileManager = fileManager
 67 |         ensureConfigDirectoryExists()
 68 |     }
 69 | 
 70 |     // MARK: - Settings Access
 71 | 
 72 |     func getSettings() -> LumeSettings {
 73 |         if let settings = readSettingsFromFile() {
 74 |             return settings
 75 |         }
 76 | 
 77 |         // No settings file found, use defaults
 78 |         let defaultSettings = LumeSettings(
 79 |             vmLocations: [
 80 |                 VMLocation(name: "default", path: "~/.lume")
 81 |             ],
 82 |             defaultLocationName: "default",
 83 |             cacheDirectory: "~/.lume/cache",
 84 |             cachingEnabled: true
 85 |         )
 86 | 
 87 |         // Try to save default settings
 88 |         try? saveSettings(defaultSettings)
 89 | 
 90 |         return defaultSettings
 91 |     }
 92 | 
 93 |     func saveSettings(_ settings: LumeSettings) throws {
 94 |         try fileManager.createDirectory(atPath: configDir, withIntermediateDirectories: true)
 95 | 
 96 |         // Create a human-readable YAML-like configuration file
 97 |         var yamlContent = "# Lume Configuration\n\n"
 98 | 
 99 |         // Default location
100 |         yamlContent += "defaultLocationName: \"\(settings.defaultLocationName)\"\n"
101 | 
102 |         // Cache directory
103 |         yamlContent += "cacheDirectory: \"\(settings.cacheDirectory)\"\n"
104 | 
105 |         // Caching enabled flag
106 |         yamlContent += "cachingEnabled: \(settings.cachingEnabled)\n"
107 | 
108 |         // VM locations
109 |         yamlContent += "\n# VM Locations\nvmLocations:\n"
110 |         for location in settings.vmLocations {
111 |             yamlContent += "  - name: \"\(location.name)\"\n"
112 |             yamlContent += "    path: \"\(location.path)\"\n"
113 |         }
114 | 
115 |         // Write YAML content to file
116 |         try yamlContent.write(
117 |             to: URL(fileURLWithPath: configFilePath), atomically: true, encoding: .utf8)
118 |     }
119 | 
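    // Illustrative example of the config.yaml this produces for the default settings
    // (added for clarity; parseYamlSettings below reads back exactly these keys):
    //
    //   # Lume Configuration
    //
    //   defaultLocationName: "default"
    //   cacheDirectory: "~/.lume/cache"
    //   cachingEnabled: true
    //
    //   # VM Locations
    //   vmLocations:
    //     - name: "default"
    //       path: "~/.lume"
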
120 |     // MARK: - VM Location Management
121 | 
122 |     func addLocation(_ location: VMLocation) throws {
123 |         var settings = getSettings()
124 | 
125 |         // Validate location name (alphanumeric, dash, underscore)
126 |         let nameRegex = try NSRegularExpression(pattern: "^[a-zA-Z0-9_-]+$")
127 |         let nameRange = NSRange(location.name.startIndex..., in: location.name)
128 |         if nameRegex.firstMatch(in: location.name, range: nameRange) == nil {
129 |             throw VMLocationError.invalidLocationName(name: location.name)
130 |         }
131 | 
132 |         // Check for duplicate name
133 |         if settings.vmLocations.contains(where: { $0.name == location.name }) {
134 |             throw VMLocationError.duplicateLocationName(name: location.name)
135 |         }
136 | 
137 |         // Validate location path
138 |         try location.validate()
139 | 
140 |         // Add location
141 |         settings.vmLocations.append(location)
142 |         try saveSettings(settings)
143 |     }
144 | 
145 |     func removeLocation(name: String) throws {
146 |         var settings = getSettings()
147 | 
148 |         // Check location exists
149 |         guard settings.vmLocations.contains(where: { $0.name == name }) else {
150 |             throw VMLocationError.locationNotFound(name: name)
151 |         }
152 | 
153 |         // Prevent removing default location
154 |         if name == settings.defaultLocationName {
155 |             throw VMLocationError.defaultLocationCannotBeRemoved(name: name)
156 |         }
157 | 
158 |         // Remove location
159 |         settings.vmLocations.removeAll(where: { $0.name == name })
160 |         try saveSettings(settings)
161 |     }
162 | 
163 |     func setDefaultLocation(name: String) throws {
164 |         var settings = getSettings()
165 | 
166 |         // Check location exists
167 |         guard settings.vmLocations.contains(where: { $0.name == name }) else {
168 |             throw VMLocationError.locationNotFound(name: name)
169 |         }
170 | 
171 |         // Set default
172 |         settings.defaultLocationName = name
173 |         try saveSettings(settings)
174 |     }
175 | 
176 |     func getLocation(name: String) throws -> VMLocation {
177 |         let settings = getSettings()
178 | 
179 |         if let location = settings.vmLocations.first(where: { $0.name == name }) {
180 |             return location
181 |         }
182 | 
183 |         throw VMLocationError.locationNotFound(name: name)
184 |     }
185 | 
186 |     // MARK: - Legacy Home Directory Compatibility
187 | 
188 |     func setHomeDirectory(path: String) throws {
189 |         var settings = getSettings()
190 | 
191 |         let defaultLocation = VMLocation(name: "default", path: path)
192 |         try defaultLocation.validate()
193 | 
194 |         // Replace default location
195 |         if let index = settings.vmLocations.firstIndex(where: { $0.name == "default" }) {
196 |             settings.vmLocations[index] = defaultLocation
197 |         } else {
198 |             settings.vmLocations.append(defaultLocation)
199 |             settings.defaultLocationName = "default"
200 |         }
201 | 
202 |         try saveSettings(settings)
203 |     }
204 | 
205 |     // MARK: - Cache Directory Management
206 | 
207 |     func setCacheDirectory(path: String) throws {
208 |         var settings = getSettings()
209 | 
210 |         // Validate path
211 |         let expandedPath = (path as NSString).expandingTildeInPath
212 |         var isDir: ObjCBool = false
213 | 
214 |         // If directory exists, check if it's writable
215 |         if fileManager.fileExists(atPath: expandedPath, isDirectory: &isDir) {
216 |             if !isDir.boolValue {
217 |                 throw SettingsError.notADirectory(path: expandedPath)
218 |             }
219 | 
220 |             if !fileManager.isWritableFile(atPath: expandedPath) {
221 |                 throw SettingsError.directoryNotWritable(path: expandedPath)
222 |             }
223 |         } else {
224 |             // Try to create the directory
225 |             do {
226 |                 try fileManager.createDirectory(
227 |                     atPath: expandedPath,
228 |                     withIntermediateDirectories: true
229 |                 )
230 |             } catch {
231 |                 throw SettingsError.directoryCreationFailed(path: expandedPath, error: error)
232 |             }
233 |         }
234 | 
235 |         // Update settings
236 |         settings.cacheDirectory = path
237 |         try saveSettings(settings)
238 |     }
239 | 
240 |     func getCacheDirectory() -> String {
241 |         return getSettings().cacheDirectory
242 |     }
243 | 
244 |     func setCachingEnabled(_ enabled: Bool) throws {
245 |         var settings = getSettings()
246 |         settings.cachingEnabled = enabled
247 |         try saveSettings(settings)
248 |     }
249 | 
250 |     func isCachingEnabled() -> Bool {
251 |         return getSettings().cachingEnabled
252 |     }
253 | 
254 |     // MARK: - Private Helpers
255 | 
256 |     private func ensureConfigDirectoryExists() {
257 |         try? fileManager.createDirectory(atPath: configDir, withIntermediateDirectories: true)
258 |     }
259 | 
260 |     private func readSettingsFromFile() -> LumeSettings? {
261 |         // Read from YAML file
262 |         if fileExists(at: configFilePath) {
263 |             do {
264 |                 let yamlString = try String(
265 |                     contentsOf: URL(fileURLWithPath: configFilePath), encoding: .utf8)
266 |                 return parseYamlSettings(yamlString)
267 |             } catch {
268 |                 Logger.error(
269 |                     "Failed to read settings from YAML file",
270 |                     metadata: ["error": error.localizedDescription]
271 |                 )
272 |             }
273 |         }
274 |         return nil
275 |     }
276 | 
277 |     private func parseYamlSettings(_ yamlString: String) -> LumeSettings? {
278 |         // This is a very basic YAML parser for our specific config format
279 |         // A real implementation would use a proper YAML library
280 | 
281 |         var defaultLocationName = "default"
282 |         var cacheDirectory = "~/.lume/cache"
283 |         var cachingEnabled = true  // default to true for backward compatibility
284 |         var vmLocations: [VMLocation] = []
285 | 
286 |         var inLocationsSection = false
287 |         var currentLocation: (name: String?, path: String?) = (nil, nil)
288 | 
289 |         let lines = yamlString.split(separator: "\n")
290 | 
291 |         for line in lines {
292 |             let trimmedLine = line.trimmingCharacters(in: .whitespaces)
293 | 
294 |             // Skip comments and empty lines
295 |             if trimmedLine.hasPrefix("#") || trimmedLine.isEmpty {
296 |                 continue
297 |             }
298 | 
299 |             // Check for section marker
300 |             if trimmedLine == "vmLocations:" {
301 |                 inLocationsSection = true
302 |                 continue
303 |             }
304 | 
305 |             // In the locations section, handle line indentation more carefully
306 |             if inLocationsSection {
307 |                 if trimmedLine.hasPrefix("-") || trimmedLine.contains("- name:") {
308 |                     // Process the previous location before starting a new one
309 |                     if let name = currentLocation.name, let path = currentLocation.path {
310 |                         vmLocations.append(VMLocation(name: name, path: path))
311 |                     }
312 |                     currentLocation = (nil, nil)
313 |                 }
314 | 
315 |                 // Process the key-value pairs within a location
316 |                 if let colonIndex = trimmedLine.firstIndex(of: ":") {
317 |                     let key = trimmedLine[..<colonIndex].trimmingCharacters(in: .whitespaces)
318 |                     let rawValue = trimmedLine[trimmedLine.index(after: colonIndex)...]
319 |                         .trimmingCharacters(in: .whitespaces)
320 |                     let value = extractValueFromYaml(rawValue)
321 | 
322 |                     if key.hasSuffix("name") {
323 |                         currentLocation.name = value
324 |                     } else if key.hasSuffix("path") {
325 |                         currentLocation.path = value
326 |                     }
327 |                 }
328 |             } else {
329 |                 // Process top-level keys outside the locations section
330 |                 if let colonIndex = trimmedLine.firstIndex(of: ":") {
331 |                     let key = trimmedLine[..<colonIndex].trimmingCharacters(in: .whitespaces)
332 |                     let rawValue = trimmedLine[trimmedLine.index(after: colonIndex)...]
333 |                         .trimmingCharacters(in: .whitespaces)
334 |                     let value = extractValueFromYaml(rawValue)
335 | 
336 |                     if key == "defaultLocationName" {
337 |                         defaultLocationName = value
338 |                     } else if key == "cacheDirectory" {
339 |                         cacheDirectory = value
340 |                     } else if key == "cachingEnabled" {
341 |                         cachingEnabled = value.lowercased() == "true"
342 |                     }
343 |                 }
344 |             }
345 |         }
346 | 
347 |         // Don't forget to add the last location
348 |         if let name = currentLocation.name, let path = currentLocation.path {
349 |             vmLocations.append(VMLocation(name: name, path: path))
350 |         }
351 | 
352 |         // Ensure at least one location exists
353 |         if vmLocations.isEmpty {
354 |             vmLocations.append(VMLocation(name: "default", path: "~/.lume"))
355 |         }
356 | 
357 |         return LumeSettings(
358 |             vmLocations: vmLocations,
359 |             defaultLocationName: defaultLocationName,
360 |             cacheDirectory: cacheDirectory,
361 |             cachingEnabled: cachingEnabled
362 |         )
363 |     }
364 | 
365 |     // Helper method to extract a value from YAML, handling quotes
366 |     private func extractValueFromYaml(_ rawValue: String) -> String {
367 |         if rawValue.hasPrefix("\"") && rawValue.hasSuffix("\"") && rawValue.count >= 2 {
368 |             // Remove the surrounding quotes
369 |             let startIndex = rawValue.index(after: rawValue.startIndex)
370 |             let endIndex = rawValue.index(before: rawValue.endIndex)
371 |             return String(rawValue[startIndex..<endIndex])
372 |         }
373 |         return rawValue
374 |     }
375 | 
376 |     // Helper method to output debug information about the current settings
377 |     func debugSettings() -> String {
378 |         let settings = getSettings()
379 | 
380 |         var output = "Current Settings:\n"
381 |         output += "- Default VM storage: \(settings.defaultLocationName)\n"
382 |         output += "- Cache directory: \(settings.cacheDirectory)\n"
383 |         output += "- VM Locations (\(settings.vmLocations.count)):\n"
384 | 
385 |         for (i, location) in settings.vmLocations.enumerated() {
386 |             let isDefault = location.name == settings.defaultLocationName
387 |             let defaultMark = isDefault ? " (default)" : ""
388 |             output += "  \(i+1). \(location.name): \(location.path)\(defaultMark)\n"
389 |         }
390 | 
391 |         // Also add raw file content
392 |         if fileExists(at: configFilePath) {
393 |             if let content = try? String(contentsOf: URL(fileURLWithPath: configFilePath)) {
394 |                 output += "\nRaw YAML file content:\n"
395 |                 output += content
396 |             }
397 |         }
398 | 
399 |         return output
400 |     }
401 | 
402 |     private func fileExists(at path: String) -> Bool {
403 |         fileManager.fileExists(atPath: path)
404 |     }
405 | }
406 | 
407 | // MARK: - Errors
408 | 
409 | enum SettingsError: Error, LocalizedError {
410 |     case notADirectory(path: String)
411 |     case directoryNotWritable(path: String)
412 |     case directoryCreationFailed(path: String, error: Error)
413 | 
414 |     var errorDescription: String? {
415 |         switch self {
416 |         case .notADirectory(let path):
417 |             return "Path is not a directory: \(path)"
418 |         case .directoryNotWritable(let path):
419 |             return "Directory is not writable: \(path)"
420 |         case .directoryCreationFailed(let path, let error):
421 |             return "Failed to create directory at \(path): \(error.localizedDescription)"
422 |         }
423 |     }
424 | }
425 | 
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/adapters/mlxvlm_adapter.py:
--------------------------------------------------------------------------------

```python
  1 | import asyncio
  2 | import functools
  3 | import warnings
  4 | import io
  5 | import base64
  6 | import math
  7 | import re
  8 | from concurrent.futures import ThreadPoolExecutor
  9 | from typing import Iterator, AsyncIterator, Dict, List, Any, Optional, Tuple, cast
 10 | from PIL import Image
 11 | from litellm.types.utils import GenericStreamingChunk, ModelResponse
 12 | from litellm.llms.custom_llm import CustomLLM
 13 | from litellm import completion, acompletion
 14 | 
 15 | # Try to import MLX dependencies
 16 | try:
 17 |     import mlx.core as mx
 18 |     from mlx_vlm import load, generate
 19 |     from mlx_vlm.prompt_utils import apply_chat_template
 20 |     from mlx_vlm.utils import load_config
 21 |     from transformers.tokenization_utils import PreTrainedTokenizer
 22 |     MLX_AVAILABLE = True
 23 | except ImportError:
 24 |     MLX_AVAILABLE = False
 25 | 
 26 | # Constants for smart_resize
 27 | IMAGE_FACTOR = 28
 28 | MIN_PIXELS = 100 * 28 * 28
 29 | MAX_PIXELS = 16384 * 28 * 28
 30 | MAX_RATIO = 200
 31 | 
 32 | def round_by_factor(number: float, factor: int) -> int:
 33 |     """Returns the closest integer to 'number' that is divisible by 'factor'."""
 34 |     return round(number / factor) * factor
 35 | 
 36 | def ceil_by_factor(number: float, factor: int) -> int:
 37 |     """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
 38 |     return math.ceil(number / factor) * factor
 39 | 
 40 | def floor_by_factor(number: float, factor: int) -> int:
 41 |     """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
 42 |     return math.floor(number / factor) * factor
 43 | 
 44 | def smart_resize(
 45 |     height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
 46 | ) -> tuple[int, int]:
 47 |     """
 48 |     Rescales the image so that the following conditions are met:
 49 | 
 50 |     1. Both dimensions (height and width) are divisible by 'factor'.
 51 |     2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
 52 |     3. The aspect ratio of the image is maintained as closely as possible.
 53 |     """
 54 |     if max(height, width) / min(height, width) > MAX_RATIO:
 55 |         raise ValueError(
 56 |             f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
 57 |         )
 58 |     h_bar = max(factor, round_by_factor(height, factor))
 59 |     w_bar = max(factor, round_by_factor(width, factor))
 60 |     if h_bar * w_bar > max_pixels:
 61 |         beta = math.sqrt((height * width) / max_pixels)
 62 |         h_bar = floor_by_factor(height / beta, factor)
 63 |         w_bar = floor_by_factor(width / beta, factor)
 64 |     elif h_bar * w_bar < min_pixels:
 65 |         beta = math.sqrt(min_pixels / (height * width))
 66 |         h_bar = ceil_by_factor(height * beta, factor)
 67 |         w_bar = ceil_by_factor(width * beta, factor)
 68 |     return h_bar, w_bar
 69 | 
 70 | 
 71 | class MLXVLMAdapter(CustomLLM):
 72 |     """MLX VLM Adapter for running vision-language models locally using MLX."""
 73 |     
 74 |     def __init__(self, **kwargs):
 75 |         """Initialize the adapter.
 76 |         
 77 |         Args:
 78 |             **kwargs: Additional arguments
 79 |         """
 80 |         super().__init__()
 81 |         
 82 |         self.models = {}  # Cache for loaded models
 83 |         self.processors = {}  # Cache for loaded processors
 84 |         self.configs = {}  # Cache for loaded configs
 85 |         self._executor = ThreadPoolExecutor(max_workers=1)  # Single thread pool
 86 |         
 87 |     def _load_model_and_processor(self, model_name: str):
 88 |         """Load model and processor if not already cached.
 89 |         
 90 |         Args:
 91 |             model_name: Name of the model to load
 92 |             
 93 |         Returns:
 94 |             Tuple of (model, processor, config)
 95 |         """
 96 |         if not MLX_AVAILABLE:
 97 |             raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.")
 98 |         
 99 |         if model_name not in self.models:
100 |             # Load model and processor
101 |             model_obj, processor = load(
102 |                 model_name, 
103 |                 processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
104 |             )
105 |             config = load_config(model_name)
106 |             
107 |             # Cache them
108 |             self.models[model_name] = model_obj
109 |             self.processors[model_name] = processor
110 |             self.configs[model_name] = config
111 |             
112 |         return self.models[model_name], self.processors[model_name], self.configs[model_name]
113 |     
114 |     def _process_coordinates(self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str:
115 |         """Process coordinates in box tokens based on image resizing using smart_resize approach.
116 |         
117 |         Args:
118 |             text: Text containing box tokens
119 |             original_size: Original image size (width, height)
120 |             model_size: Model processed image size (width, height)
121 |             
122 |         Returns:
123 |             Text with processed coordinates
124 |         """
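        # Illustrative example (added for clarity, not from the original source): with
        # original_size (1920, 1080) and model_size (1932, 1092),
        # "<|box_start|>(546,150)<|box_end|>" becomes "<|box_start|>(542,148)<|box_end|>".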
125 |         # Find all box tokens
126 |         box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"
127 |         
128 |         def process_coords(match):
129 |             model_x, model_y = int(match.group(1)), int(match.group(2))
130 |             # Scale coordinates from model space to original image space
131 |             # Both original_size and model_size are in (width, height) format
132 |             new_x = int(model_x * original_size[0] / model_size[0])  # Width
133 |             new_y = int(model_y * original_size[1] / model_size[1])  # Height
134 |             return f"<|box_start|>({new_x},{new_y})<|box_end|>"
135 |         
136 |         return re.sub(box_pattern, process_coords, text)
137 |     
138 |     def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Image.Image], Dict[int, Tuple[int, int]], Dict[int, Tuple[int, int]]]:
139 |         """Convert OpenAI format messages to MLX VLM format and extract images.
140 |         
141 |         Args:
142 |             messages: Messages in OpenAI format
143 |             
144 |         Returns:
145 |             Tuple of (processed_messages, images, original_sizes, model_sizes)
146 |         """
147 |         processed_messages = []
148 |         images = []
149 |         original_sizes = {}  # Track original sizes of images for coordinate mapping
150 |         model_sizes = {}  # Track model processed sizes
151 |         image_index = 0
152 |         
153 |         for message in messages:
154 |             processed_message = {
155 |                 "role": message["role"],
156 |                 "content": []
157 |             }
158 |             
159 |             content = message.get("content", [])
160 |             if isinstance(content, str):
161 |                 # Simple text content
162 |                 processed_message["content"] = content
163 |             elif isinstance(content, list):
164 |                 # Multi-modal content
165 |                 processed_content = []
166 |                 for item in content:
167 |                     if item.get("type") == "text":
168 |                         processed_content.append({
169 |                             "type": "text",
170 |                             "text": item.get("text", "")
171 |                         })
172 |                     elif item.get("type") == "image_url":
173 |                         image_url = item.get("image_url", {}).get("url", "")
174 |                         pil_image = None
175 |                         
176 |                         if image_url.startswith("data:image/"):
177 |                             # Extract base64 data
178 |                             base64_data = image_url.split(',')[1]
179 |                             # Convert base64 to PIL Image
180 |                             image_data = base64.b64decode(base64_data)
181 |                             pil_image = Image.open(io.BytesIO(image_data))
182 |                         else:
183 |                             # Handle file path or URL
184 |                             pil_image = Image.open(image_url)
185 |                         
186 |                         # Store original image size for coordinate mapping
187 |                         original_size = pil_image.size
188 |                         original_sizes[image_index] = original_size
189 |                         
190 |                         # Use smart_resize to determine model size
191 |                         # Note: smart_resize expects (height, width) but PIL gives (width, height)
192 |                         height, width = original_size[1], original_size[0]
193 |                         new_height, new_width = smart_resize(height, width)
194 |                         # Store model size in (width, height) format for consistent coordinate processing
195 |                         model_sizes[image_index] = (new_width, new_height)
196 |                         
197 |                         # Resize the image using the calculated dimensions from smart_resize
198 |                         resized_image = pil_image.resize((new_width, new_height))
199 |                         images.append(resized_image)
200 |                         
201 |                         # Add image placeholder to content
202 |                         processed_content.append({
203 |                             "type": "image"
204 |                         })
205 |                         
206 |                         image_index += 1
207 |                 
208 |                 processed_message["content"] = processed_content
209 |             
210 |             processed_messages.append(processed_message)
211 |         
212 |         return processed_messages, images, original_sizes, model_sizes
213 |     
214 |     def _generate(self, **kwargs) -> str:
215 |         """Generate response using the local MLX VLM model.
216 |         
217 |         Args:
218 |             **kwargs: Keyword arguments containing messages and model info
219 |             
220 |         Returns:
221 |             Generated text response
222 |         """
223 |         messages = kwargs.get('messages', [])
224 |         model_name = kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')
225 |         max_tokens = kwargs.get('max_tokens', 128)
226 |         
227 |         # Warn about ignored kwargs
228 |         ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'}
229 |         if ignored_kwargs:
230 |             warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
231 |         
232 |         # Load model and processor
233 |         model, processor, config = self._load_model_and_processor(model_name)
234 |         
235 |         # Convert messages and extract images
236 |         processed_messages, images, original_sizes, model_sizes = self._convert_messages(messages)
237 |         
238 |         # Process user text input with box coordinates after image processing
239 |         # Swap original_size and model_size arguments for inverse transformation
240 |         for msg_idx, msg in enumerate(processed_messages):
241 |             if msg.get("role") == "user" and isinstance(msg.get("content"), str):
242 |                 content = msg.get("content", "")
243 |                 if "<|box_start|>" in content and original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
244 |                     orig_size = original_sizes[0]
245 |                     model_size = model_sizes[0]
246 |                     # Swap arguments to perform inverse transformation for user input
247 |                     processed_messages[msg_idx]["content"] = self._process_coordinates(content, model_size, orig_size)
248 |         
249 |         try:
250 |             # Format prompt according to model requirements using the processor directly
251 |             prompt = processor.apply_chat_template(
252 |                 processed_messages,
253 |                 tokenize=False,
254 |                 add_generation_prompt=True,
255 |                 return_tensors='pt'
256 |             )
257 |             tokenizer = cast(PreTrainedTokenizer, processor)
258 |             
259 |             # Generate response
260 |             text_content, usage = generate(
261 |                 model, 
262 |                 tokenizer, 
263 |                 str(prompt), 
264 |                 images, # type: ignore
265 |                 verbose=False,
266 |                 max_tokens=max_tokens
267 |             )
268 |             
269 |         except Exception as e:
270 |             raise RuntimeError(f"Error generating response: {str(e)}") from e
271 |         
272 |         # Process coordinates in the response back to original image space
273 |         if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
274 |             # Get original image size and model size (using the first image)
275 |             orig_size = original_sizes[0]
276 |             model_size = model_sizes[0]
277 |             
278 |             # Check if output contains box tokens that need processing
279 |             if "<|box_start|>" in text_content:
280 |                 # Process coordinates from model space back to original image space
281 |                 text_content = self._process_coordinates(text_content, orig_size, model_size)
282 |         
283 |         return text_content
284 |     
285 |     def completion(self, *args, **kwargs) -> ModelResponse:
286 |         """Synchronous completion method.
287 |         
288 |         Returns:
289 |             ModelResponse with generated text
290 |         """
291 |         generated_text = self._generate(**kwargs)
292 |         
293 |         result = completion(
294 |             model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
295 |             mock_response=generated_text,
296 |         )
297 |         return cast(ModelResponse, result)
298 |     
299 |     async def acompletion(self, *args, **kwargs) -> ModelResponse:
300 |         """Asynchronous completion method.
301 |         
302 |         Returns:
303 |             ModelResponse with generated text
304 |         """
305 |         # Run _generate in thread pool to avoid blocking
306 |         loop = asyncio.get_running_loop()
307 |         generated_text = await loop.run_in_executor(
308 |             self._executor, 
309 |             functools.partial(self._generate, **kwargs)
310 |         )
311 |         
312 |         result = await acompletion(
313 |             model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
314 |             mock_response=generated_text,
315 |         )
316 |         return cast(ModelResponse, result)
317 |     
318 |     def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
319 |         """Synchronous streaming method.
320 |         
321 |         Returns:
322 |             Iterator of GenericStreamingChunk
323 |         """
324 |         generated_text = self._generate(**kwargs)
325 |         
326 |         generic_streaming_chunk: GenericStreamingChunk = {
327 |             "finish_reason": "stop",
328 |             "index": 0,
329 |             "is_finished": True,
330 |             "text": generated_text,
331 |             "tool_use": None,
332 |             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
333 |         }
334 |         
335 |         yield generic_streaming_chunk
336 |     
337 |     async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
338 |         """Asynchronous streaming method.
339 |         
340 |         Returns:
341 |             AsyncIterator of GenericStreamingChunk
342 |         """
343 |         # Run _generate in thread pool to avoid blocking
344 |         loop = asyncio.get_running_loop()
345 |         generated_text = await loop.run_in_executor(
346 |             self._executor, 
347 |             functools.partial(self._generate, **kwargs)
348 |         )
349 |         
350 |         generic_streaming_chunk: GenericStreamingChunk = {
351 |             "finish_reason": "stop",
352 |             "index": 0,
353 |             "is_finished": True,
354 |             "text": generated_text,
355 |             "tool_use": None,
356 |             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
357 |         }
358 |         
359 |         yield generic_streaming_chunk
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/omniparser.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | OmniParser agent loop implementation using liteLLM
  3 | Paper: https://arxiv.org/abs/2408.00203
  4 | Code: https://github.com/microsoft/OmniParser
  5 | """
  6 | 
  7 | import asyncio
  8 | import json
  9 | from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
 10 | import litellm
 11 | import inspect
 12 | import base64
 13 | 
 14 | from ..decorators import register_agent
 15 | from ..types import Messages, AgentResponse, Tools, AgentCapability
 16 | from ..loops.base import AsyncAgentConfig
 17 | 
 18 | SOM_TOOL_SCHEMA = {
 19 |   "type": "function",
 20 |   "name": "computer",
 21 |   "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
 22 |   "parameters": {
 23 |     "type": "object",
 24 |     "properties": {
 25 |       "action": {
 26 |         "type": "string",
 27 |         "enum": [
 28 |           "screenshot",
 29 |           "click",
 30 |           "double_click",
 31 |           "drag",
 32 |           "type",
 33 |           "keypress",
 34 |           "scroll",
 35 |           "move",
 36 |           "wait",
 37 |           "get_current_url",
 38 |           "get_dimensions",
 39 |           "get_environment"
 40 |         ],
 41 |         "description": "The action to perform"
 42 |       },
 43 |       "element_id": {
 44 |         "type": "integer",
 45 |         "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
 46 |       },
 47 |       "start_element_id": {
 48 |         "type": "integer",
 49 |         "description": "The ID of the element to start dragging from (required for drag action)"
 50 |       },
 51 |       "end_element_id": {
 52 |         "type": "integer",
 53 |         "description": "The ID of the element to drag to (required for drag action)"
 54 |       },
 55 |       "text": {
 56 |         "type": "string",
 57 |         "description": "The text to type (required for type action)"
 58 |       },
 59 |       "keys": {
 60 |         "type": "string",
 61 |         "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
 62 |       },
 63 |       "button": {
 64 |         "type": "string",
 65 |         "description": "The mouse button to use for click action (left, right, wheel, back, forward). Default: left",
 66 |       },
 67 |       "scroll_x": {
 68 |         "type": "integer",
 69 |         "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
 70 |       },
 71 |       "scroll_y": {
 72 |         "type": "integer",
 73 |         "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
 74 |       },
 75 |     },
 76 |     "required": [
 77 |       "action"
 78 |     ]
 79 |   }
 80 | }
 81 | 
 82 | OMNIPARSER_AVAILABLE = False
 83 | try:
 84 |     from som import OmniParser
 85 |     OMNIPARSER_AVAILABLE = True
 86 | except ImportError:
 87 |     pass
 88 | OMNIPARSER_SINGLETON = None
 89 | 
 90 | def get_parser():
 91 |     global OMNIPARSER_SINGLETON
 92 |     if OMNIPARSER_SINGLETON is None:
 93 |         OMNIPARSER_SINGLETON = OmniParser()
 94 |     return OMNIPARSER_SINGLETON
 95 |     
 96 | def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
 97 |     """Get the last computer_call_output message from a messages list.
 98 |     
 99 |     Args:
100 |         messages: List of messages to search through
101 |         
102 |     Returns:
103 |         The last computer_call_output message dict, or None if not found
104 |     """
105 |     for message in reversed(messages):
106 |         if isinstance(message, dict) and message.get("type") == "computer_call_output":
107 |             return message
108 |     return None
109 | 
110 | def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
111 |     """Prepare tools for the OmniParser (SoM) loop in OpenAI function-calling format"""
112 |     omniparser_tools = []
113 |     id2xy = dict()
114 |     
115 |     for schema in tool_schemas:
116 |         if schema["type"] == "computer":
117 |             omniparser_tools.append(SOM_TOOL_SCHEMA)
118 |             if "id2xy" in schema:
119 |                 id2xy = schema["id2xy"]
120 |             else:
121 |                 schema["id2xy"] = id2xy
122 |         elif schema["type"] == "function":
123 |             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
124 |             # Schema should be: {type, name, description, parameters}
125 |             omniparser_tools.append({ "type": "function", **schema["function"] })
126 |     
127 |     return omniparser_tools, id2xy
128 | 
129 | async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
130 |   item_type = item.get("type")
131 | 
132 |   def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
133 |     if element_id is None:
134 |       return (None, None)
135 |     return id2xy.get(element_id, (None, None))
136 | 
137 |   if item_type == "function_call":
138 |     fn_name = item.get("name")
139 |     fn_args = json.loads(item.get("arguments", "{}"))
140 | 
141 |     item_id = item.get("id")
142 |     call_id = item.get("call_id")
143 |     
144 |     if fn_name == "computer":
145 |       action = fn_args.get("action")
146 |       element_id = fn_args.get("element_id")
147 |       start_element_id = fn_args.get("start_element_id")
148 |       end_element_id = fn_args.get("end_element_id")
149 |       text = fn_args.get("text")
150 |       keys = fn_args.get("keys")
151 |       button = fn_args.get("button")
152 |       scroll_x = fn_args.get("scroll_x")
153 |       scroll_y = fn_args.get("scroll_y")
154 | 
155 |       x, y = _get_xy(element_id)
156 |       start_x, start_y = _get_xy(start_element_id)
157 |       end_x, end_y = _get_xy(end_element_id)
158 | 
159 |       action_args = {
160 |           "type": action,
161 |           "x": x,
162 |           "y": y,
163 |           "start_x": start_x,
164 |           "start_y": start_y,
165 |           "end_x": end_x,
166 |           "end_y": end_y,
167 |           "text": text,
168 |           "keys": keys,
169 |           "button": button,
170 |           "scroll_x": scroll_x,
171 |           "scroll_y": scroll_y
172 |         }
173 |       # Remove None values to keep the JSON clean
174 |       action_args = {k: v for k, v in action_args.items() if v is not None}
175 | 
176 |       return [{
177 |         "type": "computer_call",
178 |         "action": action_args,
179 |         "id": item_id,
180 |         "call_id": call_id,
181 |         "status": "completed"
182 |       }]
183 | 
184 |   return [item]
185 | 
186 | async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]):
187 |     """
188 |     Convert computer_call back to function_call format.
189 |     Also handles computer_call_output -> function_call_output conversion.
190 |     
191 |     Args:
192 |         item: The item to convert
193 |         xy2id: Mapping from (x, y) coordinates to element IDs
194 |     """
195 |     item_type = item.get("type")
196 | 
197 |     def _get_element_id(x: Optional[float], y: Optional[float]) -> Optional[int]:
198 |         """Get element ID from coordinates, return None if coordinates are None"""
199 |         if x is None or y is None:
200 |             return None
201 |         return xy2id.get((x, y))
202 | 
203 |     if item_type == "computer_call":
204 |         action_data = item.get("action", {})
205 |         
206 |         # Extract coordinates and convert back to element IDs
207 |         element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
208 |         start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
209 |         end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
210 |         
211 |         # Build function arguments
212 |         fn_args = {
213 |             "action": action_data.get("type"),
214 |             "element_id": element_id,
215 |             "start_element_id": start_element_id,
216 |             "end_element_id": end_element_id,
217 |             "text": action_data.get("text"),
218 |             "keys": action_data.get("keys"),
219 |             "button": action_data.get("button"),
220 |             "scroll_x": action_data.get("scroll_x"),
221 |             "scroll_y": action_data.get("scroll_y")
222 |         }
223 |         
224 |         # Remove None values to keep the JSON clean
225 |         fn_args = {k: v for k, v in fn_args.items() if v is not None}
226 |         
227 |         return [{
228 |             "type": "function_call",
229 |             "name": "computer",
230 |             "arguments": json.dumps(fn_args),
231 |             "id": item.get("id"),
232 |             "call_id": item.get("call_id"),
233 |             "status": "completed",
234 | 
235 |             # Fall back to string representation
236 |             "content": f"Used tool: {action_data.get('type')}({json.dumps(fn_args)})"
237 |         }]
238 |     
239 |     elif item_type == "computer_call_output":
240 |         # Simple conversion: computer_call_output -> function_call_output
241 |         return [{
242 |             "type": "function_call_output",
243 |             "call_id": item.get("call_id"),
244 |             "content": [item.get("output")],
245 |             "id": item.get("id"),
246 |             "status": "completed"
247 |         }]
248 | 
249 |     return [item]
250 | 
251 | 
252 | @register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
253 | class OmniparserConfig(AsyncAgentConfig):
254 |     """Omniparser agent configuration implementing AsyncAgentConfig protocol."""
255 |     
256 |     async def predict_step(
257 |         self,
258 |         messages: List[Dict[str, Any]],
259 |         model: str,
260 |         tools: Optional[List[Dict[str, Any]]] = None,
261 |         max_retries: Optional[int] = None,
262 |         stream: bool = False,
263 |         computer_handler=None,
264 |         use_prompt_caching: Optional[bool] = False,
265 |         _on_api_start=None,
266 |         _on_api_end=None,
267 |         _on_usage=None,
268 |         _on_screenshot=None,
269 |         **kwargs
270 |     ) -> Dict[str, Any]:
271 |         """
272 |         OmniParser (Set-of-Marks) agent loop using liteLLM responses.
273 |
274 |         Annotates screenshots with numbered elements so any liteLLM model can act by element ID.
275 |         """
276 |         if not OMNIPARSER_AVAILABLE:
277 |             raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
278 |           
279 |         tools = tools or []
280 |         
281 |         llm_model = model.split('+')[-1]
282 | 
283 |         # Prepare tools for OpenAI API
284 |         openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
285 | 
286 |         # Find last computer_call_output
287 |         last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
288 |         if last_computer_call_output:
289 |             image_url = last_computer_call_output.get("output", {}).get("image_url", "")
290 |             image_data = image_url.split(",")[-1]
291 |             if image_data:
292 |                 parser = get_parser()
293 |                 result = parser.parse(image_data)
294 |                 if _on_screenshot:
295 |                     await _on_screenshot(result.annotated_image_base64, "annotated_image")
296 |                 for element in result.elements:
297 |                     id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
298 |         
299 |         # handle computer calls -> function calls
300 |         new_messages = []
301 |         for message in messages:
302 |             if not isinstance(message, dict):
303 |                 message = message.__dict__
304 |             new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
305 |         messages = new_messages
306 | 
307 |         # Prepare API call kwargs
308 |         api_kwargs = {
309 |             "model": llm_model,
310 |             "input": messages,
311 |             "tools": openai_tools if openai_tools else None,
312 |             "stream": stream,
313 |             "truncation": "auto",
314 |             "num_retries": max_retries,
315 |             **kwargs
316 |         }
317 |         
318 |         # Call API start hook
319 |         if _on_api_start:
320 |             await _on_api_start(api_kwargs)
321 |         
322 |         print(str(api_kwargs)[:1000])  # Debug: log a truncated view of the request payload
323 | 
324 |         # Use liteLLM responses
325 |         response = await litellm.aresponses(**api_kwargs)
326 | 
327 |         # Call API end hook
328 |         if _on_api_end:
329 |             await _on_api_end(api_kwargs, response)
330 | 
331 |         # Extract usage information
332 |         usage = {
333 |             **response.usage.model_dump(), # type: ignore
334 |             "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
335 |         }
336 |         if _on_usage:
337 |             await _on_usage(usage)
338 | 
339 |         # handle som function calls -> xy computer calls
340 |         new_output = []
341 |         for i in range(len(response.output)): # type: ignore
342 |           new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
343 |         
344 |         return {
345 |             "output": new_output,
346 |             "usage": usage
347 |         }
348 |     
349 |     async def predict_click(
350 |         self,
351 |         model: str,
352 |         image_b64: str,
353 |         instruction: str,
354 |         **kwargs
355 |     ) -> Optional[Tuple[float, float]]:
356 |         """
357 |         Predict click coordinates using OmniParser and LLM.
358 |         
359 |         Uses OmniParser to annotate the image with element IDs, then uses LLM
360 |         to identify the correct element ID based on the instruction.
361 |         """
362 |         if not OMNIPARSER_AVAILABLE:
363 |             return None
364 |         
365 |         # Parse the image with OmniParser to get annotated image and elements
366 |         parser = get_parser()
367 |         result = parser.parse(image_b64)
368 |         
369 |         # Extract the LLM model from composed model string
370 |         llm_model = model.split('+')[-1]
371 |         
372 |         # Create system prompt for element ID prediction
373 |         SYSTEM_PROMPT = f'''
374 | You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.
375 | 
376 | The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.
377 | 
378 | Output only the element ID as a single integer.
379 | '''.strip()
380 |         
381 |         # Prepare messages for LLM
382 |         messages = [
383 |             {
384 |                 "role": "system",
385 |                 "content": SYSTEM_PROMPT
386 |             },
387 |             {
388 |                 "role": "user",
389 |                 "content": [
390 |                     {
391 |                         "type": "image_url",
392 |                         "image_url": {
393 |                             "url": f"data:image/png;base64,{result.annotated_image_base64}"
394 |                         }
395 |                     },
396 |                     {
397 |                         "type": "text",
398 |                         "text": f"Find the element: {instruction}"
399 |                     }
400 |                 ]
401 |             }
402 |         ]
403 |         
404 |         # Call LLM to predict element ID
405 |         response = await litellm.acompletion(
406 |             model=llm_model,
407 |             messages=messages,
408 |             max_tokens=10,
409 |             temperature=0.1
410 |         )
411 |         
412 |         # Extract element ID from response
413 |         response_text = response.choices[0].message.content.strip() # type: ignore
414 |         
415 |         # Try to parse the element ID
416 |         try:
417 |             element_id = int(response_text)
418 |             
419 |             # Find the element with this ID and return its center coordinates
420 |             for element in result.elements:
421 |                 if element.id == element_id:
422 |                     center_x = (element.bbox.x1 + element.bbox.x2) / 2
423 |                     center_y = (element.bbox.y1 + element.bbox.y2) / 2
424 |                     return (center_x, center_y)
425 |         except ValueError:
426 |             # If we can't parse the ID, return None
427 |             pass
428 |             
429 |         return None
430 |     
431 |     def get_capabilities(self) -> List[AgentCapability]:
432 |         """Return the capabilities supported by this agent."""
433 |         return ["step"]
434 | 
```
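The two converters above form a round trip: the LLM only ever sees `element_id`s from the Set-of-Marks annotation, while the computer interface receives concrete pixel coordinates resolved through `id2xy`. A minimal sketch of that round trip, with invented coordinates and an import path assumed from the file location above:

```python
import asyncio
import json

# Assumed import path, based on the file location shown above.
from agent.loops.omniparser import (
    replace_computer_call_with_function,
    replace_function_with_computer_call,
)

# Element ID -> screen-center coordinates (normally produced by OmniParser).
id2xy = {7: (412.0, 305.5)}
xy2id = {v: k for k, v in id2xy.items()}

function_call = {
    "type": "function_call",
    "name": "computer",
    "arguments": json.dumps({"action": "click", "element_id": 7, "button": "left"}),
    "id": "fc_1",
    "call_id": "call_1",
}

async def demo():
    # SoM-style function call -> pixel-level computer_call for the executor
    computer_call = (await replace_function_with_computer_call(function_call, id2xy))[0]
    print(computer_call["action"])  # {'type': 'click', 'x': 412.0, 'y': 305.5, 'button': 'left'}

    # ...and back, so the stored conversation stays in function-call form
    restored = (await replace_computer_call_with_function(computer_call, xy2id))[0]
    print(restored["arguments"])    # {"action": "click", "element_id": 7, "button": "left"}

asyncio.run(demo())
```

In `predict_step`, this `id2xy` map is shared through the computer tool schema and updated from each newly annotated screenshot; IDs that cannot be resolved fall back to `(None, None)` and are stripped from the action arguments.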
Page 11/21