#
tokens: 48568/50000 26/616 files (page 6/20)
lines: off (toggle) GitHub
raw markdown copy
This is page 6 of 20. Use http://codebase.md/trycua/cua?page={x} to view the full context.

# Directory Structure

```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── bump-version.yml
│       ├── ci-lume.yml
│       ├── docker-publish-cua-linux.yml
│       ├── docker-publish-cua-windows.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── link-check.yml
│       ├── lint.yml
│       ├── npm-publish-cli.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       ├── python-tests.yml
│       ├── test-cua-models.yml
│       └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│   ├── docs.code-workspace
│   ├── extensions.json
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   ├── py.code-workspace
│   └── settings.json
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── cloud-windows-ga-macos-preview.md
│   ├── composite-agents.md
│   ├── computer-use-agents-for-growth-hacking.md
│   ├── cua-hackathon.md
│   ├── cua-playground-preview.md
│   ├── cua-vlm-router.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cli.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── neurips-2025-cua-papers.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .env.example
│   ├── .gitignore
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── observability.mdx
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── cua-vlm-router.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   ├── telemetry.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── cli-playbook
│   │       │   ├── commands.mdx
│   │       │   ├── index.mdx
│   │       │   └── meta.json
│   │       ├── computer-sdk
│   │       │   ├── cloud-vm-management.mdx
│   │       │   ├── commands.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── meta.json
│   │       │   ├── sandboxed-python.mdx
│   │       │   └── tracing-api.mdx
│   │       ├── example-usecases
│   │       │   ├── form-filling.mdx
│   │       │   ├── gemini-complex-ui-navigation.mdx
│   │       │   ├── meta.json
│   │       │   ├── post-event-contact-export.mdx
│   │       │   └── windows-app-behind-vpn.mdx
│   │       ├── get-started
│   │       │   ├── meta.json
│   │       │   └── quickstart.mdx
│   │       ├── index.mdx
│   │       ├── macos-vm-cli-playbook
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   └── meta.json
│   │       └── meta.json
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── bg-dark.jpg
│   │       ├── bg-light.jpg
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── grounding-with-gemini3.gif
│   │       ├── hero.png
│   │       ├── laminar_trace_example.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   ├── posthog
│   │   │   │   │   └── [...path]
│   │   │   │   │       └── route.ts
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   ├── llms.txt
│   │   │   │   └── route.ts
│   │   │   ├── robots.ts
│   │   │   └── sitemap.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── analytics-tracker.tsx
│   │   │   ├── cookie-consent.tsx
│   │   │   ├── doc-actions-menu.tsx
│   │   │   ├── editable-code-block.tsx
│   │   │   ├── footer.tsx
│   │   │   ├── hero.tsx
│   │   │   ├── iou.tsx
│   │   │   ├── mermaid.tsx
│   │   │   └── page-feedback.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   ├── mdx-components.tsx
│   │   └── providers
│   │       └── posthog-provider.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── browser_tool_example.py
│   ├── cloud_api_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── tracing_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── cua_adapter.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── gelato.py
│   │   │   │   │   ├── gemini.py
│   │   │   │   │   ├── generic_vlm.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   ├── uiins.py
│   │   │   │   │   ├── uitars.py
│   │   │   │   │   └── uitars2.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── tools
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── browser_tool.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_computer_agent.py
│   │   ├── bench-ui
│   │   │   ├── bench_ui
│   │   │   │   ├── __init__.py
│   │   │   │   ├── api.py
│   │   │   │   └── child.py
│   │   │   ├── examples
│   │   │   │   ├── folder_example.py
│   │   │   │   ├── gui
│   │   │   │   │   ├── index.html
│   │   │   │   │   ├── logo.svg
│   │   │   │   │   └── styles.css
│   │   │   │   ├── output_overlay.png
│   │   │   │   └── simple_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       └── test_port_detection.py
│   │   ├── computer
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── types.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── tracing_wrapper.py
│   │   │   │   ├── tracing.py
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_computer.py
│   │   ├── computer-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── browser.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   ├── utils
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── wallpaper.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   ├── test_connection.py
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_server.py
│   │   ├── core
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_telemetry.py
│   │   ├── mcp-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── build-extension.py
│   │   │   ├── CONCURRENT_SESSIONS.md
│   │   │   ├── desktop-extension
│   │   │   │   ├── cua-extension.mcpb
│   │   │   │   ├── desktop_extension.png
│   │   │   │   ├── manifest.json
│   │   │   │   ├── README.md
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── run_server.sh
│   │   │   │   └── setup.py
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── server.py
│   │   │   │   └── session_manager.py
│   │   │   ├── pdm.lock
│   │   │   ├── pyproject.toml
│   │   │   ├── QUICK_TEST_COMMANDS.sh
│   │   │   ├── quick_test_local_option.py
│   │   │   ├── README.md
│   │   │   ├── scripts
│   │   │   │   ├── install_mcp_server.sh
│   │   │   │   └── start_mcp_server.sh
│   │   │   ├── test_mcp_server_local_option.py
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_mcp_server.py
│   │   ├── pylume
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_pylume.py
│   │   └── som
│   │       ├── .bumpversion.cfg
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           ├── conftest.py
│   │           └── test_omniparser.py
│   ├── qemu-docker
│   │   ├── linux
│   │   │   ├── Dockerfile
│   │   │   ├── README.md
│   │   │   └── src
│   │   │       ├── entry.sh
│   │   │       └── vm
│   │   │           ├── image
│   │   │           │   └── README.md
│   │   │           └── setup
│   │   │               ├── install.sh
│   │   │               ├── setup-cua-server.sh
│   │   │               └── setup.sh
│   │   ├── README.md
│   │   └── windows
│   │       ├── Dockerfile
│   │       ├── README.md
│   │       └── src
│   │           ├── entry.sh
│   │           └── vm
│   │               ├── image
│   │               │   └── README.md
│   │               └── setup
│   │                   ├── install.bat
│   │                   ├── on-logon.ps1
│   │                   ├── setup-cua-server.ps1
│   │                   ├── setup-utils.psm1
│   │                   └── setup.ps1
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── cua-cli
│   │   │   ├── .gitignore
│   │   │   ├── .prettierrc
│   │   │   ├── bun.lock
│   │   │   ├── CLAUDE.md
│   │   │   ├── index.ts
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── auth.ts
│   │   │   │   ├── cli.ts
│   │   │   │   ├── commands
│   │   │   │   │   ├── auth.ts
│   │   │   │   │   └── sandbox.ts
│   │   │   │   ├── config.ts
│   │   │   │   ├── http.ts
│   │   │   │   ├── storage.ts
│   │   │   │   └── util.ts
│   │   │   └── tsconfig.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Development.md
│       ├── Dockerfile
│       ├── Dockerfile.dev
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│   ├── install-cli.ps1
│   ├── install-cli.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   ├── run-docker-dev.sh
│   └── typescript-typecheck.js
├── TESTING.md
├── tests
│   ├── agent_loop_testing
│   │   ├── agent_test.py
│   │   └── README.md
│   ├── pytest.ini
│   ├── shell_cmd.py
│   ├── test_files.py
│   ├── test_mcp_server_session_management.py
│   ├── test_mcp_server_streaming.py
│   ├── test_shell_bash.py
│   ├── test_telemetry.py
│   ├── test_tracing.py
│   ├── test_venv.py
│   └── test_watchdog.py
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/.github/workflows/pypi-publish-computer.yml:
--------------------------------------------------------------------------------

```yaml
# Publishes the cua-computer Python package to PyPI. Runs on a "computer-v*"
# tag push, a manual workflow_dispatch, or a workflow_call from another
# workflow; the latter two supply the version explicitly via inputs.
name: Publish Computer Package

on:
  push:
    tags:
      - "computer-v*"
  workflow_dispatch:
    inputs:
      version:
        description: "Version to publish (without v prefix)"
        required: true
        default: "0.1.0"
  workflow_call:
    inputs:
      version:
        description: "Version to publish"
        required: true
        type: string

# Adding permissions at workflow level
permissions:
  contents: write

jobs:
  # Resolves the version to publish and refreshes the cua-core dependency
  # constraint in pyproject.toml; its outputs feed the publish job below.
  prepare:
    runs-on: macos-latest
    outputs:
      version: ${{ steps.get-version.outputs.version }}
      core_version: ${{ steps.update-deps.outputs.core_version }}
    steps:
      - uses: actions/checkout@v4

      - name: Determine version
        id: get-version
        run: |
          echo "=== Version Detection Debug ==="
          echo "Event name: ${{ github.event_name }}"
          echo "Workflow call version: ${{ inputs.version }}"
          echo "Workflow dispatch version: ${{ github.event.inputs.version }}"
          echo "GitHub ref: ${{ github.ref }}"

          # Check inputs.version first (works for workflow_call regardless of event_name)
          if [ -n "${{ inputs.version }}" ]; then
            # Version provided via workflow_call or workflow_dispatch with version input
            VERSION=${{ inputs.version }}
            echo "Using inputs.version: $VERSION"
          elif [ "${{ github.event_name }}" == "push" ]; then
            # Extract version from tag (for package-specific tags)
            if [[ "${{ github.ref }}" =~ ^refs/tags/computer-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
              VERSION=${BASH_REMATCH[1]}
              echo "Extracted from tag: $VERSION"
            else
              echo "Invalid tag format for computer"
              exit 1
            fi
          elif [ -n "${{ github.event.inputs.version }}" ]; then
            # Use version from workflow_dispatch event inputs
            VERSION=${{ github.event.inputs.version }}
            echo "Using event.inputs.version: $VERSION"
          else
            echo "ERROR: No version found!"
            echo "  - inputs.version is empty"
            echo "  - event.inputs.version is empty"
            echo "  - Not a tag push event"
            exit 1
          fi

          echo "=== Final Version ==="
          echo "VERSION=$VERSION"
          echo "version=$VERSION" >> $GITHUB_OUTPUT

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      # Queries PyPI for the latest cua-core release and rewrites the
      # "cua-core>=X,<Y" constraint in pyproject.toml to match it.
      - name: Update dependencies to latest versions
        id: update-deps
        run: |
          cd libs/python/computer
          # Install required package for PyPI API access
          pip install requests

          # Create a more robust Python script for PyPI version checking
          # (heredoc is quoted, so the script is written verbatim).
          cat > get_latest_versions.py << 'EOF'
          import requests
          import json
          import sys

          def get_package_version(package_name, fallback="0.1.0"):
              """Return the latest released version of *package_name* on PyPI.

              Falls back to *fallback* when the API is unreachable, times out,
              returns a non-200 status, or the payload is malformed, so the
              workflow proceeds with a sane default instead of failing.
              Diagnostics go to stderr so stdout carries only the version.
              """
              try:
                  # timeout prevents the CI job from hanging indefinitely when
                  # PyPI is unresponsive; a timeout raises and falls through to
                  # the except branch, which returns the fallback version.
                  response = requests.get(
                      f'https://pypi.org/pypi/{package_name}/json', timeout=30
                  )
                  print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr)

                  if response.status_code != 200:
                      print(f"API request failed for {package_name}, using fallback version", file=sys.stderr)
                      return fallback

                  data = json.loads(response.text)

                  if 'info' not in data:
                      print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr)
                      return fallback

                  return data['info']['version']
              except Exception as e:
                  print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr)
                  return fallback

          # Get latest versions
          print(get_package_version('cua-core'))
          EOF

          # Execute the script to get the versions
          # (stdout holds one version per line; stderr carries diagnostics)
          VERSIONS=($(python get_latest_versions.py))
          LATEST_CORE=${VERSIONS[0]}

          echo "Latest cua-core version: $LATEST_CORE"

          # Output the versions for the next job
          echo "core_version=$LATEST_CORE" >> $GITHUB_OUTPUT

          # Determine major version for version constraint
          # (upper bound is the next major, i.e. ">=X.Y.Z,<(X+1).0.0")
          CORE_MAJOR=$(echo $LATEST_CORE | cut -d. -f1)
          NEXT_CORE_MAJOR=$((CORE_MAJOR + 1))

          # Update dependencies in pyproject.toml
          if [[ "$OSTYPE" == "darwin"* ]]; then
            # macOS version of sed needs an empty string for -i
            sed -i '' "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml
          else
            # Linux version
            sed -i "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml
          fi

          # Display the updated dependencies
          echo "Updated dependencies in pyproject.toml:"
          grep -E "cua-core" pyproject.toml

  # Delegates the actual build/upload to the shared reusable publish workflow.
  publish:
    needs: prepare
    uses: ./.github/workflows/pypi-reusable-publish.yml
    with:
      package_name: "computer"
      package_dir: "libs/python/computer"
      version: ${{ needs.prepare.outputs.version }}
      is_lume_package: false
      base_package_name: "cua-computer"
    secrets:
      PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}

  # Exposes the resolved core version as an env var for any follow-up steps.
  set-env-variables:
    needs: [prepare, publish]
    runs-on: macos-latest
    steps:
      - name: Set environment variables for use in other jobs
        run: |
          echo "CORE_VERSION=${{ needs.prepare.outputs.core_version }}" >> $GITHUB_ENV

--------------------------------------------------------------------------------
/libs/python/computer/computer/providers/winsandbox/setup_script.ps1:
--------------------------------------------------------------------------------

```
# Setup script for Windows Sandbox CUA Computer provider
# This script runs when the sandbox starts

Write-Host "Starting CUA Computer setup in Windows Sandbox..."

# Function to find the mapped Python installation from pywinsandbox
function Find-MappedPython {
    # Locate a usable Python interpreter inside the Windows Sandbox.
    # Returns either a full path to python.exe (from a pywinsandbox-mapped
    # shared folder) or a bare command name (e.g. "py") resolvable via PATH.
    # Throws if nothing that answers `--version` with "Python" can be found.
    Write-Host "Looking for mapped Python installation from pywinsandbox..."
    
    # pywinsandbox maps the host Python installation to the sandbox
    # Look for mapped shared folders on the desktop (common pywinsandbox pattern)
    $desktopPath = "C:\Users\WDAGUtilityAccount\Desktop"
    $sharedFolders = Get-ChildItem -Path $desktopPath -Directory -ErrorAction SilentlyContinue
    
    # Search order: top level of each shared folder first, then one level of
    # subdirectories, then PATH-based commands as a last resort.
    foreach ($folder in $sharedFolders) {
        # Look for Python executables in shared folders
        $pythonPaths = @(
            "$($folder.FullName)\python.exe",
            "$($folder.FullName)\Scripts\python.exe",
            "$($folder.FullName)\bin\python.exe"
        )
        
        foreach ($pythonPath in $pythonPaths) {
            if (Test-Path $pythonPath) {
                try {
                    # Probe the candidate; a broken/mismatched binary is skipped.
                    $version = & $pythonPath --version 2>&1
                    if ($version -match "Python") {
                        Write-Host "Found mapped Python: $pythonPath - $version"
                        return $pythonPath
                    }
                } catch {
                    continue
                }
            }
        }
        
        # Also check subdirectories that might contain Python
        $subDirs = Get-ChildItem -Path $folder.FullName -Directory -ErrorAction SilentlyContinue
        foreach ($subDir in $subDirs) {
            $pythonPath = "$($subDir.FullName)\python.exe"
            if (Test-Path $pythonPath) {
                try {
                    $version = & $pythonPath --version 2>&1
                    if ($version -match "Python") {
                        Write-Host "Found mapped Python in subdirectory: $pythonPath - $version"
                        return $pythonPath
                    }
                } catch {
                    continue
                }
            }
        }
    }
    
    # Fallback: try common Python commands that might be available
    $pythonCommands = @("python", "py", "python3")
    foreach ($cmd in $pythonCommands) {
        try {
            $version = & $cmd --version 2>&1
            if ($version -match "Python") {
                Write-Host "Found Python via command '$cmd': $version"
                return $cmd
            }
        } catch {
            continue
        }
    }
    
    throw "Could not find any Python installation (mapped or otherwise)"
}

# Main setup flow: find Python, build a persistent venv on the mapped Desktop
# cache folder, install cua-computer-server into it, and launch the server in
# the background. Any failure is reported by the catch block, which waits for
# a keypress before exiting so the error stays visible in the sandbox window.
try {
    # Step 1: Find the mapped Python installation
    Write-Host "Step 1: Finding mapped Python installation..."
    $pythonExe = Find-MappedPython
    Write-Host "Using Python: $pythonExe"
    
    # Verify Python works and show version
    $pythonVersion = & $pythonExe --version 2>&1
    Write-Host "Python version: $pythonVersion"

    # Step 2: Create a dedicated virtual environment in mapped Desktop folder (persistent)
    # The venv lives in a host-mapped folder so it survives sandbox restarts.
    Write-Host "Step 2: Creating virtual environment (if needed)..."
    $cachePath = "C:\Users\WDAGUtilityAccount\Desktop\wsb_cache"
    $venvPath = "C:\Users\WDAGUtilityAccount\Desktop\wsb_cache\venv"
    if (!(Test-Path $venvPath)) {
        Write-Host "Creating venv at: $venvPath"
        & $pythonExe -m venv $venvPath
    } else {
        Write-Host "Venv already exists at: $venvPath"
    }
    # Hide the folder to keep Desktop clean
    try {
        $item = Get-Item $cachePath -ErrorAction SilentlyContinue
        if ($item) {
            # Only set the Hidden bit if it isn't set yet, preserving other attributes.
            if (-not ($item.Attributes -band [IO.FileAttributes]::Hidden)) {
                $item.Attributes = $item.Attributes -bor [IO.FileAttributes]::Hidden
            }
        }
    } catch { }
    $venvPython = Join-Path $venvPath "Scripts\python.exe"
    if (!(Test-Path $venvPython)) {
        throw "Virtual environment Python not found at $venvPython"
    }
    Write-Host "Using venv Python: $venvPython"

    # Step 3: Install cua-computer-server into the venv
    Write-Host "Step 3: Installing cua-computer-server..."
    
    Write-Host "Upgrading pip..."
    & $venvPython -m pip install --upgrade pip --quiet
    
    Write-Host "Installing cua-computer-server..."
    & $venvPython -m pip install cua-computer-server
    
    Write-Host "cua-computer-server installation completed."

    # Step 4: Start computer server in background using the venv Python
    Write-Host "Step 4: Starting computer server in background..."
    Write-Host "Starting computer server with: $venvPython"
    
    # Start the computer server in the background
    $serverProcess = Start-Process -FilePath $venvPython -ArgumentList "-m", "computer_server.main" -WindowStyle Hidden -PassThru
    Write-Host "Computer server started in background with PID: $($serverProcess.Id)"
    
    # Give it a moment to start
    Start-Sleep -Seconds 3
    
    # Check if the process is still running; an immediate exit usually means a
    # startup error (e.g. import failure) inside computer_server.main.
    if (Get-Process -Id $serverProcess.Id -ErrorAction SilentlyContinue) {
        Write-Host "Computer server is running successfully in background"
    } else {
        throw "Computer server failed to start or exited immediately"
    }

} catch {
    Write-Error "Setup failed: $_"
    Write-Host "Error details: $($_.Exception.Message)"
    Write-Host "Stack trace: $($_.ScriptStackTrace)"
    Write-Host ""
    Write-Host "Press any key to close this window..."
    $null = $Host.UI.RawUI.ReadKey("NoEcho,IncludeKeyDown")
    exit 1
}

```

--------------------------------------------------------------------------------
/libs/python/som/som/ocr.py:
--------------------------------------------------------------------------------

```python
import logging
import signal
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union

import easyocr
import numpy as np
import torch
from PIL import Image

logger = logging.getLogger(__name__)


class TimeoutException(Exception):
    """Raised when the OCR call exceeds the allotted wall-clock time."""


@contextmanager
def timeout(seconds: int):
    """Context manager that aborts the wrapped block after `seconds` seconds.

    Uses SIGALRM, which is only usable from the main thread of the main
    interpreter and only on POSIX platforms. In any other situation (worker
    thread, or Windows where `signal.SIGALRM` does not exist) the timeout is
    disabled and the block runs unguarded — previously the Windows case
    crashed with AttributeError instead of degrading gracefully.

    Raises:
        TimeoutException: if the block is still running when the alarm fires.
    """
    import threading

    # SIGALRM can only be armed from the main thread and only where it exists.
    can_use_alarm = threading.current_thread() is threading.main_thread() and hasattr(
        signal, "SIGALRM"
    )

    if can_use_alarm:

        def timeout_handler(signum, frame):
            raise TimeoutException("OCR process timed out")

        original_handler = signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(seconds)

        try:
            yield
        finally:
            # Always disarm the alarm and restore the previous handler.
            signal.alarm(0)
            signal.signal(signal.SIGALRM, original_handler)
    else:
        # No safe signal mechanism available; run without a timeout.
        logger.warning(
            "Timeout function called from non-main thread or platform without "
            "SIGALRM; signal-based timeout disabled"
        )
        yield


class OCRProcessor:
    """Class for handling OCR text detection.

    Wraps EasyOCR with:
      * automatic device selection (cuda > mps > cpu),
      * a class-level shared reader so the expensive EasyOCR model load
        happens at most once per process,
      * a best-effort timeout around `readtext` via the module's `timeout`
        context manager.
    """

    # Class-level cache: one EasyOCR reader shared by all instances.
    _shared_reader = None  # Class-level shared reader instance

    def __init__(self) -> None:
        """Initialize the OCR processor and select the best available device."""
        # Reader is created lazily on first detect_text() call.
        self.reader = None
        # Determine best available device
        self.device = "cpu"
        if torch.cuda.is_available():
            self.device = "cuda"
        elif (
            hasattr(torch, "backends")
            and hasattr(torch.backends, "mps")
            and torch.backends.mps.is_available()
        ):
            self.device = "mps"
        logger.info(f"OCR processor initialized with device: {self.device}")

    def _ensure_reader(self) -> None:
        """Ensure EasyOCR reader is initialized.

        Uses a class-level cached reader to avoid reinitializing on every instance.

        Raises:
            RuntimeError: if EasyOCR initialization fails.
        """
        # First check if we already have a class-level reader
        if OCRProcessor._shared_reader is not None:
            self.reader = OCRProcessor._shared_reader
            return

        # Otherwise initialize a new one
        if self.reader is None:
            try:
                logger.info("Initializing EasyOCR reader...")
                # Local import keeps the (slow) easyocr import off the module
                # import path until a reader is actually needed.
                import easyocr

                # Use GPU if available
                use_gpu = self.device in ["cuda", "mps"]
                self.reader = easyocr.Reader(["en"], gpu=use_gpu)

                # Verify reader initialization
                if self.reader is None:
                    raise ValueError("Failed to initialize EasyOCR reader")

                # Cache the reader at class level
                OCRProcessor._shared_reader = self.reader

                logger.info(f"EasyOCR reader initialized successfully with GPU={use_gpu}")
            except Exception as e:
                logger.error(f"Failed to initialize EasyOCR reader: {str(e)}")
                # Set to a placeholder that will be checked
                self.reader = None
                raise RuntimeError(f"EasyOCR initialization failed: {str(e)}") from e

    def detect_text(
        self, image: Image.Image, confidence_threshold: float = 0.5, timeout_seconds: int = 5
    ) -> List[Dict[str, Any]]:
        """Detect text in an image using EasyOCR.

        Args:
            image: PIL Image to process
            confidence_threshold: Minimum confidence for text detection
            timeout_seconds: Maximum time to wait for OCR

        Returns:
            List of text detection dictionaries with keys "type", "bbox"
            (normalized [x1, y1, x2, y2] in 0..1), "content", "confidence",
            and "interactivity". Returns [] on any failure or timeout —
            this method never raises.
        """
        try:
            # Try to initialize reader, catch any exceptions
            try:
                self._ensure_reader()
            except Exception as e:
                logger.error(f"Failed to initialize OCR reader: {str(e)}")
                return []

            # Ensure reader was properly initialized
            if self.reader is None:
                logger.error("OCR reader is None after initialization")
                return []

            # Convert PIL Image to numpy array
            image_np = np.array(image)

            try:
                with timeout(timeout_seconds):
                    results = self.reader.readtext(
                        image_np, paragraph=False, text_threshold=confidence_threshold
                    )
            except TimeoutException:
                logger.warning("OCR timed out")
                return []
            except Exception as e:
                logger.warning(f"OCR failed: {str(e)}")
                return []

            detections = []
            img_width, img_height = image.size

            for box, text, conf in results:
                # Ensure conf is float
                conf_float = float(conf)
                if conf_float < confidence_threshold:
                    continue

                # Convert box format to [x1, y1, x2, y2]
                # Ensure box points are properly typed as float.
                # Coordinates are normalized by image size so the bbox is
                # resolution-independent (values in 0..1).
                x1 = min(float(point[0]) for point in box) / img_width
                y1 = min(float(point[1]) for point in box) / img_height
                x2 = max(float(point[0]) for point in box) / img_width
                y2 = max(float(point[1]) for point in box) / img_height

                detections.append(
                    {
                        "type": "text",
                        "bbox": [x1, y1, x2, y2],
                        "content": text,
                        "confidence": conf,
                        "interactivity": False,  # Text is typically non-interactive
                    }
                )

            return detections
        except Exception as e:
            logger.error(f"Unexpected error in OCR processing: {str(e)}")
            return []

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/gelato.py:
--------------------------------------------------------------------------------

```python
"""
Gelato agent loop implementation for click prediction using litellm.acompletion
Model: https://huggingface.co/mlfoundations/Gelato-30B-A3B
Code: https://github.com/mlfoundations/Gelato/tree/main
"""

import base64
import math
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple

import litellm
from PIL import Image

from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..types import AgentCapability

SYSTEM_PROMPT = """
You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. For elements with area, return the center point.

Output the coordinate pair exactly:
(x,y)
"""


def extract_coordinates(raw_string):
    """Extract the first "(x,y)" coordinate pair from a model response.

    Args:
        raw_string: Raw model output, e.g. "(100, 200)" or "(10.5, 20.25)".

    Returns:
        Tuple (x, y) as floats, or (0, 0) if no coordinate pair is found
        or the input is unusable.
    """
    try:
        matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
        x, y = matches[0]
        # float() (not int()) so fractional coordinates such as "(10.5, 20.25)"
        # — which the regex deliberately matches — are parsed instead of
        # silently falling back to (0, 0) via int()'s ValueError.
        return float(x), float(y)
    except (IndexError, TypeError, ValueError):
        # IndexError: no match; TypeError: non-string input; ValueError: defensive.
        return 0, 0


def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 3136,
    max_pixels: int = 8847360,
) -> Tuple[int, int]:
    """Rescale (height, width) so the pixel count lies within bounds and both
    sides are positive multiples of `factor` (qwen_vl_utils-style).

    Dimensions are scaled uniformly when the total pixel count is out of
    [min_pixels, max_pixels], rounded down to the nearest multiple of
    `factor`, and finally clamped so neither side collapses to zero. The
    clamp now also applies to the in-bounds branch, which previously could
    return a 0 dimension for a side smaller than `factor` (e.g. a 20x200
    image yielded height 0).

    Args:
        height: Original image height in pixels.
        width: Original image width in pixels.
        factor: Patch size the output dimensions must be divisible by.
        min_pixels: Lower bound on total pixels before upscaling kicks in.
        max_pixels: Upper bound on total pixels before downscaling kicks in.

    Returns:
        Tuple of (new_height, new_width).
    """
    total_pixels = height * width

    if min_pixels <= total_pixels <= max_pixels:
        # Already within bounds: just snap down to the factor grid.
        scale = 1.0
    elif total_pixels > max_pixels:
        # Too large: shrink uniformly to fit under max_pixels.
        scale = (max_pixels / total_pixels) ** 0.5
    else:
        # Too small: grow uniformly to reach min_pixels.
        scale = (min_pixels / total_pixels) ** 0.5

    new_height = (int(height * scale) // factor) * factor
    new_width = (int(width * scale) // factor) * factor

    # Ensure minimum size regardless of which branch produced the values.
    new_height = max(new_height, factor)
    new_width = max(new_width, factor)

    return new_height, new_width


@register_agent(models=r".*Gelato.*")
class GelatoConfig(AsyncAgentConfig):
    """Gelato agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self) -> None:
        """Initialize per-instance state."""
        # NOTE(review): neither attribute is read or written elsewhere in this
        # class — they appear to be placeholders for future stateful use.
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Not supported: Gelato is a click-prediction-only agent (see
        get_capabilities), so full-step prediction is intentionally unimplemented.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[float, float]]:
        """
        Predict click coordinates using the Gelato model via litellm.acompletion.

        The screenshot is smart-resized to the model's pixel budget before the
        call, and the predicted point is scaled back to the original image size.

        Args:
            model: The Gelato model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click

        Returns:
            Tuple of (x, y) coordinates (floored to ints); (0, 0) if the
            model response contains no parseable coordinate pair
        """
        # Decode base64 image
        image_data = base64.b64decode(image_b64)
        image = Image.open(BytesIO(image_data))
        width, height = image.width, image.height

        # Smart resize the image (similar to qwen_vl_utils)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=28,  # Default factor for Qwen models
            min_pixels=3136,
            max_pixels=4096 * 2160,
        )
        resized_image = image.resize((resized_width, resized_height))
        # Factors to map coordinates in resized space back to the original image.
        scale_x, scale_y = width / resized_width, height / resized_height

        # Convert resized image back to base64
        buffered = BytesIO()
        resized_image.save(buffered, format="PNG")
        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()

        # Prepare system and user messages
        system_message = {
            "role": "system",
            "content": [{"type": "text", "text": SYSTEM_PROMPT.strip()}],
        }

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
                {"type": "text", "text": instruction},
            ],
        }

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
            **kwargs,
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Extract response text
        output_text = response.choices[0].message.content  # type: ignore

        # Extract and rescale coordinates back into original-image space
        pred_x, pred_y = extract_coordinates(output_text)  # type: ignore
        pred_x *= scale_x
        pred_y *= scale_y

        return (math.floor(pred_x), math.floor(pred_y))

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]

```

--------------------------------------------------------------------------------
/.github/workflows/pypi-publish-mcp-server.yml:
--------------------------------------------------------------------------------

```yaml
name: Publish MCP Server Package

# Triggered by mcp-server-v* tags, manually, or as a reusable workflow.
on:
  push:
    tags:
      - "mcp-server-v*"
  workflow_dispatch:
    inputs:
      version:
        description: "Version to publish (without v prefix)"
        required: true
        default: "0.1.0"
  workflow_call:
    inputs:
      version:
        description: "Version to publish"
        required: true
        type: string
    outputs:
      version:
        description: "The version that was published"
        value: ${{ jobs.prepare.outputs.version }}

# Adding permissions at workflow level
permissions:
  contents: write

jobs:
  # Resolve the version to publish and pin cua-agent / cua-computer
  # dependency ranges in pyproject.toml to their latest PyPI releases.
  prepare:
    runs-on: macos-latest
    outputs:
      version: ${{ steps.get-version.outputs.version }}
      agent_version: ${{ steps.update-deps.outputs.agent_version }}
      computer_version: ${{ steps.update-deps.outputs.computer_version }}
    steps:
      - uses: actions/checkout@v4

      - name: Determine version
        id: get-version
        run: |
          if [ "${{ github.event_name }}" == "push" ]; then
            # Extract version from tag (for package-specific tags)
            if [[ "${{ github.ref }}" =~ ^refs/tags/mcp-server-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
              VERSION=${BASH_REMATCH[1]}
            else
              echo "Invalid tag format for mcp-server"
              exit 1
            fi
          elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
            # Use version from workflow dispatch
            VERSION=${{ github.event.inputs.version }}
          else
            # Use version from workflow_call
            VERSION=${{ inputs.version }}
          fi
          echo "VERSION=$VERSION"
          echo "version=$VERSION" >> $GITHUB_OUTPUT

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      - name: Update dependencies to latest versions
        id: update-deps
        run: |
          cd libs/python/mcp-server

          # Install required package for PyPI API access
          pip install requests

          # Create a Python script for PyPI version checking
          cat > get_latest_versions.py << 'EOF'
          import requests
          import json
          import sys

          def get_package_version(package_name, fallback="0.1.0"):
              try:
                  response = requests.get(f'https://pypi.org/pypi/{package_name}/json')
                  print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr)
                  
                  if response.status_code != 200:
                      print(f"API request failed for {package_name}, using fallback version", file=sys.stderr)
                      return fallback
                  
                  data = json.loads(response.text)
                  
                  if 'info' not in data:
                      print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr)
                      return fallback
                      
                  return data['info']['version']
              except Exception as e:
                  print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr)
                  return fallback

          # Get latest versions
          print(get_package_version('cua-agent'))
          print(get_package_version('cua-computer'))
          EOF

          # Execute the script to get the versions
          VERSIONS=($(python get_latest_versions.py))
          LATEST_AGENT=${VERSIONS[0]}
          LATEST_COMPUTER=${VERSIONS[1]}

          echo "Latest cua-agent version: $LATEST_AGENT"
          echo "Latest cua-computer version: $LATEST_COMPUTER"

          # Output the versions for the next job
          echo "agent_version=$LATEST_AGENT" >> $GITHUB_OUTPUT
          echo "computer_version=$LATEST_COMPUTER" >> $GITHUB_OUTPUT

          # Determine major version for version constraint
          AGENT_MAJOR=$(echo $LATEST_AGENT | cut -d. -f1)
          COMPUTER_MAJOR=$(echo $LATEST_COMPUTER | cut -d. -f1)

          NEXT_AGENT_MAJOR=$((AGENT_MAJOR + 1))
          NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1))

          # Update dependencies in pyproject.toml
          if [[ "$OSTYPE" == "darwin"* ]]; then
            # macOS version of sed needs an empty string for -i
            # Update cua-agent with all extras
            sed -i '' "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml
            sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
          else
            # Linux version
            sed -i "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml
            sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
          fi

          # Display the updated dependencies
          echo "Updated dependencies in pyproject.toml:"
          grep -E "cua-agent|cua-computer" pyproject.toml

  # Build and upload the package via the shared reusable publish workflow.
  publish:
    needs: prepare
    uses: ./.github/workflows/pypi-reusable-publish.yml
    with:
      package_name: "mcp-server"
      package_dir: "libs/python/mcp-server"
      version: ${{ needs.prepare.outputs.version }}
      is_lume_package: false
      base_package_name: "cua-mcp-server"
    secrets:
      PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}

  # Expose the resolved dependency versions as environment variables.
  set-env-variables:
    needs: [prepare, publish]
    runs-on: macos-latest
    steps:
      - name: Set environment variables for use in other jobs
        run: |
          echo "AGENT_VERSION=${{ needs.prepare.outputs.agent_version }}" >> $GITHUB_ENV
          echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> $GITHUB_ENV
```

--------------------------------------------------------------------------------
/libs/python/mcp-server/build-extension.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
Build script for CUA Desktop Extension (.mcpb file)

This script:
1. Creates a temporary build directory
2. Copies necessary files from mcp_server/ to the build directory
3. Copies manifest and other static files
4. Creates a .mcpb (zip) file
5. Cleans up the temporary directory

Usage:
    python build-extension.py
"""

import os
import shutil
import subprocess
import sys
import tempfile
import zipfile
from pathlib import Path


def main():
    """Build the desktop extension (.mcpb archive).

    Steps:
      1. Copy the MCP server sources and static assets into a temp build dir.
      2. Zip the build dir into desktop-extension/cua-extension.mcpb.
      3. Best-effort: set a platform-specific custom file icon.

    Exits with status 1 if any required source file is missing.
    """
    # Get the script directory (libs/python/mcp-server)
    script_dir = Path(__file__).parent

    # Define paths
    output_dir = script_dir / "desktop-extension"
    output_file = output_dir / "cua-extension.mcpb"

    # Source directories
    mcp_server_dir = script_dir / "mcp_server"

    # Static files shipped with the extension: archive name -> source path.
    files_to_copy = {
        "manifest.json": output_dir / "manifest.json",
        "desktop_extension.png": output_dir / "desktop_extension.png",
        "requirements.txt": output_dir / "requirements.txt",
        "run_server.sh": output_dir / "run_server.sh",
        "setup.py": output_dir / "setup.py",
    }

    # MCP server files to copy
    mcp_server_files = [
        "server.py",
        "session_manager.py",
    ]

    print("Building CUA Desktop Extension...")
    print(f"  Output: {output_file}")

    # Create temporary build directory
    with tempfile.TemporaryDirectory(prefix="cua-extension-build-") as build_dir:
        build_path = Path(build_dir)

        # Copy MCP server files
        print("  Copying MCP server files...")
        for filename in mcp_server_files:
            src = mcp_server_dir / filename
            dst = build_path / filename
            if src.exists():
                shutil.copy2(src, dst)
                # Bug fix: report the actual filename instead of the literal
                # placeholder text "(unknown)".
                print(f"    ✓ {filename}")
            else:
                print(f"    ✗ {filename} (not found)")
                sys.exit(1)

        # Copy static files from desktop-extension directory
        print("  Copying static files...")
        for src_name, src_path in files_to_copy.items():
            if src_path.exists():
                dst = build_path / src_name
                # Special handling for shell script - ensure executable
                shutil.copy2(src_path, dst)
                if src_name.endswith(".sh"):
                    os.chmod(dst, 0o755)
                print(f"    ✓ {src_name}")
            else:
                print(f"    ✗ {src_name} (not found)")
                sys.exit(1)

        # Validate manifest.json exists
        manifest_path = build_path / "manifest.json"
        if not manifest_path.exists():
            print("  ✗ manifest.json not found in build directory")
            sys.exit(1)

        # Create the .mcpb file (zip archive)
        print("  Creating .mcpb archive...")
        with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zipf:
            # Add all files from build directory to the zip
            for root, dirs, files in os.walk(build_path):
                # Skip __pycache__ and other unwanted directories
                dirs[:] = [d for d in dirs if d not in ["__pycache__", ".git"]]

                for file in files:
                    file_path = Path(root) / file
                    # Use relative path from build directory as archive name
                    arcname = file_path.relative_to(build_path)
                    zipf.write(file_path, arcname)
                    print(f"    ✓ Added {arcname}")

        print(f"✓ Build complete: {output_file}")
        print(f"  Archive size: {output_file.stat().st_size / 1024:.1f} KB")

        # Set custom file icon based on platform (best-effort, never fatal)
        icon_file = output_dir / "desktop_extension.png"
        if sys.platform == "darwin":
            _set_icon_macos(output_file, icon_file)
        elif sys.platform == "win32":
            _set_icon_windows(output_file, icon_file)
        elif sys.platform.startswith("linux"):
            _set_icon_linux(output_file, icon_file)


def _set_icon_macos(output_file: Path, icon_file: Path):
    """Set custom file icon on macOS."""
    try:
        # Check if fileicon is installed
        result = subprocess.run(["which", "fileicon"], capture_output=True, text=True)
        if result.returncode == 0:
            # Use the logo as the file icon
            if icon_file.exists():
                print("  Setting custom file icon (macOS)...")
                subprocess.run(
                    ["fileicon", "set", str(output_file), str(icon_file)],
                    check=False,
                    capture_output=True,
                )
                print("    ✓ File icon set")
            else:
                print(f"    ⚠ Icon file not found: {icon_file}")
        else:
            print("  ⚠ fileicon not installed (optional - for custom file icon)")
            print("    Install with: brew install fileicon")
    except Exception as e:
        print(f"  ⚠ Could not set file icon: {e}")


def _set_icon_windows(output_file: Path, icon_file: Path):
    """Set custom file icon on Windows."""
    try:
        # Windows uses a desktop.ini approach, which is complex
        # For simplicity, we'll skip this for now
        print("  ⚠ Custom file icons not supported on Windows yet")
    except Exception as e:
        print(f"  ⚠ Could not set file icon: {e}")


def _set_icon_linux(output_file: Path, icon_file: Path):
    """Set custom file icon on Linux."""
    try:
        # Linux uses .desktop files and thumbnail generation
        # This is complex and depends on the desktop environment
        print("  ⚠ Custom file icons not supported on Linux yet")
    except Exception as e:
        print(f"  ⚠ Could not set file icon: {e}")


if __name__ == "__main__":
    main()

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/integrations/hud/__init__.py:
--------------------------------------------------------------------------------

```python
"""HUD integration: dataset runners and MCP-based computer agent export.

This module exposes helpers to evaluate HUD-compatible datasets and exports
the MCP-compatible computer agent implementation.

Exports:
- run_single_task(dataset, ...)
- run_full_dataset(dataset, ...)
- MCPComputerAgent
"""

import time
from typing import Any, Optional

from agent.computers import is_agent_computer
from datasets import Dataset, load_dataset
from hud import trace
from hud.datasets import Task, run_dataset

from .agent import MCPComputerAgent

# ---------------------------------------------------------------------------
# Single-task runner
# ---------------------------------------------------------------------------


async def run_single_task(
    dataset: str | Dataset | list[dict[str, Any]],
    *,
    task_id: int = 0,
    model: str | None = None,
    allowed_tools: list[str] | None = None,
    # === ComputerAgent kwargs ===
    tools: list[Any] | None = None,
    custom_loop: Any | None = None,
    only_n_most_recent_images: int | None = None,
    callbacks: list[Any] | None = None,
    instructions: str | None = None,
    verbosity: int | None = None,
    trajectory_dir: str | dict | None = None,
    max_retries: int | None = 3,
    screenshot_delay: float | int = 0.5,
    use_prompt_caching: bool | None = False,
    max_trajectory_budget: float | dict | None = None,
    telemetry_enabled: bool | None = True,
) -> None:
    """Execute a single dataset sample with MCPComputerAgent.

    ``dataset`` may be a HF dataset name, a loaded ``Dataset``, or a list of
    task dicts; ``task_id`` selects the sample to run.
    """
    # Normalize the three accepted dataset forms into something indexable:
    # a name is loaded from the hub, a DatasetDict is narrowed to "train",
    # and a plain list is used as-is.
    if isinstance(dataset, str):
        dataset = load_dataset(dataset, split="train")  # type: ignore[arg-type]
    elif not isinstance(dataset, list):
        dataset = dataset["train"]

    sample_task = dataset[task_id]  # type: ignore[index]
    task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]

    # Drop any caller-supplied Computer tools: the eval framework injects
    # its own Computer tool for each task.
    if tools:
        tools = [candidate for candidate in tools if not is_agent_computer(candidate)]

    with trace(name=task_prompt):
        task = Task(**sample_task)  # type: ignore[arg-type]

        agent = MCPComputerAgent(
            model=model or "computer-use-preview",
            allowed_tools=allowed_tools or ["openai_computer"],
            # === ComputerAgent kwargs passthrough ===
            tools=tools,
            custom_loop=custom_loop,
            only_n_most_recent_images=only_n_most_recent_images,
            callbacks=callbacks,
            instructions=instructions,
            verbosity=verbosity,
            trajectory_dir=trajectory_dir,
            max_retries=max_retries,
            screenshot_delay=screenshot_delay,
            use_prompt_caching=use_prompt_caching,
            max_trajectory_budget=max_trajectory_budget,
            telemetry_enabled=telemetry_enabled,
        )
        print(f"Running: {task_prompt}")
        result = await agent.run(task, max_steps=10)
        print(f"✅ Reward: {result.reward}")


# ---------------------------------------------------------------------------
# Full-dataset runner
# ---------------------------------------------------------------------------


async def run_full_dataset(
    dataset: str | Dataset | list[dict[str, Any]],
    *,
    job_name: Optional[str] = None,
    model: str | None = None,
    allowed_tools: list[str] | None = None,
    max_concurrent: int = 30,
    max_steps: int = 50,
    split: str = "train",
    trajectory_dir: str | dict | None = None,
    # === ComputerAgent kwargs ===
    tools: list[Any] | None = None,
    custom_loop: Any | None = None,
    only_n_most_recent_images: int | None = 5,
    callbacks: list[Any] | None = None,
    instructions: str | None = None,
    verbosity: int | None = None,
    max_retries: int | None = 3,
    screenshot_delay: float | int = 0.5,
    use_prompt_caching: bool | None = False,
    max_trajectory_budget: float | dict | None = None,
    telemetry_enabled: bool | None = True,
) -> list[Any]:
    """Run evaluation across the entire dataset using hud.datasets.run_dataset.

    Accepts either a HuggingFace dataset identifier (loaded with the given
    *split*), a ``Dataset`` object, or a plain list of task dicts, and fans the
    tasks out to ``MCPComputerAgent`` instances via ``run_dataset``.
    """
    # Resolve the dataset object plus a human-readable name for job metadata.
    if isinstance(dataset, str):
        dataset_name = dataset.rsplit("/", 1)[-1]
        job_name = job_name or f"Evaluation {dataset_name}"
        dataset = load_dataset(dataset, split=split)  # type: ignore[arg-type]
    else:
        dataset_name = "custom"
        job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"

    # Drop any caller-supplied Computer tools; the eval framework attaches its
    # own Computer tool per task.
    if tools:
        tools = [candidate for candidate in tools if not is_agent_computer(candidate)]

    # Everything the per-task agents need, forwarded verbatim.
    agent_config = {
        "model": model,
        "allowed_tools": allowed_tools,
        "trajectory_dir": trajectory_dir,
        # === ComputerAgent kwargs passthrough ===
        "tools": tools,
        "custom_loop": custom_loop,
        "only_n_most_recent_images": only_n_most_recent_images,
        "callbacks": callbacks,
        "instructions": instructions,
        "verbosity": verbosity,
        "max_retries": max_retries,
        "screenshot_delay": screenshot_delay,
        "use_prompt_caching": use_prompt_caching,
        "max_trajectory_budget": max_trajectory_budget,
        "telemetry_enabled": telemetry_enabled,
    }

    # Execute evaluation
    return await run_dataset(
        name=job_name,
        dataset=dataset,
        agent_class=MCPComputerAgent,
        agent_config=agent_config,
        max_concurrent=max_concurrent,
        metadata={"dataset": dataset_name},
        max_steps=max_steps,
        auto_respond=True,
    )


__all__ = [
    "run_single_task",
    "run_full_dataset",
    "MCPComputerAgent",
]

```

--------------------------------------------------------------------------------
/libs/lume/tests/VMTests.swift:
--------------------------------------------------------------------------------

```swift
import Foundation
import Testing

@testable import lume

// Test double for ProcessRunner: records every invocation instead of
// spawning a real process.
class MockProcessRunner: ProcessRunner {
    // One entry per run() call: (executable path, argument list), in call order.
    var runCalls: [(executable: String, arguments: [String])] = []

    func run(executable: String, arguments: [String]) throws {
        runCalls.append((executable, arguments))
    }
}

// Populates tempDir with the minimal on-disk layout of an initialized VM:
// a 1MB mock disk image, a 1KB mock nvram file, a saved VMConfig with a
// fixed MAC address, and the ".initialized" marker file. Returns the
// resulting VMDirectory.
private func setupVMDirectory(_ tempDir: URL) throws -> VMDirectory {
    let vmDir = VMDirectory(Path(tempDir.path))

    // Create disk image file
    let diskPath = vmDir.diskPath
    let diskData = Data(repeating: 0, count: 1024 * 1024)  // 1MB mock disk
    try diskData.write(to: diskPath.url)

    // Create nvram file
    let nvramPath = vmDir.nvramPath
    let nvramData = Data(repeating: 0, count: 1024)  // 1KB mock nvram
    try nvramData.write(to: nvramPath.url)

    // Create initial config file
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    try vmDir.saveConfig(config)

    // Create .initialized file to mark VM as initialized
    let initializedPath = vmDir.dir.file(".initialized")
    try Data().write(to: initializedPath.url)

    return vmDir
}

// Verifies that a freshly constructed MockVM (with mocked virtualization
// and VNC services) reports the expected initial details: directory name,
// OS string, "stopped" status, and no VNC URL.
@MainActor
@Test("VM initialization and configuration")
func testVMInitialization() async throws {
    // Arrange: a VM directory on disk plus an in-memory config.
    let tempDir = try createTempDirectory()
    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")  // Set MAC address to avoid nil
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    // Both service factories return mocks, so no real VM is started.
    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    // Test initial state
    let details = vm.details
    #expect(details.name == vmDir.name)
    #expect(details.os == "mock-os")
    #expect(details.status == "stopped")
    #expect(details.vncUrl == nil)
}

// Starts a MockVM in a background Task, waits one second, then stops it and
// cancels the run task.
@MainActor
@Test("VM run and stop operations")
func testVMRunAndStop() async throws {
    let tempDir = try createTempDirectory()
    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 2,
        memorySize: 2048,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    // Test running VM
    // NOTE(review): runTask's result is never awaited, so an error thrown by
    // vm.run would go unobserved by this test — confirm that is intentional.
    let runTask = Task {
        try await vm.run(
            noDisplay: false, sharedDirectories: [], mount: nil as Path?, vncPort: 0,
            recoveryMode: false)
    }

    // Give the VM time to start
    try await Task.sleep(nanoseconds: UInt64(1e9))

    // Test stopping VM
    try await vm.stop()
    runTask.cancel()
}

// Verifies that setCpuCount / setMemorySize / setMacAddress each update the
// corresponding field in the VM's config.
@MainActor
@Test("VM configuration updates")
func testVMConfigurationUpdates() async throws {
    let tempDir = try createTempDirectory()
    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    // Test CPU count update
    try vm.setCpuCount(4)
    #expect(vm.vmDirContext.config.cpuCount == 4)

    // Test memory size update
    try vm.setMemorySize(4096)
    #expect(vm.vmDirContext.config.memorySize == 4096)

    // Test MAC address update
    try vm.setMacAddress("00:11:22:33:44:66")
    #expect(vm.vmDirContext.config.macAddress == "00:11:22:33:44:66")
}

// Runs vm.setup with explicit CPU/memory/disk/display values and verifies
// the config reflects them afterwards; the MAC address set beforehand must
// survive setup unchanged.
@MainActor
@Test("VM setup process")
func testVMSetup() async throws {
    let tempDir = try createTempDirectory()
    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    let expectedDiskSize: UInt64 = 64 * 1024 * 1024 * 1024  // 64 GB

    try await vm.setup(
        ipswPath: "/path/to/mock.ipsw",
        cpuCount: 2,
        memorySize: 2048,
        diskSize: expectedDiskSize,
        display: "1024x768"
    )

    #expect(vm.vmDirContext.config.cpuCount == 2)
    #expect(vm.vmDirContext.config.memorySize == 2048)
    let actualDiskSize = vm.vmDirContext.config.diskSize ?? 0
    #expect(
        actualDiskSize == expectedDiskSize,
        "Expected disk size \(expectedDiskSize), but got \(actualDiskSize)")
    #expect(vm.vmDirContext.config.macAddress == "00:11:22:33:44:55")
}

// Creates a uniquely-named temporary directory for a single test and
// returns its URL.
private func createTempDirectory() throws -> URL {
    let fm = FileManager.default
    let dir = fm.temporaryDirectory.appendingPathComponent(UUID().uuidString)
    try fm.createDirectory(at: dir, withIntermediateDirectories: true)
    return dir
}

```

--------------------------------------------------------------------------------
/tests/agent_loop_testing/agent_test.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
Simple CUA Agent Test

Tests the actual CUA ComputerAgent SDK with a mock computer.
Only provides screenshot functionality - no complex computer actions.
"""

import asyncio
import base64
import sys
from io import BytesIO
from pathlib import Path

from PIL import Image, ImageDraw

# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))


class MockComputer:
    """Mock computer that only provides screenshots.

    Serves one pre-rendered 1920x1080 desktop image for every screenshot
    request; all input actions are no-ops that merely sleep briefly so the
    agent loop can run without a real machine.
    """

    def __init__(self):
        # Counts screenshot() calls; reported in the test summary.
        self.action_count = 0
        # Base64-encoded PNG served by screenshot(), rendered once up front.
        self._image = self._create_image()

    def _create_image(self) -> str:
        """Create a simple desktop image with two clickable-looking icons."""
        img = Image.new("RGB", (1920, 1080), color="lightblue")
        draw = ImageDraw.Draw(img)

        # Draw Safari icon
        draw.rectangle([100, 950, 150, 1000], fill="blue", outline="black", width=2)
        draw.text((110, 960), "Safari", fill="white")

        # Draw Terminal icon
        draw.rectangle([200, 950, 250, 1000], fill="green", outline="black", width=2)
        draw.text((210, 960), "Terminal", fill="white")

        # Convert to base64
        img_bytes = BytesIO()
        img.save(img_bytes, format="PNG")
        return base64.b64encode(img_bytes.getvalue()).decode("utf-8")

    async def screenshot(self) -> str:
        """Return the cached desktop image and bump the call counter."""
        self.action_count += 1
        return self._image

    async def get_dimensions(self) -> tuple[int, int]:
        # Matches the dimensions used in _create_image.
        return (1920, 1080)

    # All other methods are no-ops (required by CUA interface)
    async def click(self, x: int, y: int, button: str = "left") -> None:
        await asyncio.sleep(0.1)

    async def double_click(self, x: int, y: int) -> None:
        await asyncio.sleep(0.1)

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        await asyncio.sleep(0.1)

    async def type(self, text: str) -> None:
        await asyncio.sleep(0.1)

    async def wait(self, ms: int = 1000) -> None:
        # Honors the requested wait duration (milliseconds).
        await asyncio.sleep(ms / 1000.0)

    async def move(self, x: int, y: int) -> None:
        await asyncio.sleep(0.1)

    async def keypress(self, keys) -> None:
        await asyncio.sleep(0.1)

    async def drag(self, path) -> None:
        await asyncio.sleep(0.1)

    async def get_current_url(self) -> str:
        return "desktop://mock"

    async def get_environment(self) -> str:
        return "mac"

    # Required abstract methods
    async def left_mouse_down(self, x: int = 0, y: int = 0) -> None:
        await asyncio.sleep(0.1)

    async def left_mouse_up(self, x: int = 0, y: int = 0) -> None:
        await asyncio.sleep(0.1)

    async def right_mouse_down(self, x: int = 0, y: int = 0) -> None:
        await asyncio.sleep(0.1)

    async def right_mouse_up(self, x: int = 0, y: int = 0) -> None:
        await asyncio.sleep(0.1)

    async def mouse_move(self, x: int, y: int) -> None:
        await asyncio.sleep(0.1)

    async def key_down(self, key: str) -> None:
        await asyncio.sleep(0.1)

    async def key_up(self, key: str) -> None:
        await asyncio.sleep(0.1)

    async def type_text(self, text: str) -> None:
        await asyncio.sleep(0.1)


async def test_cua_agent(model_name: str):
    """Test CUA agent with mock computer.

    Creates a ComputerAgent wired to a MockComputer, runs one task for at
    most 5 iterations, prints what the agent produced, and returns True on
    success / False on any failure.
    """
    print(f"🤖 Testing CUA Agent: {model_name}")
    print("=" * 50)

    try:
        # Import the real CUA agent
        from agent import ComputerAgent

        # Create mock computer
        mock_computer = MockComputer()

        # Create the real CUA ComputerAgent
        agent = ComputerAgent(model=model_name, tools=[mock_computer], max_trajectory_budget=5.0)

        print("✅ CUA Agent created")
        print("✅ Mock computer ready")
        print("🚀 Running agent...")
        print()

        # Run the agent with a specific task
        message = "Open Safari browser"

        iteration = 0
        async for result in agent.run([{"role": "user", "content": message}]):
            iteration += 1
            print(f"Iteration {iteration}:")

            # Print agent output; the two empty-output prints were previously
            # split across duplicate `if not output_items` checks — merged here.
            output_items = result.get("output", [])
            if not output_items:
                print("  (No output from agent)")
                # Surface the raw result so empty iterations can be diagnosed.
                print(f"  Debug - Full result: {result}")
            else:
                for item in output_items:
                    if item["type"] == "message":
                        print(f"  Agent: {item['content'][0]['text']}")
                    elif item["type"] == "tool_call":
                        print(f"  Tool: {item.get('tool_name')} {item.get('arguments')}")
                    else:
                        print(f"  Unknown output type: {item}")

            # Let the agent decide when to stop (it should try to complete the task)
            # Only stop after 5 iterations to prevent infinite loops
            if iteration >= 5:
                print("🏁 Stopping after 5 iterations (safety limit)")
                break

        print()
        print("=" * 50)
        print("🎉 TEST COMPLETE!")
        print("=" * 50)
        print(f"✅ Model: {model_name}")
        print(f"✅ Iterations: {iteration}")
        print(f"✅ Screenshots: {mock_computer.action_count}")
        print("✅ Agent executed successfully")

        return True

    except ImportError as e:
        print(f"❌ Import error: {e}")
        print("💡 Install CUA: pip install -e libs/python/agent -e libs/python/computer")
        return False
    except Exception as e:
        print(f"❌ Test failed: {e}")
        return False


if __name__ == "__main__":
    import argparse

    # CLI entry point: choose the model to exercise, run the async test, and
    # surface the outcome through the process exit code.
    arg_parser = argparse.ArgumentParser(description="Test CUA Agent with mock computer")
    arg_parser.add_argument(
        "--model", default="anthropic/claude-sonnet-4-5-20250929", help="CUA model to test"
    )
    cli_args = arg_parser.parse_args()

    ok = asyncio.run(test_cua_agent(cli_args.model))
    sys.exit(0 if ok else 1)

```

--------------------------------------------------------------------------------
/libs/python/bench-ui/bench_ui/api.py:
--------------------------------------------------------------------------------

```python
import json
import os
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, Optional
from urllib import request
from urllib.error import HTTPError, URLError

import psutil

# Map child PID -> listening port
_pid_to_port: Dict[int, int] = {}


def _post_json(url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """POST *payload* as JSON to *url* and return the decoded JSON response.

    Never raises on network/HTTP failure: errors come back as dicts with an
    "error" key so callers can poll and retry.
    """
    body = json.dumps(payload).encode("utf-8")
    req = request.Request(
        url, data=body, headers={"Content-Type": "application/json"}, method="POST"
    )
    try:
        with request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except HTTPError as exc:
        # The server may still have sent a JSON error body; prefer that.
        try:
            raw = (exc.read() or b"").decode("utf-8", errors="ignore")
            return json.loads(raw)
        except Exception:
            return {"error": "http_error", "status": getattr(exc, "code", None)}
    except URLError as exc:
        return {"error": "url_error", "reason": str(exc.reason)}


def _detect_port_for_pid(pid: int) -> int:
    """Detect a listening local TCP port for the given PID using psutil.

    Scans system-wide TCP connections, keeps only sockets owned by *pid* that
    are in LISTEN state on a loopback or wildcard address, and returns the
    first matching port.

    Raises:
        RuntimeError: If no suitable listening port is found for *pid*.
    """
    # Fix: the previous `if psutil is None` guard was dead code — psutil is
    # imported unconditionally at module top, so the name can never be None
    # (a missing package would already have failed the module import).

    # Addresses we accept as "local" listeners (loopback + wildcard, v4/v6).
    local_addrs = ("127.0.0.1", "::1", "0.0.0.0", "::")

    # Scan system-wide connections and filter by PID
    for conn in psutil.net_connections(kind="tcp"):
        if getattr(conn, "pid", None) != pid:
            continue
        laddr = getattr(conn, "laddr", None)
        if not laddr or not isinstance(laddr, tuple) or len(laddr) < 2:
            continue
        if str(getattr(conn, "status", "")).upper() != "LISTEN":
            continue
        lip, lport = laddr[0], int(laddr[1])
        if lip in local_addrs:
            return lport

    raise RuntimeError(f"Could not detect listening port for pid {pid}")


def launch_window(
    url: Optional[str] = None,
    *,
    html: Optional[str] = None,
    folder: Optional[str] = None,
    title: str = "Window",
    x: Optional[int] = None,
    y: Optional[int] = None,
    width: int = 600,
    height: int = 400,
    icon: Optional[str] = None,
    use_inner_size: bool = False,
    title_bar_style: str = "default",
) -> int:
    """Create a pywebview window in a child process and return its PID.

    Preferred input is a URL via the positional `url` parameter.
    To load inline HTML instead, pass `html=...`.
    To serve a static folder, pass `folder=...` (path to directory).

    Spawns `python -m bench_ui.child` with a JSON config passed via a temp file.
    The child prints a single JSON line: {"pid": <pid>, "port": <port>}.
    We cache pid->port for subsequent control calls like get_element_rect.

    Raises:
        ValueError: If none of url/html/folder is given.
    """
    if not url and not html and not folder:
        raise ValueError("launch_window requires either a url, html, or folder")

    # Full window configuration handed to the child process as JSON.
    config = {
        "url": url,
        "html": html,
        "folder": folder,
        "title": title,
        "x": x,
        "y": y,
        "width": width,
        "height": height,
        "icon": icon,
        "use_inner_size": use_inner_size,
        "title_bar_style": title_bar_style,
    }

    # delete=False so the child can open the file by path after we close it.
    with tempfile.NamedTemporaryFile("w", delete=False, suffix=".json") as f:
        json.dump(config, f)
        cfg_path = f.name

    try:
        # Launch child process
        proc = subprocess.Popen(
            [sys.executable, "-m", "bench_ui.child", cfg_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
        assert proc.stdout is not None
        # Read first line with startup info (blocks until the child prints it).
        # NOTE(review): assumes the child reads cfg_path before emitting this
        # line — otherwise the unlink below could race; confirm in bench_ui.child.
        line = proc.stdout.readline().strip()
        info = json.loads(line)
        pid = int(info["pid"]) if "pid" in info else proc.pid
        port = int(info["port"])  # required
        _pid_to_port[pid] = port
        return pid
    finally:
        # Best-effort cleanup of the temp config file.
        try:
            os.unlink(cfg_path)
        except Exception:
            pass


def get_element_rect(pid: int, selector: str, *, space: str = "window"):
    """Ask the child process to compute element client rect via injected JS.

    Polls the child's /rect endpoint for up to ~3 seconds, retrying on any
    failure (window still initializing, transient errors, empty responses).

    Returns:
        A dict like {"x": float, "y": float, "width": float, "height": float}.

    Raises:
        RuntimeError: If no rect is obtained within the retry window.
    """
    if pid not in _pid_to_port:
        _pid_to_port[pid] = _detect_port_for_pid(pid)
    port = _pid_to_port[pid]
    url = f"http://127.0.0.1:{port}/rect"
    last: Dict[str, Any] = {}
    for _ in range(30):  # 30 * 0.1s ~= 3s total
        resp = _post_json(url, {"selector": selector, "space": space})
        last = resp or {}
        rect = last.get("rect") if isinstance(last, dict) else None
        if rect is not None:
            return rect
        # Every failure mode (window_not_ready, invalid_json, other errors, or
        # an empty/non-dict response) previously went through three separate
        # branches that all did the same sleep-and-retry; collapsed here.
        time.sleep(0.1)
    raise RuntimeError(f"Failed to get element rect: {last}")


def execute_javascript(pid: int, javascript: str):
    """Execute arbitrary JavaScript in the window and return its result.

    Polls the child's /eval endpoint for up to ~3 seconds, retrying while the
    window is still becoming ready or on any transient error.

    Raises:
        RuntimeError: If no result is obtained within the retry window.
    """
    if pid not in _pid_to_port:
        _pid_to_port[pid] = _detect_port_for_pid(pid)
    port = _pid_to_port[pid]
    url = f"http://127.0.0.1:{port}/eval"
    last: Dict[str, Any] = {}
    for _ in range(30):  # 30 * 0.1s ~= 3s total
        resp = _post_json(url, {"javascript": javascript})
        last = resp or {}
        if isinstance(last, dict) and "result" in last:
            return last["result"]
        # All failure modes previously fell through separate branches that each
        # did the same sleep-and-retry; collapsed into one back-off here.
        time.sleep(0.1)
    raise RuntimeError(f"Failed to execute JavaScript: {last}")

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/uiins.py:
--------------------------------------------------------------------------------

```python
"""
UI-Ins agent loop implementation for click prediction using litellm.acompletion
Paper: https://arxiv.org/pdf/2510.202861
Code: https://github.com/alibaba/UI-Ins
"""

import asyncio
import base64
import json
import math
import re
import uuid
from io import BytesIO
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union

import litellm
from PIL import Image

from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..types import AgentCapability, AgentResponse, Messages, Tools

SYSTEM_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\nReturn a json object with a reasoning process in  tags, a function name and arguments within  XML tags:\n```\n\n...\n\n\n{"name": "grounding", "arguments": }\n\n```\n represents the following item of the action space:\n## Action Space{"action": "click", "coordinate": [x, y]}\nYour task is to accurately locate a UI element based on the instruction. You should first analyze instruction in  tags and finally output the function in  tags.\n"""


def parse_coordinates(raw_string: str) -> tuple[int, int]:
    """Extract the first "[x, y]" integer pair from *raw_string*.

    Returns (-1, -1) when no coordinate pair is present.
    """
    match = re.search(r"\[(\d+),\s*(\d+)\]", raw_string)
    if match is None:
        return -1, -1
    return int(match.group(1)), int(match.group(2))


def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 3136,
    max_pixels: int = 8847360,
) -> Tuple[int, int]:
    """Smart resize function similar to qwen_vl_utils.

    Scales (height, width) so the total pixel count lands within
    [min_pixels, max_pixels], then rounds each dimension down to a multiple
    of *factor*, clamping to at least *factor*.

    Returns:
        (new_height, new_width) — both multiples of *factor*, each >= factor.
    """
    total_pixels = height * width

    if min_pixels <= total_pixels <= max_pixels:
        # Already within bounds: just round to the nearest factor.
        new_height = (height // factor) * factor
        new_width = (width // factor) * factor
    else:
        # Scale down (too large) or up (too small) to hit the bound.
        if total_pixels > max_pixels:
            scale = (max_pixels / total_pixels) ** 0.5
        else:
            scale = (min_pixels / total_pixels) ** 0.5
        new_height = (int(height * scale) // factor) * factor
        new_width = (int(width * scale) // factor) * factor

    # Ensure minimum size on BOTH paths. Bug fix: the in-bounds early-return
    # previously skipped this clamp, so e.g. height=1, width=3200 (total in
    # bounds) produced a 0-sized height.
    new_height = max(new_height, factor)
    new_width = max(new_width, factor)

    return new_height, new_width


@register_agent(models=r".*UI-Ins.*")
class UIInsConfig(AsyncAgentConfig):
    """UI-Ins agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        # Kept for interface parity with other loop configs; predict_click
        # does not read either attribute.
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Full-step prediction is not supported; UI-Ins is click-only."""
        raise NotImplementedError()

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[float, float]]:
        """
        Predict click coordinates using UI-Ins model via litellm.acompletion.

        Args:
            model: The UI-Ins model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        # Decode the screenshot to learn its true pixel dimensions.
        image_data = base64.b64decode(image_b64)
        image = Image.open(BytesIO(image_data))
        width, height = image.width, image.height

        # Smart resize the image (similar to qwen_vl_utils). The model sees
        # the resized frame, so predictions must be scaled back afterwards.
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=28,  # Default factor for Qwen models
            min_pixels=3136,
            max_pixels=4096 * 2160,
        )
        resized_image = image.resize((resized_width, resized_height))
        scale_x, scale_y = width / resized_width, height / resized_height

        # Re-encode the resized frame for the API payload.
        buffered = BytesIO()
        resized_image.save(buffered, format="PNG")
        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()

        # Prepare system and user messages
        system_message = {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful assistant."},
                {"type": "text", "text": SYSTEM_PROMPT},
            ],
        }

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
                {"type": "text", "text": instruction},
            ],
        }

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
            **kwargs,
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Extract response text
        output_text = response.choices[0].message.content  # type: ignore

        # Extract coordinates in resized-image space.
        pred_x, pred_y = parse_coordinates(output_text)  # type: ignore
        # Bug fix: parse_coordinates signals failure with (-1, -1). Previously
        # that sentinel was scaled and returned as a bogus click target;
        # honor the documented contract and return None instead.
        if (pred_x, pred_y) == (-1, -1):
            return None

        # Map back to original pixel space.
        return (math.floor(pred_x * scale_x), math.floor(pred_y * scale_y))

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/proxy/examples.py:
--------------------------------------------------------------------------------

```python
"""
Example usage of the proxy server and client requests.
"""

import dotenv

dotenv.load_dotenv()

import asyncio
import json
import os
from typing import Any, Dict

import aiohttp


async def test_http_endpoint():
    """Test the HTTP /responses endpoint.

    Builds three example request payloads, then POSTs a subset of them
    (the multimodal one is currently commented out of the loop) to the
    configured base URL and prints each status and response body.

    Requires the ANTHROPIC_API_KEY and CUA_API_KEY environment variables.
    """

    anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
    assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"

    # Example 1: Simple text request
    simple_request = {
        "model": "anthropic/claude-sonnet-4-5-20250929",
        "input": "Tell me a three sentence bedtime story about a unicorn.",
        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
    }

    # Example 2: Multi-modal request with image
    multimodal_request = {
        "model": "anthropic/claude-sonnet-4-5-20250929",
        "input": [
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "what is in this image?"},
                    {
                        "type": "input_image",
                        "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
                    },
                ],
            }
        ],
        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
    }

    # Example 3: Request with custom agent and computer kwargs
    custom_request = {
        "model": "anthropic/claude-sonnet-4-5-20250929",
        "input": "Take a screenshot and tell me what you see",
        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
    }

    # Test requests — swap base_url to localhost for local testing.
    base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
    # base_url = "http://localhost:8000"
    api_key = os.getenv("CUA_API_KEY")
    assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"

    async with aiohttp.ClientSession() as session:
        for i, request_data in enumerate(
            [
                simple_request,
                # multimodal_request,
                custom_request,
            ],
            1,
        ):
            print(f"\n--- Test {i} ---")
            print(f"Request: {json.dumps(request_data, indent=2)}")

            try:
                print(f"Sending request to {base_url}/responses")
                async with session.post(
                    f"{base_url}/responses",
                    json=request_data,
                    headers={"Content-Type": "application/json", "X-API-Key": api_key},
                ) as response:
                    result = await response.json()
                    print(f"Status: {response.status}")
                    print(f"Response: {json.dumps(result, indent=2)}")

            except Exception as e:
                # Best-effort demo script: report and continue with next request.
                print(f"Error: {e}")


def curl_examples():
    """Print curl command examples.

    Purely illustrative: emits three ready-to-paste curl invocations against
    a locally running proxy (simple text, multimodal, custom configuration).
    """

    print("=== CURL Examples ===\n")

    print("1. Simple text request:")
    print(
        """curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-sonnet-4-5-20250929",
    "input": "Tell me a three sentence bedtime story about a unicorn."
  }'"""
    )

    print("\n2. Multi-modal request with image:")
    print(
        """curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-sonnet-4-5-20250929",
    "input": [
      {
        "role": "user",
        "content": [
          {"type": "input_text", "text": "what is in this image?"},
          {
            "type": "input_image",
            "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
          }
        ]
      }
    ]
  }'"""
    )

    print("\n3. Request with custom configuration:")
    print(
        """curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-sonnet-4-5-20250929",
    "input": "Take a screenshot and tell me what you see",
    "agent_kwargs": {
      "save_trajectory": true,
      "verbosity": 20
    },
    "computer_kwargs": {
      "os_type": "linux",
      "provider_type": "cloud"
    }
  }'"""
    )


async def test_p2p_client():
    """Example P2P client using peerjs-python."""
    try:
        from aiortc import RTCConfiguration, RTCIceServer
        from peerjs import ConnectionEventType, Peer, PeerOptions

        # Configure the client peer against the public PeerJS broker with a
        # Google STUN server for NAT traversal.
        ice_config = RTCConfiguration(
            iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
        )
        peer_options = PeerOptions(
            host="0.peerjs.com",
            port=443,
            secure=True,
            config=ice_config,
        )

        peer = Peer(id="test-client", peer_options=peer_options)
        await peer.start()

        # Open a data connection to the proxy server peer.
        conn = peer.connect("computer-agent-proxy")

        @conn.on(ConnectionEventType.Open)
        async def connection_open():
            print("Connected to proxy server")

            # Send a test request once the channel is open.
            payload = {
                "model": "anthropic/claude-sonnet-4-5-20250929",
                "input": "Hello from P2P client!",
            }
            await conn.send(json.dumps(payload))

        @conn.on(ConnectionEventType.Data)
        async def connection_data(data):
            print(f"Received response: {data}")
            await peer.destroy()

        # Give the event handlers time to fire before returning.
        await asyncio.sleep(10)

    except ImportError:
        print("P2P dependencies not available. Install peerjs-python for P2P testing.")
    except Exception as e:
        print(f"P2P test error: {e}")


if __name__ == "__main__":
    import sys

    # Dispatch on the first CLI argument: "curl" prints example commands,
    # "p2p" runs the peer-to-peer client, anything else hits the HTTP endpoint.
    mode = sys.argv[1] if len(sys.argv) > 1 else ""
    if mode == "curl":
        curl_examples()
    elif mode == "p2p":
        asyncio.run(test_p2p_client())
    else:
        asyncio.run(test_http_endpoint())

```

--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/diorama/safezone.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
UI Safezone Helper - A utility to get accurate bounds for macOS UI elements

This module provides helper functions to get accurate bounds for macOS UI elements
like the menubar and dock, which are needed for proper screenshot composition.
"""

import sys
import time
from typing import Any, Dict, Optional, Tuple

# Import Objective-C bridge libraries
try:
    import AppKit
    import Foundation
    from AppKit import NSRunningApplication, NSWorkspace
    from ApplicationServices import (
        AXUIElementCopyAttributeValue,
        AXUIElementCopyAttributeValues,
        AXUIElementCreateApplication,
        AXUIElementCreateSystemWide,
        AXUIElementGetTypeID,
        AXValueGetType,
        AXValueGetValue,
        kAXChildrenAttribute,
        kAXErrorSuccess,
        kAXMenuBarAttribute,
        kAXPositionAttribute,
        kAXRoleAttribute,
        kAXSizeAttribute,
        kAXTitleAttribute,
        kAXValueCGPointType,
        kAXValueCGSizeType,
    )
except ImportError:
    print("Error: This script requires PyObjC to be installed.")
    print("Please install it with: pip install pyobjc")
    sys.exit(1)

# Constants for accessibility API
kAXErrorSuccess = 0
kAXRoleAttribute = "AXRole"
kAXSubroleAttribute = "AXSubrole"
kAXTitleAttribute = "AXTitle"
kAXPositionAttribute = "AXPosition"
kAXSizeAttribute = "AXSize"
kAXChildrenAttribute = "AXChildren"
kAXMenuBarAttribute = "AXMenuBar"


def element_attribute(element, attribute):
    """Read an attribute from an accessibility (AXUIElement) element.

    For the children attribute, a ranged copy (indices 0..999) is tried first
    so the result comes back as a list; if that fails, control falls through
    to the plain single-value copy below.

    Returns the attribute value, or None if it could not be read.
    """
    if attribute == kAXChildrenAttribute:
        # Ranged variant returns (error_code, values) via PyObjC's
        # pass-by-reference convention (the trailing None is the out-param).
        err, value = AXUIElementCopyAttributeValues(element, attribute, 0, 999, None)
        if err == kAXErrorSuccess:
            if isinstance(value, Foundation.NSArray):
                # Convert the bridged NSArray into a plain Python list.
                return list(value)
            else:
                return value
    err, value = AXUIElementCopyAttributeValue(element, attribute, None)
    if err == kAXErrorSuccess:
        return value
    return None


def element_value(element, type):
    """Unpack an AXValue wrapper into its concrete value.

    Args:
        element: The AXValue object to unpack.
        type: The AXValue type constant (e.g. kAXValueCGPointType).

    Returns:
        The unpacked value (CGPoint, CGSize, ...), or None on failure.
    """
    # AXValueGetValue returns (success_flag, value). The parameter name
    # `type` shadows the builtin but is kept for interface compatibility.
    success, value = AXValueGetValue(element, type, None)
    # Idiomatic truthiness check instead of comparing `== True`.
    return value if success else None


def get_element_bounds(element):
    """Return an accessibility element's frame as a dict.

    Keys are x, y, width, height; any component that cannot be read stays 0.
    """
    frame = {"x": 0, "y": 0, "width": 0, "height": 0}

    # Position (origin) component.
    raw_position = element_attribute(element, kAXPositionAttribute)
    if raw_position:
        point = element_value(raw_position, kAXValueCGPointType)
        if point:
            frame["x"] = point.x
            frame["y"] = point.y

    # Size component.
    raw_size = element_attribute(element, kAXSizeAttribute)
    if raw_size:
        size = element_value(raw_size, kAXValueCGSizeType)
        if size:
            frame["width"] = size.width
            frame["height"] = size.height

    return frame


def find_dock_process():
    """Return the Dock's process identifier, or None if it is not running."""
    for app in NSWorkspace.sharedWorkspace().runningApplications():
        is_dock = (
            app.localizedName() == "Dock"
            and app.bundleIdentifier() == "com.apple.dock"
        )
        if is_dock:
            return app.processIdentifier()
    return None


def get_menubar_bounds():
    """Get the bounds of the macOS menubar

    Returns:
        Dictionary with x, y, width, height of the menubar
    """
    # Ask the system-wide accessibility element for the menubar first.
    menubar = element_attribute(AXUIElementCreateSystemWide(), kAXMenuBarAttribute)

    if menubar is None:
        # Fall back to the frontmost application's menubar.
        front_app = NSWorkspace.sharedWorkspace().frontmostApplication()
        if front_app:
            app_element = AXUIElementCreateApplication(front_app.processIdentifier())
            menubar = element_attribute(app_element, kAXMenuBarAttribute)

    if menubar is None:
        print("Error: Could not get menubar")
        # Fall back to a typical menubar frame so callers can still compose.
        return {"x": 0, "y": 0, "width": 1800, "height": 24}

    return get_element_bounds(menubar)


def get_dock_bounds():
    """Get the bounds of the macOS Dock

    Returns:
        Dictionary with x, y, width, height of the Dock; all zeros when the
        Dock process (or its main item list) cannot be located.
    """
    dock_pid = find_dock_process()
    if dock_pid is None:
        print("Error: Could not find Dock process")
        # Return empty bounds as fallback
        return {"x": 0, "y": 0, "width": 0, "height": 0}

    # Create an accessibility element for the Dock
    dock_element = AXUIElementCreateApplication(dock_pid)
    if dock_element is None:
        print(f"Error: Could not create accessibility element for Dock (PID {dock_pid})")
        return {"x": 0, "y": 0, "width": 0, "height": 0}

    # Get the Dock's children. `not children` already covers both None and
    # an empty list, so no separate length check is needed.
    children = element_attribute(dock_element, kAXChildrenAttribute)
    if not children:
        print("Error: Could not get Dock children")
        return {"x": 0, "y": 0, "width": 0, "height": 0}

    # The Dock's icon strip is exposed as its (usually first) AXList child.
    dock_list = next(
        (child for child in children
         if element_attribute(child, kAXRoleAttribute) == "AXList"),
        None,
    )

    if dock_list is None:
        print("Error: Could not find Dock list")
        return {"x": 0, "y": 0, "width": 0, "height": 0}

    # Get the bounds of the dock list
    return get_element_bounds(dock_list)


def get_ui_element_bounds():
    """Collect the bounds of the menubar and the Dock in one call.

    Returns:
        Dict with keys "menubar" and "dock", each mapping to a bounds dict.
    """
    return {
        "menubar": get_menubar_bounds(),
        "dock": get_dock_bounds(),
    }


if __name__ == "__main__":
    # Example usage: print the computed safezone bounds.
    ui_bounds = get_ui_element_bounds()
    print("Menubar bounds:", ui_bounds["menubar"])
    print("Dock bounds:", ui_bounds["dock"])

```

--------------------------------------------------------------------------------
/docs/content/docs/macos-vm-cli-playbook/lume/cli-reference.mdx:
--------------------------------------------------------------------------------

```markdown
---
title: Lume CLI Reference
description: Command Line Interface reference for Lume
---

import { Callout } from 'fumadocs-ui/components/callout';

Once installed, you can start using Lume with these common workflows:

### Run a Prebuilt VM

```bash
# Run a macOS Sequoia VM
lume run macos-sequoia-vanilla:latest

# Run an Ubuntu VM
lume run ubuntu-noble-vanilla:latest
```

<Callout>
  We provide [prebuilt VM images](../lume/prebuilt-images) in our [ghcr
  registry](https://github.com/orgs/trycua/packages).
</Callout>

### Create a Custom VM

```bash
# Create a new macOS VM
lume create my-macos-vm --cpu 4 --memory 8GB --disk-size 50GB

# Create a Linux VM
lume create my-linux-vm --os linux --cpu 2 --memory 4GB
```

<Callout title="Disk Space">
The actual disk space used by sparse images will be much lower than the logical size listed. You can resize VM disks after creation using `lume set <name> --disk-size <size>`.
</Callout>

## VM Management

### lume create &lt;name&gt;

Create a new macOS or Linux virtual machine.

**Options:**

- `--os <os>` - Operating system to install (macOS or linux, default: macOS)
- `--cpu <cores>` - Number of CPU cores (default: 4)
- `--memory <size>` - Memory size, e.g., 8GB (default: 4GB)
- `--disk-size <size>` - Disk size, e.g., 50GB (default: 40GB)
- `--display <res>` - Display resolution (default: 1024x768)
- `--ipsw <path>` - Path to IPSW file or 'latest' for macOS VMs
- `--storage <name>` - VM storage location to use

**Examples:**

```bash
# Create macOS VM with custom specs
lume create my-mac --cpu 6 --memory 16GB --disk-size 100GB

# Create Linux VM
lume create my-ubuntu --os linux --cpu 2 --memory 8GB

# Create macOS VM with latest IPSW
lume create my-sequoia --ipsw latest
```

### lume run &lt;name&gt;

Start and run a virtual machine.

**Options:**

- `--no-display` - Do not start the VNC client app
- `--shared-dir <dir>` - Share directory with VM (format: path[:ro|rw])
- `--mount <path>` - For Linux VMs only, attach a read-only disk image
- `--registry <url>` - Container registry URL (default: ghcr.io)
- `--organization <org>` - Organization to pull from (default: trycua)
- `--vnc-port <port>` - Port to use for the VNC server (default: 0 for auto-assign)
- `--recovery-mode <boolean>` - For macOS VMs only, start VM in recovery mode (default: false)
- `--storage <name>` - VM storage location to use

**Examples:**

```bash
# Run VM with shared directory
lume run my-vm --shared-dir /path/to/share:rw

# Run VM without display (headless)
lume run my-vm --no-display

# Run macOS VM in recovery mode
lume run my-mac --recovery-mode true
```

### lume stop &lt;name&gt;

Stop a running virtual machine.

**Options:**

- `--storage <name>` - VM storage location to use

### lume delete &lt;name&gt;

Delete a virtual machine and its associated files.

**Options:**

- `--force` - Force deletion without confirmation
- `--storage <name>` - VM storage location to use

### lume clone &lt;name&gt; &lt;new-name&gt;

Create a copy of an existing virtual machine.

**Options:**

- `--source-storage <name>` - Source VM storage location
- `--dest-storage <name>` - Destination VM storage location

## VM Information and Configuration

### lume ls

List all virtual machines and their status.

### lume get &lt;name&gt;

Get detailed information about a specific virtual machine.

**Options:**

- `-f, --format <format>` - Output format (json|text)
- `--storage <name>` - VM storage location to use

### lume set &lt;name&gt;

Modify virtual machine configuration.

**Options:**

- `--cpu <cores>` - New number of CPU cores (e.g., 4)
- `--memory <size>` - New memory size (e.g., 8192MB or 8GB)
- `--disk-size <size>` - New disk size (e.g., 40960MB or 40GB)
- `--display <res>` - New display resolution in format WIDTHxHEIGHT (e.g., 1024x768)
- `--storage <name>` - VM storage location to use

**Examples:**

```bash
# Increase VM memory
lume set my-vm --memory 16GB

# Change display resolution
lume set my-vm --display 1920x1080

# Add more CPU cores
lume set my-vm --cpu 8
```

## Image Management

### lume images

List available macOS images in local cache.

### lume pull &lt;image&gt;

Download a VM image from a container registry.

**Options:**

- `--registry <url>` - Container registry URL (default: ghcr.io)
- `--organization <org>` - Organization to pull from (default: trycua)
- `--storage <name>` - VM storage location to use

### lume push &lt;name&gt; &lt;image:tag&gt;

Upload a VM image to a container registry.

**Options:**

- `--additional-tags <tags...>` - Additional tags to push the same image to
- `--registry <url>` - Container registry URL (default: ghcr.io)
- `--organization <org>` - Organization/user to push to (default: trycua)
- `--storage <name>` - VM storage location to use
- `--chunk-size-mb <size>` - Chunk size for disk image upload in MB (default: 512)
- `--verbose` - Enable verbose logging
- `--dry-run` - Prepare files and show plan without uploading
- `--reassemble` - Verify integrity by reassembling chunks (requires --dry-run)

### lume ipsw

Get the latest macOS restore image URL.

### lume prune

Remove cached images to free up disk space.

## Configuration

### lume config

Manage Lume configuration settings.

**Subcommands:**

##### Storage Management

- `lume config storage add <name> <path>` - Add a new VM storage location
- `lume config storage remove <name>` - Remove a VM storage location
- `lume config storage list` - List all VM storage locations
- `lume config storage default <name>` - Set the default VM storage location

##### Cache Management

- `lume config cache get` - Get current cache directory
- `lume config cache set <path>` - Set cache directory

##### Image Caching

- `lume config caching get` - Show current caching status
- `lume config caching set <boolean>` - Enable or disable image caching

## API Server

### lume serve

Start the Lume API server for programmatic access.

**Options:**

- `--port <port>` - Port to listen on (default: 7777)

## Global Options

These options are available for all commands:

- `--help` - Show help information
- `--version` - Show version number

```

--------------------------------------------------------------------------------
/libs/lumier/src/lib/utils.sh:
--------------------------------------------------------------------------------

```bash
#!/usr/bin/env bash

# Function to wait for SSH to become available
# Function to wait for SSH to become available
# Repeatedly probes $host_ip with a no-op SSH session until it succeeds or
# the retry budget runs out.
wait_for_ssh() {
    local host_ip=$1
    local user=$2
    local password=$3
    local retry_interval=${4:-5}   # Seconds between attempts (default 5)
    local max_retries=${5:-20}     # Give up after this many attempts (0 = retry forever)

    # Only show waiting message in debug mode
    if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
        echo "Waiting for SSH to become available on $host_ip..."
    fi

    local retry_count=0
    while true; do
        # Probe with a throwaway "exit" session; -q plus the stderr redirect
        # keeps the attempt completely silent. Using the command directly as
        # the `if` condition avoids the separate $? check.
        if sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host_ip" "exit" 2>/dev/null; then
            echo "SSH is ready on $host_ip!"
            return 0
        fi

        ((retry_count++))

        # Stop once the retry budget is spent (unless it is unlimited).
        if [ "$max_retries" -ne 0 ] && [ "$retry_count" -ge "$max_retries" ]; then
            echo "Maximum retries reached. SSH is not available."
            return 1
        fi

        # Only show retry messages in debug mode
        if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
            echo "SSH not ready. Retrying in $retry_interval seconds... (Attempt $retry_count)"
        fi
        sleep "$retry_interval"
    done
}

# Function to execute a script on a remote server using sshpass
# Function to execute a script on a remote server using sshpass
# The on-logon script at $script_path is wrapped with a shebang and a few
# exported environment variables (VNC_PASSWORD, optional SHARED_FOLDER_PATH,
# VNC_DEBUG) and streamed to `bash -s` on the remote host via a here-document.
# Args: $1 host, $2 user, $3 ssh password, $4 script path, $5 vnc password,
#       $6 optional data folder. Returns non-zero if remote execution failed.
execute_remote_script() {
    local host="$1"
    local user="$2"
    local password="$3"
    local script_path="$4"
    local vnc_password="$5"
    local data_folder="$6"

    # Check if all required arguments are provided
    if [ -z "$host" ] || [ -z "$user" ] || [ -z "$password" ] || [ -z "$script_path" ] || [ -z "$vnc_password" ]; then
        echo "Usage: execute_remote_script <host> <user> <password> <script_path> <vnc_password> [data_folder]"
        return 1
    fi

    # Only show VNC info in debug mode
    if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
        echo "VNC password exported to VM: $vnc_password"
    fi

    # Set the shared folder path for the VM
    if [ -n "$data_folder" ]; then
        # VM always sees shared folders at this path, regardless of container path
        shared_folder_path="/Volumes/My Shared Files"

        # Only show path in debug mode
        if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
            echo "Data folder path in VM: $shared_folder_path"
        fi
    else
        shared_folder_path=""
    fi

    # Build the script with REAL newlines. The previous version appended the
    # two characters backslash-n ("\n"), which bash does NOT interpret inside
    # double quotes, so every export landed on the shebang (comment) line and
    # was silently ignored by the remote shell.
    local nl=$'\n'
    script_content="#!/usr/bin/env bash${nl}"
    # Always export VNC_PASSWORD
    script_content+="export VNC_PASSWORD='$vnc_password'${nl}"
    # Export SHARED_FOLDER_PATH only if we have a data folder path
    if [ -n "$shared_folder_path" ]; then
        script_content+="export SHARED_FOLDER_PATH='$shared_folder_path'${nl}"
    fi
    # Pass debug setting to the VM
    script_content+="export VNC_DEBUG='${LUMIER_DEBUG:-0}'${nl}"

    # Add debug messages only if debug mode is enabled
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        script_content+="echo \"[DEBUG] Starting on-logon script execution...\"${nl}"
    fi

    # Add the original script content
    script_content+="$(<"$script_path")"

    # Add debug messages only if debug mode is enabled
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        script_content+="${nl}echo \"[DEBUG] Finished executing on-logon script.\"${nl}"
    fi

    # Print debug info only when debug mode is enabled
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        echo "[DEBUG] Executing remote script with content length: $(echo -n "$script_content" | wc -c) bytes"
        echo "[DEBUG] Script path: $script_path"
    fi

    # Use a here-document to send the script content. Capture the exit status
    # IMMEDIATELY after sshpass: any command run in between (even an `if`
    # statement) overwrites $?, which previously made the failure check below
    # dead code.
    local ssh_status
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        echo "[DEBUG] Connecting to $user@$host to execute script..."
        sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host" "bash -s -- '$vnc_password' '$data_folder'" 2>&1 <<EOF
$script_content
EOF
        ssh_status=$?
    else
        # Otherwise run quietly
        sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host" "bash -s -- '$vnc_password' '$data_folder'" 2>/dev/null <<EOF
$script_content
EOF
        ssh_status=$?
    fi

    # Print completion message only in debug mode
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        echo "[DEBUG] Script execution completed."
    fi

    # Check the exit status of the sshpass command
    if [ $ssh_status -ne 0 ]; then
        echo "Failed to execute script on remote host $host."
        return 1
    fi
}

# Extract the value of a single top-level field from a JSON string.
#   $1 - field name, $2 - JSON text
# Prefers jq when available; otherwise falls back to grep/sed heuristics that
# try quoted string values first, then bare values (numbers, booleans, null).
# Prints the value to stdout (empty string when the field is not found).
# NOTE(review): the \s escape in grep/sed -E is a GNU extension — confirm
# portability if this ever runs against BSD userland tools.
extract_json_field() {
    local field_name=$1
    local input=$2
    local result=""
    
    # First attempt with jq if available (most reliable JSON parsing)
    if command -v jq &> /dev/null; then
        # Use jq for reliable JSON parsing; `// empty` maps null/missing to "".
        result=$(echo "$input" | jq -r ".$field_name // empty" 2>/dev/null)
        if [[ -n "$result" ]]; then
            echo "$result"
            return 0
        fi
    fi
    
    # Fallback to grep-based approach with improvements
    # First try for quoted string values (input flattened to one line first)
    result=$(echo "$input" | tr -d '\n' | grep -o "\"$field_name\"\s*:\s*\"[^\"]*\"" | sed -E 's/.*":\s*"(.*)"$/\1/')
    if [[ -n "$result" ]]; then
        echo "$result"
        return 0
    fi
    
    # Try for non-quoted values (numbers, true, false, null)
    result=$(echo "$input" | tr -d '\n' | grep -o "\"$field_name\"\s*:\s*[^,}]*" | sed -E 's/.*":\s*(.*)$/\1/')
    if [[ -n "$result" ]]; then
        echo "$result"
        return 0
    fi
    
    # Return empty string if field not found
    echo ""
}

# Read a JSON file and extract a single top-level field from its contents.
#   $1 - field name, $2 - path to JSON file
extract_json_field_from_file() {
    local field_name=$1
    local json_file=$2
    extract_json_field "$field_name" "$(<"$json_file")"
}

# Extract a single top-level field from JSON text held in a variable.
#   $1 - field name, $2 - JSON text
extract_json_field_from_text() {
    extract_json_field "$1" "$2"
}

```

--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/ss-pro.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
ScreenSpot-Pro Benchmark Script

Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
Supports both ComputerAgent model strings and custom model classes.
"""

import argparse
import asyncio
import random
import statistics
import time
from typing import Optional

from datasets import load_dataset
from tqdm import tqdm
from utils import (
    ModelWrapper,
    get_available_models,
    get_gpu_memory,
    is_click_in_bbox,
    save_results_to_markdown,
    save_visualizations,
)


async def evaluate_model(
    model_wrapper: ModelWrapper, dataset, max_samples: Optional[int] = None
) -> dict:
    """
    Evaluate a model on the ScreenSpot-Pro dataset.

    Args:
        model_wrapper: ModelWrapper instance
        dataset: ScreenSpot-Pro dataset (list of samples)
        max_samples: Maximum number of samples to evaluate (None for all)

    Returns:
        Dictionary with evaluation results (accuracy, failure rate, timing
        statistics, VRAM usage, and per-sample records)
    """
    print(f"\nEvaluating model: {model_wrapper.model_name}")

    # Load model
    await model_wrapper.load_model()

    total_samples = len(dataset)
    if max_samples is not None:
        total_samples = min(max_samples, total_samples)

    correct_predictions = 0
    error_predictions = 0
    results = []

    for i in tqdm(range(total_samples), desc=f"Evaluating {model_wrapper.model_name}"):
        sample = dataset[i]

        # Extract sample data
        image = sample["image"]
        instruction = sample["instruction"]
        bbox = sample["bbox"]  # [x1, y1, x2, y2]
        sample_id = sample["img_filename"]

        # Predict click coordinates with timing. Previously a crashing
        # prediction aborted the whole benchmark and the failure counters
        # below were dead code (never incremented); now errors are recorded
        # per sample and the run continues.
        start_time = time.time()
        try:
            click_coords = await model_wrapper.predict_click(image, instruction)
            failed = False
        except Exception as e:
            print(f"\nPrediction error on {sample_id}: {e}")
            click_coords = None
            failed = True
            error_predictions += 1
        prediction_time = time.time() - start_time

        # Check if prediction is correct (failed predictions are never correct)
        is_correct = (not failed) and is_click_in_bbox(click_coords, bbox)

        if is_correct:
            correct_predictions += 1

        results.append(
            {
                "id": sample_id,
                "instruction": instruction,
                "bbox": bbox,
                "predicted_coords": click_coords,
                "is_correct": is_correct,
                "failed": failed,
                "prediction_time": prediction_time,
            }
        )

    # Unload model
    await model_wrapper.unload_model()

    # Calculate metrics
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0
    error_rate = error_predictions / total_samples if total_samples > 0 else 0.0

    # Calculate timing statistics over successful predictions only
    successful_times = [r["prediction_time"] for r in results if not r["failed"]]
    avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
    median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
    min_prediction_time = min(successful_times) if successful_times else 0.0
    max_prediction_time = max(successful_times) if successful_times else 0.0

    # Get VRAM statistics
    vram_stats = model_wrapper.get_vram_stats()

    return {
        "model_name": model_wrapper.model_name,
        "total_samples": total_samples,
        "correct_predictions": correct_predictions,
        "failed_predictions": error_predictions,
        "accuracy": accuracy,
        "failure_rate": error_rate,
        "avg_prediction_time": avg_prediction_time,
        "median_prediction_time": median_prediction_time,
        "min_prediction_time": min_prediction_time,
        "max_prediction_time": max_prediction_time,
        "vram_max_mb": vram_stats["max_mb"],
        "vram_avg_mb": vram_stats["avg_mb"],
        "results": results,
    }


async def main():
    """Run the ScreenSpot-Pro benchmark over every available model.

    Parses ``--samples``/``--seed`` from the command line, loads and shuffles
    the dataset, evaluates each model returned by ``get_available_models()``,
    prints a per-model summary, and saves markdown results plus
    visualizations when at least one evaluation completed.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="ScreenSpot-Pro Benchmark Script")
    parser.add_argument(
        "--samples", type=int, default=300, help="Number of samples to evaluate (default: 300)"
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for shuffling (default: 42)"
    )
    args = parser.parse_args()

    # Set random seed so the shuffled sample order is reproducible across runs
    random.seed(args.seed)

    # Load dataset
    print("Loading ScreenSpot-Pro dataset...")
    ds = load_dataset("lmms-lab/ScreenSpot-Pro")
    dataset = ds["train"]  # type: ignore
    # Convert to list to support indexing
    dataset_list = list(dataset)
    print(f"Dataset loaded: {len(dataset_list)} samples")

    # Shuffle dataset with seed
    random.shuffle(dataset_list)
    print(f"Dataset shuffled with seed {args.seed}")

    # Get available models
    models = get_available_models()

    # Evaluation settings
    max_samples = args.samples  # Use command line argument

    # Run evaluations (sequentially, one model loaded at a time)
    all_results = []

    for model in models:
        model_wrapper = ModelWrapper(model)
        result = await evaluate_model(model_wrapper, dataset_list, max_samples)
        all_results.append(result)

        # Print summary
        print(f"\n{result['model_name']} Results:")
        print(f"  Accuracy: {result['accuracy']*100:.2f}%")
        print(f"  Correct: {result['correct_predictions']}/{result['total_samples']}")
        print(f"  Errors: {result['failed_predictions']}")
        print(f"  Error Rate: {result['failure_rate']*100:.2f}%")
        print(f"  Avg Time: {result['avg_prediction_time']:.2f}s")
        print(f"  Median Time: {result['median_prediction_time']:.2f}s")
        print(
            f"  Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s"
        )
        print(f"  VRAM Max: {result['vram_max_mb']:.1f}MB")
        print(f"  VRAM Avg: {result['vram_avg_mb']:.1f}MB")

        # Print GPU memory info
        gpu_memory = get_gpu_memory()
        if gpu_memory and gpu_memory[0] > 0:
            print(f"  GPU Free Memory: {gpu_memory[0]:.1f}MB")

    # Save results
    if all_results:
        save_results_to_markdown(all_results)
        save_visualizations(all_results, dataset_list)
        print("\nBenchmark completed successfully!")
    else:
        print("\nNo successful evaluations completed.")


if __name__ == "__main__":
    # Entry point: run the async benchmark loop.
    asyncio.run(main())

```

--------------------------------------------------------------------------------
/libs/lume/src/FileSystem/VMDirectory.swift:
--------------------------------------------------------------------------------

```swift
import Foundation

// MARK: - VMDirectory

/// Manages a virtual machine's directory structure and files
/// Responsible for:
/// - Managing VM configuration files
/// - Handling disk operations
/// - Managing VM state and locking
/// - Providing access to VM-related paths
struct VMDirectory: Sendable {
    // MARK: - Constants
    
    /// Well-known file names stored inside a VM directory.
    private enum FileNames {
        static let nvram = "nvram.bin"
        static let disk = "disk.img"
        static let config = "config.json"
        static let sessions = "sessions.json"
    }
    
    // MARK: - Properties
    
    /// Base directory holding all of this VM's files.
    let dir: Path
    /// Path to the NVRAM file inside the VM directory.
    let nvramPath: Path
    /// Path to the VM's disk image.
    let diskPath: Path
    /// Path to the VM configuration JSON.
    let configPath: Path
    /// Path to the persisted VNC session JSON.
    let sessionsPath: Path
    
    /// The name of the VM directory
    var name: String { dir.name }
    
    // MARK: - Initialization
    
    /// Creates a new VMDirectory instance
    /// - Parameters:
    ///   - dir: The base directory path for the VM
    init(_ dir: Path) {
        self.dir = dir
        self.nvramPath = dir.file(FileNames.nvram)
        self.diskPath = dir.file(FileNames.disk)
        self.configPath = dir.file(FileNames.config)
        self.sessionsPath = dir.file(FileNames.sessions)
    }
}

// MARK: - VM State Management

extension VMDirectory {
    /// Checks if the VM directory is fully initialized with all required files
    func initialized() -> Bool {
        // A VM counts as initialized only when the config, disk image and
        // NVRAM file are all present on disk.
        return configPath.exists() && diskPath.exists() && nvramPath.exists()
    }

    /// Checks if the VM directory exists
    func exists() -> Bool {
        dir.exists()
    }
}

// MARK: - Disk Management

extension VMDirectory {
    /// Resizes the VM's disk to the specified size
    /// - Parameter size: The new size in bytes
    /// - Throws: VMDirectoryError.fileCreationFailed if the disk file cannot
    ///   be created, or a FileHandle error if opening/truncating fails.
    ///   (Previously an empty `catch {}` silently swallowed every error —
    ///   including the explicitly thrown fileCreationFailed — contradicting
    ///   this documented contract.)
    func setDisk(_ size: UInt64) throws {
        // Create an empty disk image on first use.
        if !diskPath.exists() {
            guard FileManager.default.createFile(atPath: diskPath.path, contents: nil) else {
                throw VMDirectoryError.fileCreationFailed(diskPath.path)
            }
        }

        // Grow or shrink the (sparse) file to the requested logical size.
        let handle = try FileHandle(forWritingTo: diskPath.url)
        defer { try? handle.close() }

        try handle.truncate(atOffset: size)
    }
}

// MARK: - Configuration Management

extension VMDirectory {
    /// Saves the VM configuration to disk
    /// - Parameter config: The configuration to save
    /// - Throws: VMDirectoryError.invalidConfigData if encoding fails,
    ///   VMDirectoryError.fileCreationFailed if the file cannot be written
    func saveConfig(_ config: VMConfig) throws {
        let encoder = JSONEncoder()
        encoder.outputFormatting = .prettyPrinted

        let data: Data
        do {
            data = try encoder.encode(config)
        } catch {
            throw VMDirectoryError.invalidConfigData
        }

        // Write outside the do/catch so a write failure surfaces as
        // fileCreationFailed instead of being masked as invalidConfigData.
        guard FileManager.default.createFile(atPath: configPath.path, contents: data) else {
            throw VMDirectoryError.fileCreationFailed(configPath.path)
        }
    }

    /// Loads the VM configuration from disk
    /// - Returns: The loaded configuration
    /// - Throws: VMDirectoryError.configNotFound if the file is missing,
    ///   VMDirectoryError.invalidConfigData if it cannot be decoded
    func loadConfig() throws -> VMConfig {
        guard let data = FileManager.default.contents(atPath: configPath.path) else {
            throw VMDirectoryError.configNotFound
        }

        do {
            return try JSONDecoder().decode(VMConfig.self, from: data)
        } catch {
            throw VMDirectoryError.invalidConfigData
        }
    }
}

// MARK: - VNC Session Management

/// Persisted metadata describing an active VNC session for a VM.
struct VNCSession: Codable {
    /// URL clients use to connect to the VNC server.
    let url: String
    /// Host directories shared into the VM for this session, if any.
    let sharedDirectories: [SharedDirectory]?

    /// Creates a session record; `sharedDirectories` defaults to `nil`
    /// when the session has no shared directories.
    init(url: String, sharedDirectories: [SharedDirectory]? = nil) {
        self.sharedDirectories = sharedDirectories
        self.url = url
    }
}

extension VMDirectory {
    /// Saves VNC session information to disk as pretty-printed JSON
    /// - Parameter session: The VNC session to save (including any shared
    ///   directories it carries)
    /// - Throws: VMDirectoryError.invalidSessionData if encoding fails, or
    ///   VMDirectoryError.fileCreationFailed if the file cannot be written
    func saveSession(_ session: VNCSession) throws {
        let encoder = JSONEncoder()
        encoder.outputFormatting = .prettyPrinted

        // Encode first so only encoding failures map to invalidSessionData.
        // Previously the write guard sat inside the do/catch, so a
        // fileCreationFailed error was swallowed and misreported as
        // invalidSessionData.
        let data: Data
        do {
            data = try encoder.encode(session)
        } catch {
            throw VMDirectoryError.invalidSessionData
        }

        guard FileManager.default.createFile(atPath: sessionsPath.path, contents: data) else {
            throw VMDirectoryError.fileCreationFailed(sessionsPath.path)
        }
    }

    /// Loads the VNC session information from disk
    /// - Returns: The loaded VNC session
    /// - Throws: VMDirectoryError.sessionNotFound if the file is missing, or
    ///   VMDirectoryError.invalidSessionData if it cannot be decoded
    func loadSession() throws -> VNCSession {
        guard let data = FileManager.default.contents(atPath: sessionsPath.path) else {
            throw VMDirectoryError.sessionNotFound
        }

        do {
            return try JSONDecoder().decode(VNCSession.self, from: data)
        } catch {
            throw VMDirectoryError.invalidSessionData
        }
    }

    /// Removes the VNC session information from disk (best-effort; missing
    /// file is not an error)
    func clearSession() {
        try? FileManager.default.removeItem(atPath: sessionsPath.path)
    }
}

// MARK: - CustomStringConvertible
extension VMDirectory: CustomStringConvertible {
    /// Human-readable summary used in logs and debug output.
    var description: String {
        return "VMDirectory(path: \(dir.path))"
    }
}

extension VMDirectory {
    /// Permanently removes the VM directory and all of its contents from disk.
    /// - Throws: Any error from FileManager if the directory cannot be removed.
    func delete() throws {
        try FileManager.default.removeItem(atPath: dir.path)
    }
}

```

--------------------------------------------------------------------------------
/.github/workflows/npm-publish-cli.yml:
--------------------------------------------------------------------------------

```yaml
# Builds cua-cli binaries for every supported platform with Bun, publishes the
# package to npm, and attaches the binaries to a tagged GitHub release.
name: Publish @trycua/cli

# Manual trigger only; an explicit version overrides the one in package.json.
on:
  workflow_dispatch:
    inputs:
      version:
        description: "Version to publish (default: from package.json)"
        required: false
        default: ""

jobs:
  # Cross-compiles one standalone binary per target; all builds run on
  # ubuntu-latest since Bun cross-compiles via --target.
  build-and-publish:
    permissions:
      id-token: write
      contents: write
      packages: write

    strategy:
      matrix:
        include:
          - target: bun-linux-x64
            ext: ""
            binary_name: cua-linux-x64
          - target: bun-darwin-x64
            ext: ""
            binary_name: cua-darwin-x64
          - target: bun-darwin-arm64
            ext: ""
            binary_name: cua-darwin-arm64
          - target: bun-windows-x64
            ext: ".exe"
            binary_name: cua-windows-x64

    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      # Prefer the manually supplied version; otherwise read it from package.json.
      - name: Get version
        id: version
        run: |
          if [ -n "${{ github.event.inputs.version }}" ]; then
            echo "version=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT
          else
            VERSION=$(bun -p "require('./libs/typescript/cua-cli/package.json').version")
            echo "version=${VERSION}" >> $GITHUB_OUTPUT
          fi

      - name: Install dependencies
        working-directory: ./libs/typescript/cua-cli
        run: bun install --frozen-lockfile

      # The trailing glob in `mv` also moves sourcemap sidecar files.
      - name: Build binary
        working-directory: ./libs/typescript/cua-cli
        run: |
          bun build --compile --minify --sourcemap --target=${{ matrix.target }} index.ts --outfile ${{ matrix.binary_name }}${{ matrix.ext }}
          mkdir -p ../../../dist
          mv ${{ matrix.binary_name }}${{ matrix.ext }}* ../../../dist/

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: cua-binary-${{ matrix.target }}
          path: dist/
          if-no-files-found: error
          retention-days: 1

  # Publishes the npm package once every platform build has succeeded.
  # Only runs from main or a cua-v* tag.
  publish-npm:
    needs: build-and-publish
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/cua-v')
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      - name: Install dependencies
        working-directory: ./libs/typescript/cua-cli
        run: bun install --frozen-lockfile

      # --tolerate-republish makes re-running the workflow for an
      # already-published version a no-op instead of a failure.
      - name: Publish to npm
        working-directory: ./libs/typescript/cua-cli
        env:
          NPM_CONFIG_TOKEN: ${{ secrets.NPM_TOKEN }}
        run: bun publish --production --access public --tolerate-republish

  # Creates the GitHub release and attaches the four platform binaries.
  # NOTE(review): actions/create-release@v1 and actions/upload-release-asset@v1
  # are archived/unmaintained; consider migrating to softprops/action-gh-release.
  create-release:
    needs: [build-and-publish, publish-npm]
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      # NOTE(review): this re-reads package.json and ignores the
      # workflow_dispatch version input used by the build job — confirm intended.
      - name: Get version
        id: version
        run: |
          VERSION=$(bun -p "require('./libs/typescript/cua-cli/package.json').version")
          echo "version=${VERSION}" >> $GITHUB_OUTPUT
          echo "tag=cua-v${VERSION}" >> $GITHUB_OUTPUT

      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: dist
          merge-multiple: true

      - name: Create Release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ steps.version.outputs.tag }}
          release_name: cua-cli v${{ steps.version.outputs.version }}
          body: |
            # cua-cli v${{ steps.version.outputs.version }}

            ## Installation

            ### Using install script (recommended)
            ```bash
            # For Linux/macOS
            curl -fsSL https://cua.ai/cli/install.sh | sh

            # For Windows (PowerShell)
            irm https://cua.ai/cli/install.ps1 | iex
            ```

            ### Using npm/bun
            ```bash
            # Using bun
            bun add -g @trycua/cli

            # Or using npm
            npm install -g @trycua/cli
            ```

            ### From source
            ```bash
            git clone -b ${{ steps.version.outputs.tag }} https://github.com/trycua/cua.git
            cd cua/libs/typescript/cua-cli
            bun install
            bun link
            bun link cua-cli
            ```

            ## Release Assets
            - `cua-darwin-arm64`: macOS (Apple Silicon)
            - `cua-darwin-x64`: macOS (Intel)
            - `cua-linux-x64`: Linux (x86_64)
            - `cua-windows-x64.exe`: Windows (x86_64)
          draft: false
          prerelease: false

      - name: Upload Linux Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-linux-x64
          asset_name: cua-linux-x64
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload macOS Intel Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-darwin-x64
          asset_name: cua-darwin-x64
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload macOS Apple Silicon Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-darwin-arm64
          asset_name: cua-darwin-arm64
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload Windows Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-windows-x64.exe
          asset_name: cua-windows-x64.exe
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
```

--------------------------------------------------------------------------------
/libs/xfce/Dockerfile:
--------------------------------------------------------------------------------

```dockerfile
# CUA Docker XFCE Container
# Vanilla XFCE desktop with noVNC and computer-server

FROM ubuntu:22.04

# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive

# Set environment variables
ENV HOME=/home/cua
ENV DISPLAY=:1
ENV VNC_PORT=5901
ENV NOVNC_PORT=6901
ENV API_PORT=8000
ENV VNC_RESOLUTION=1024x768
ENV VNC_COL_DEPTH=24

# Install system dependencies first (including sudo)
RUN apt-get update && apt-get install -y \
    # System utilities
    sudo \
    unzip \
    zip \
    xdg-utils \
    gcc \
    # Qt/XCB runtime deps for PyQt5 (libqxcb.so)
    libxcb-icccm4 \
    libxcb-image0 \
    libxcb-keysyms1 \
    libxcb-render-util0 \
    libxcb-xinerama0 \
    libxcb-shape0 \
    libxcb-randr0 \
    libxcb-xfixes0 \
    libxcb-sync1 \
    libxcb-util1 \
    libxcb-cursor0 \
    libxkbcommon-x11-0 \
    # Desktop environment
    xfce4 \
    xfce4-terminal \
    dbus-x11 \
    # VNC server
    tigervnc-standalone-server \
    tigervnc-common \
    # noVNC dependencies
    # python will be installed via deadsnakes as 3.12 \
    git \
    net-tools \
    netcat \
    supervisor \
    # Computer-server dependencies
    # python-tk/dev for 3.12 will be installed later \
    gnome-screenshot \
    wmctrl \
    ffmpeg \
    socat \
    xclip \
    # Browser
    wget \
    software-properties-common \
    # Build tools
    build-essential \
    libncursesw5-dev \
    libssl-dev \
    libsqlite3-dev \
    tk-dev \
    libgl1-mesa-dev \
    libgdbm-dev \
    libc6-dev \
    libbz2-dev \
    libffi-dev \
    zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python 3.12 from deadsnakes (keep system python3 for apt)
RUN add-apt-repository -y ppa:deadsnakes/ppa && \
    apt-get update && apt-get install -y \
    python3.12 python3.12-venv python3.12-dev python3.12-tk \
    && \
    python3.12 -m ensurepip --upgrade && \
    python3.12 -m pip install --upgrade pip setuptools wheel && \
    rm -rf /var/lib/apt/lists/*

# Ensure 'python' points to Python 3.12
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 2

# Remove screensavers and power manager to avoid popups and lock screens
RUN apt-get remove -y \
    xfce4-power-manager \
    xfce4-power-manager-data \
    xfce4-power-manager-plugins \
    xfce4-screensaver \
    light-locker \
    xscreensaver \
    xscreensaver-data || true

# Create user after sudo is installed
RUN useradd -m -s /bin/bash -G sudo cua && \
    echo "cua:cua" | chpasswd && \
    echo "cua ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Install Firefox from Mozilla PPA (snap-free) - inline to avoid script issues
RUN apt-get update && \
    add-apt-repository -y ppa:mozillateam/ppa && \
    echo 'Package: *\nPin: release o=LP-PPA-mozillateam\nPin-Priority: 1001' > /etc/apt/preferences.d/mozilla-firefox && \
    apt-get update && \
    apt-get install -y firefox && \
    echo 'pref("datareporting.policy.firstRunURL", "");\npref("datareporting.policy.dataSubmissionEnabled", false);\npref("datareporting.healthreport.service.enabled", false);\npref("datareporting.healthreport.uploadEnabled", false);\npref("trailhead.firstrun.branches", "nofirstrun-empty");\npref("browser.aboutwelcome.enabled", false);' > /usr/lib/firefox/browser/defaults/preferences/firefox.js && \
    update-alternatives --install /usr/bin/x-www-browser x-www-browser /usr/bin/firefox 100 && \
    update-alternatives --install /usr/bin/gnome-www-browser gnome-www-browser /usr/bin/firefox 100 && \
    rm -rf /var/lib/apt/lists/*

# Install noVNC
RUN git clone https://github.com/novnc/noVNC.git /opt/noVNC && \
    git clone https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html

# Pre-create cache directory with correct ownership before pip install
RUN mkdir -p /home/cua/.cache && \
    chown -R cua:cua /home/cua/.cache

# Install computer-server using Python 3.12 pip
RUN python3.12 -m pip install cua-computer-server

# Install GTK and WebKit dependencies for pywebview
RUN apt-get update && apt-get install -y \
    python3-gi \
    python3-gi-cairo \
    gir1.2-gtk-3.0 \
    gir1.2-webkit2-4.1 \
    libgirepository1.0-dev \
    libcairo2-dev \
    pkg-config \
    gobject-introspection \
    && rm -rf /var/lib/apt/lists/*

# Install pywebview with GTK backend, used by cua-bench for web UIs
RUN python3.12 -m pip install "pywebview[gtk]"
# Quote the requirement spec: an unquoted ">=" is interpreted by the shell as
# an output redirection, which previously created a stray file named "=0.7.0"
# and installed an unpinned cua-bench-ui.
RUN python3.12 -m pip install "cua-bench-ui>=0.7.0" --no-cache-dir

# Install playwright and Firefox dependencies
RUN python3.12 -m pip install playwright && \
    python3.12 -m playwright install --with-deps firefox

# Fix any cache files created by pip
RUN chown -R cua:cua /home/cua/.cache

# Copy startup scripts
COPY src/supervisor/ /etc/supervisor/conf.d/
COPY src/scripts/ /usr/local/bin/

# Make scripts executable
RUN chmod +x /usr/local/bin/*.sh

# Setup VNC
RUN chown -R cua:cua /home/cua
USER cua
WORKDIR /home/cua

# Create VNC directory (no password needed with SecurityTypes None)
RUN mkdir -p $HOME/.vnc

# Configure XFCE for first start
RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart

# Copy XFCE config to disable browser launching and welcome screens
COPY --chown=cua:cua src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc
COPY --chown=cua:cua src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml
COPY --chown=cua:cua src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml

# Disable autostart for screensaver, lock screen, and power manager
RUN echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \
    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-screensaver.desktop && \
    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/light-locker.desktop && \
    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-power-manager.desktop && \
    chown -R cua:cua $HOME/.config

# Create storage and shared directories, and Firefox cache directory
RUN mkdir -p $HOME/storage $HOME/shared $HOME/.cache/dconf $HOME/.mozilla/firefox && \
    chown -R cua:cua $HOME/storage $HOME/shared $HOME/.cache $HOME/.mozilla $HOME/.vnc

USER root

# Expose ports
EXPOSE $VNC_PORT $NOVNC_PORT $API_PORT

# Start services via supervisor
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/callbacks/operator_validator.py:
--------------------------------------------------------------------------------

```python
"""
OperatorNormalizerCallback

Ensures agent output actions conform to expected schemas by fixing common issues:
- click: add default button='left' if missing
- keypress: wrap keys string into a list
- etc.

This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
"""

from __future__ import annotations

from typing import Any, Dict, List

from .base import AsyncCallbackHandler


class OperatorNormalizerCallback(AsyncCallbackHandler):
    """Normalizes common computer call hallucinations / errors in computer call syntax."""

    # Allowed keys for each normalized action type; any other keys are
    # stripped from the action. Hoisted to class level so the table is not
    # rebuilt on every loop iteration.
    _ALLOWED_KEYS_BY_TYPE: Dict[str, List[str]] = {
        # OpenAI actions
        "click": ["type", "button", "x", "y"],
        "double_click": ["type", "x", "y"],
        "drag": ["type", "path"],
        "keypress": ["type", "keys"],
        "move": ["type", "x", "y"],
        "screenshot": ["type"],
        "scroll": ["type", "scroll_x", "scroll_y", "x", "y"],
        "type": ["type", "text"],
        "wait": ["type"],
        # Anthropic actions
        "left_mouse_down": ["type", "x", "y"],
        "left_mouse_up": ["type", "x", "y"],
        "triple_click": ["type", "button", "x", "y"],
    }

    @staticmethod
    def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]) -> None:
        """Delete every key on ``action`` that is not listed in ``keys_to_keep``."""
        for key in list(action.keys()):
            if key not in keys_to_keep:
                del action[key]

    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Fix common schema violations in ``computer_call`` actions.

        Mutates ``output`` in place (as documented for this callback) and
        returns the same list for chaining.

        Args:
            output: AgentMessage[] items as dicts, as produced by the LLM.

        Returns:
            The same list, with each computer_call action normalized.
        """
        for item in output or []:
            if item.get("type") != "computer_call":
                continue
            action = item.get("action")
            if not isinstance(action, dict):
                continue

            # Rename mouse click actions (e.g. "left_click") to "click".
            for mouse_btn in ("left", "right", "wheel", "back", "forward"):
                if action.get("type", "") == f"{mouse_btn}_click":
                    action["type"] = "click"
                    action["button"] = mouse_btn
            # Rename hotkey aliases to "keypress".
            for alias in ("hotkey", "key", "press", "key_press"):
                if action.get("type", "") == alias:
                    action["type"] = "keypress"
            # Infer a missing "type" from whichever telltale keys are present.
            if "button" in action and "type" not in action:
                action["type"] = "click"
            if "click" in action and "type" not in action:
                action["type"] = "click"
            if ("scroll_x" in action or "scroll_y" in action) and "type" not in action:
                action["type"] = "scroll"
            if "text" in action and "type" not in action:
                action["type"] = "type"

            action_type = action.get("type")

            # Rename "coordinate" pair to separate "x" / "y" keys.
            if "coordinate" in action:
                action["x"] = action["coordinate"][0]
                action["y"] = action["coordinate"][1]
                del action["coordinate"]
            if action_type == "click":
                # Accept "click" as an alias for "button".
                if "button" not in action and "click" in action:
                    action["button"] = action["click"]
                    del action["click"]
                # Default button to "left".
                action["button"] = action.get("button", "left")
            # Add default scroll deltas if missing.
            if action_type == "scroll":
                action["scroll_x"] = action.get("scroll_x", 0)
                action["scroll_y"] = action.get("scroll_y", 0)
            # Normalize key aliases first, then ensure "keys" is a list.
            if action_type == "keypress":
                for keys_alias in ("keypress", "key", "press", "key_press", "text"):
                    if keys_alias in action:
                        action["keys"] = action[keys_alias]
                        del action[keys_alias]
                keys = action.get("keys")
                if isinstance(keys, str):
                    # Split combos like "ctrl+c" / "ctrl-c" into parts; keep
                    # single characters (e.g. a literal "-") intact.
                    action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]

            allowed = self._ALLOWED_KEYS_BY_TYPE.get(action_type or "")
            if allowed:
                self._keep_keys(action, allowed)

        return output

```

--------------------------------------------------------------------------------
/scripts/install-cli.ps1:
--------------------------------------------------------------------------------

```
# CUA CLI Installation Script for Windows
$ErrorActionPreference = "Stop"

function Write-CuaVersionFile([string]$Version) {
    # Records the installed CLI version under ~/.cua/bin so later runs can
    # report which version was installed.
    $installDir = "$env:USERPROFILE\.cua\bin"
    if (-not (Test-Path $installDir)) { New-Item -ItemType Directory -Path $installDir -Force | Out-Null }
    Set-Content -Path (Join-Path $installDir ".version") -Value $Version -NoNewline
}

function Get-CuaRegistryVersion {
    # Best-effort lookup of the published version on the npm registry.
    # NOTE(review): this reports the latest registry version, which may not be
    # the exact version the package manager just installed — confirm acceptable.
    try {
        $v = (npm view @trycua/cli version) 2>$null
        if (-not $v) { $v = "unknown" }
        return $v
    } catch {
        return "unknown"
    }
}

function Install-WithBun {
    Write-Host "Installing CUA CLI using Bun..." -ForegroundColor Yellow

    # Check if bun is already installed
    if (-not (Get-Command bun -ErrorAction SilentlyContinue)) {
        Write-Host "Installing Bun..." -ForegroundColor Yellow
        try {
            powershell -c "irm bun.sh/install.ps1|iex"

            # Refresh environment variables
            $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")

            # Add bun to PATH for this session if not already there
            $bunPath = "$env:USERPROFILE\.bun\bin"
            if ($env:Path -notlike "*$bunPath*") {
                $env:Path = "$bunPath;$env:Path"
            }
        } catch {
            Write-Host "Error: Failed to install Bun. Please install manually from https://bun.sh" -ForegroundColor Red
            return $false
        }
    }

    # Verify bun installation
    if (-not (Get-Command bun -ErrorAction SilentlyContinue)) {
        Write-Host "Error: Bun installation failed. Please install manually from https://bun.sh" -ForegroundColor Red
        return $false
    }

    try {
        bun add -g @trycua/cli
        # Native commands do not throw on failure even with
        # $ErrorActionPreference = "Stop"; check the exit code explicitly so
        # the npm fallback below actually triggers when bun fails.
        if ($LASTEXITCODE -ne 0) { throw "bun add exited with code $LASTEXITCODE" }
        Write-CuaVersionFile (Get-CuaRegistryVersion)
        return $true
    } catch {
        Write-Host "Warning: Failed to install with Bun, trying npm..." -ForegroundColor Yellow
        try {
            npm install -g @trycua/cli
            # Same as above: npm is a native command, so verify its exit code.
            if ($LASTEXITCODE -ne 0) { throw "npm install exited with code $LASTEXITCODE" }
            Write-CuaVersionFile (Get-CuaRegistryVersion)
            return $true
        } catch {
            Write-Host "Error: Installation failed with npm as well." -ForegroundColor Red
            return $false
        }
    }
}

Write-Host "Installing CUA CLI..." -ForegroundColor Green

# Determine if this is a 64-bit system
# Pre-built binaries are x64-only; 32-bit hosts fall back to a Bun install.
$is64Bit = [Environment]::Is64BitOperatingSystem
if (-not $is64Bit) {
    Write-Host "Warning: 32-bit Windows is not supported. Falling back to Bun installation..." -ForegroundColor Yellow
    if (Install-WithBun) {
        exit 0
    } else {
        Write-Host "Error: Installation failed. Please try installing manually:" -ForegroundColor Red
        Write-Host "   irm https://cua.ai/install.ps1 | iex"
        exit 1
    }
}

# Get the latest release version
# Any failure here (network, rate limit, missing asset) falls back to Bun.
try {
    $release = Invoke-RestMethod -Uri "https://api.github.com/repos/trycua/cua/releases/latest" -ErrorAction Stop
    # Release tags look like "cua-v1.2.3"; strip the prefix to get the version.
    $version = $release.tag_name -replace '^cua-v', ''
    # Look for the windows binary in the release assets
    $windowsAsset = $release.assets | Where-Object { $_.name -eq 'cua-windows-x64.exe' }
    
    if (-not $windowsAsset) {
        throw "Windows binary not found in release assets"
    }
    
    $binaryUrl = $windowsAsset.browser_download_url
} catch {
    Write-Host "Warning: Could not fetch latest release, falling back to Bun installation" -ForegroundColor Yellow
    if (Install-WithBun) {
        exit 0
    } else {
        Write-Host "Error: Installation failed. Please try installing manually:" -ForegroundColor Red
        Write-Host "   irm https://cua.ai/install.ps1 | iex"
        exit 1
    }
}

# Create installation directory
$installDir = "$env:USERPROFILE\.cua\bin"
if (-not (Test-Path $installDir)) {
    New-Item -ItemType Directory -Path $installDir -Force | Out-Null
}

$binaryPath = Join-Path $installDir "cua.exe"

# Download the binary
Write-Host "Downloading CUA CLI $version for Windows x64..." -ForegroundColor Cyan
try {
    Invoke-WebRequest -Uri $binaryUrl -OutFile $binaryPath -ErrorAction Stop
} catch {
    Write-Host "Warning: Failed to download pre-built binary, falling back to Bun installation" -ForegroundColor Yellow
    if (Install-WithBun) {
        exit 0
    } else {
        Write-Host "Error: Installation failed. Please try installing manually:" -ForegroundColor Red
        Write-Host "   irm https://cua.ai/install.ps1 | iex"
        exit 1
    }
}

# Write version file for binary install
try {
    Set-Content -Path (Join-Path $installDir ".version") -Value $version -NoNewline
} catch {
    # Non-fatal
}

# Add to PATH if not already there
# Updates both the persistent (User) PATH and the current session's PATH.
$currentPath = [Environment]::GetEnvironmentVariable("Path", "User")
if ($currentPath -notlike "*$installDir*") {
    [Environment]::SetEnvironmentVariable("Path", "$currentPath;$installDir", "User")
    $env:Path = "$env:Path;$installDir"
    Write-Host "Success: Added $installDir to your PATH" -ForegroundColor Green
}

# Verify installation
if (Test-Path $binaryPath) {
    Write-Host "Success: CUA CLI $version installed successfully to $binaryPath" -ForegroundColor Green
    Write-Host ""
    Write-Host "Get started with:" -ForegroundColor Cyan
    Write-Host "   cua login"
    Write-Host "   cua create --os linux --configuration small --region north-america"
    Write-Host ""
    Write-Host "For more help, visit: https://docs.cua.ai/libraries/cua-cli" -ForegroundColor Cyan
    
    # Offer to add to PATH if not already there
    # NOTE(review): $env:Path was already amended above when the dir was added,
    # so this branch appears unreachable in that flow — confirm intended.
    if (-not ($env:Path -like "*$installDir*")) {
        Write-Host ""
        Write-Host "Note: Please restart your terminal or run the following command to use CUA CLI:" -ForegroundColor Yellow
        Write-Host "   `$env:Path += ';$installDir'"
    }
} else {
    Write-Host "Error: Installation failed. Please try installing manually:" -ForegroundColor Red
    Write-Host "   irm https://cua.ai/install.ps1 | iex"
    exit 1
}
```

--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------

```json
{
    "configurations": [
        {
            "name": "Agent UI",
            "type": "debugpy",
            "request": "launch",
            "program": "examples/agent_ui_examples.py",
            "console": "integratedTerminal",
            "justMyCode": false,
            "python": "${workspaceFolder:cua-root}/.venv/bin/python",
            "cwd": "${workspaceFolder:cua-root}",
            "env": {
                "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
            }
        },
        {
            "name": "Computer UI",
            "type": "debugpy",
            "request": "launch",
            "program": "examples/computer_ui_examples.py",
            "console": "integratedTerminal",
            "justMyCode": false,
            "python": "${workspaceFolder:cua-root}/.venv/bin/python",
            "cwd": "${workspaceFolder:cua-root}",
            "env": {
                "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
            }
        },
        {
            "name": "Run Computer Examples",
            "type": "debugpy",
            "request": "launch",
            "program": "examples/computer_examples.py",
            "console": "integratedTerminal",
            "justMyCode": true,
            "python": "${workspaceFolder:cua-root}/.venv/bin/python",
            "cwd": "${workspaceFolder:cua-root}",
            "env": {
                "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
            }
        },
        {
            "name": "Run Agent Examples",
            "type": "debugpy",
            "request": "launch",
            "program": "examples/agent_examples.py",
            "console": "integratedTerminal",
            "justMyCode": false,
            "python": "${workspaceFolder:cua-root}/.venv/bin/python",
            "cwd": "${workspaceFolder:cua-root}",
            "env": {
                "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
            }
        },
        {
            "name": "SOM: Run Experiments (No OCR)",
            "type": "debugpy",
            "request": "launch",
            "program": "examples/som_examples.py",
            "args": [
                "examples/test_data",
                "--output-dir",
                "examples/output",
                "--ocr",
                "none",
                "--mode",
                "experiment"
            ],
            "console": "integratedTerminal",
            "justMyCode": false,
            "python": "${workspaceFolder:cua-root}/.venv/bin/python",
            "cwd": "${workspaceFolder:cua-root}",
            "env": {
                "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
            }
        },
        {
            "name": "SOM: Run Experiments (EasyOCR)",
            "type": "debugpy",
            "request": "launch",
            "program": "examples/som_examples.py",
            "args": [
                "examples/test_data",
                "--output-dir",
                "examples/output",
                "--ocr",
                "easyocr",
                "--mode",
                "experiment"
            ],
            "console": "integratedTerminal",
            "justMyCode": false,
            "python": "${workspaceFolder:cua-root}/.venv/bin/python",
            "cwd": "${workspaceFolder:cua-root}",
            "env": {
                "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
            }
        },
        {
            "name": "Run Computer Server",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder:cua-root}/libs/python/computer-server/run_server.py",
            "console": "integratedTerminal",
            "justMyCode": true,
            "python": "${workspaceFolder:cua-root}/.venv/bin/python",
            "cwd": "${workspaceFolder:cua-root}",
            "env": {
                "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
            }
        },
        {
            "name": "Run Computer Server with Args",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder:cua-root}/libs/python/computer-server/run_server.py",
            "args": [
                "--host",
                "0.0.0.0",
                "--port",
                "8000",
                "--log-level",
                "debug"
            ],
            "console": "integratedTerminal",
            "justMyCode": false,
            "python": "${workspaceFolder:cua-root}/.venv/bin/python",
            "cwd": "${workspaceFolder:cua-root}",
            "env": {
                "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer-server"
            }
        },
        {
            "type": "lldb",
            "request": "launch",
            "args": [],
            "cwd": "${workspaceFolder:cua-root}/libs/lume",
            "name": "Debug lume (libs/lume)",
            "program": "${workspaceFolder:cua-root}/libs/lume/.build/debug/lume",
            "preLaunchTask": "swift: Build Debug lume (libs/lume)"
        },
        {
            "type": "lldb",
            "request": "launch",
            "args": [],
            "cwd": "${workspaceFolder:cua-root}/libs/lume",
            "name": "Release lume (libs/lume)",
            "program": "${workspaceFolder:cua-root}/libs/lume/.build/release/lume",
            "preLaunchTask": "swift: Build Release lume (libs/lume)"
        }
    ]
}
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/internvl.py:
--------------------------------------------------------------------------------

```python
"""
InternVL agent loop implementation for click prediction using litellm.acompletion.

Implements the ScreenSpot InternVL grounding baseline behavior:
- Uses the exact grounding prompt format with <image> and <ref> tags
- Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]]
- Converts to pixel coordinates relative to the original screenshot size

Note: We do NOT manually load the InternVL model; acompletions (via HuggingFaceLocalAdapter)
will handle loading based on the provided model name.
"""

from __future__ import annotations

import base64
import math
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple

import litellm
from PIL import Image

from ..decorators import register_agent
from ..types import AgentCapability
from .composed_grounded import ComposedGroundedConfig

# Regex patterns for extracting coordinates
# Accept optional whitespace and optional decimal fractions
_NUM = r"(\d+(?:\.\d+)?)"
_POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
_BBOX_PATTERN = re.compile(
    r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
)


def _extract_first_point(text: str) -> Optional[Tuple[float, float]]:
    """Extract the first [[x,y]] as normalized (0-1000) floats."""
    m = _POINT_PATTERN.search(text)
    if not m:
        return None
    try:
        x = float(m.group(1))
        y = float(m.group(2))
        return x, y
    except Exception:
        return None


def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]:
    """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats."""
    matches = list(_BBOX_PATTERN.finditer(text))
    if not matches:
        return None
    m = matches[-1]
    try:
        x1 = float(m.group(1))
        y1 = float(m.group(2))
        x2 = float(m.group(3))
        y2 = float(m.group(4))
        return x1, y1, x2, y2
    except Exception:
        return None


def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]:
    """Scale 0-1000 normalized coordinates to pixel coordinates for given image size."""
    x_px = int(math.floor((x_norm / 1000.0) * width))
    y_px = int(math.floor((y_norm / 1000.0) * height))
    # Clamp to image bounds just in case
    x_px = max(0, min(width - 1, x_px))
    y_px = max(0, min(height - 1, y_px))
    return x_px, y_px


@register_agent(models=r"(?i).*InternVL.*")
class InternVLConfig(ComposedGroundedConfig):
    """Agent configuration for InternVL models.

    Steps are delegated to ComposedGroundedConfig (with the model composed
    with itself); predict_click implements the ScreenSpot InternVL grounding
    baseline: 0-1000 normalized coordinates parsed from [[x,y]] or
    [[x1,y1,x2,y2]] output and scaled to screenshot pixels.
    """

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Fallback to a self-composed model"""
        composed_model = f"{model}+{model}"
        return await super().predict_step(
            messages=messages,
            model=composed_model,
            tools=tools,
            max_retries=max_retries,
            stream=stream,
            computer_handler=computer_handler,
            _on_api_start=_on_api_start,
            _on_api_end=_on_api_end,
            _on_usage=_on_usage,
            _on_screenshot=_on_screenshot,
            **kwargs,
        )

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """Predict click coordinates for *instruction* on the given screenshot.

        Sends the baseline grounding prompt (with the image attached) through
        litellm.acompletion, parses a [[x,y]] point or falls back to the
        center of a [[x1,y1,x2,y2]] bbox, then converts the 0-1000 normalized
        result to pixel coordinates of the original screenshot. Returns None
        when no coordinates can be parsed.
        """
        # Determine the screenshot dimensions so normalized output can be scaled.
        try:
            raw = base64.b64decode(image_b64)
            screenshot = Image.open(BytesIO(raw))
            width, height = screenshot.size
        except Exception:
            # Fall back to a common screen size instead of crashing on a bad image.
            width, height = 1920, 1080

        # Grounding prompt, formatted exactly like the ScreenSpot baseline.
        grounding_prompt = (
            f"Please provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. "
            f"Answer in the format of [[x1, y1, x2, y2]]"
        )

        request_messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                    {"type": "text", "text": grounding_prompt},
                ],
            }
        ]

        # acompletion (via HuggingFaceLocalAdapter/model handler) loads InternVL itself.
        response = await litellm.acompletion(
            model=model,
            messages=request_messages,
            # Deterministic, conservative generation akin to the baseline.
            max_tokens=kwargs.get("max_tokens", 256),
            temperature=kwargs.get("temperature", 0.0),
        )
        content = (response.choices[0].message.content or "").strip()  # type: ignore

        # Prefer an explicit [[x, y]] point; otherwise use the bbox center.
        parsed = _extract_first_point(content)
        if parsed is None:
            box = _extract_last_bbox(content)
            if box is None:
                return None
            left, top, right, bottom = box
            parsed = ((left + right) / 2.0, (top + bottom) / 2.0)

        return _scale_norm_to_pixels(parsed[0], parsed[1], width, height)

    def get_capabilities(self) -> List[AgentCapability]:
        return ["click", "step"]

```

--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/interactive.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
Interactive Click Prediction Tool

Takes screenshots and allows testing multiple models interactively.
Models are loaded/unloaded one at a time to avoid memory issues.
"""

import asyncio
import os
from datetime import datetime
from typing import Any, Dict, List

from utils import (
    ModelWrapper,
    get_available_models,
    save_prediction_visualization,
    take_screenshot,
)


async def predict_with_all_models(image, instruction: str, models) -> List[Dict[str, Any]]:
    """
    Run click prediction for every model, one at a time.

    Each model is loaded, queried, then unloaded before the next one so only
    a single model occupies memory at any moment.

    Args:
        image: PIL Image to analyze
        instruction: Instruction text
        models: List of model instances

    Returns:
        List of prediction results
    """
    results: List[Dict[str, Any]] = []

    for candidate in models:
        wrapper = ModelWrapper(candidate)
        name = wrapper.model_name
        print(f"\n🔄 Loading {name}...")

        try:
            await wrapper.load_model()
            coords = await wrapper.predict_click(image, instruction)
            results.append({"model_name": name, "coords": coords, "error": None})
            if coords:
                print(f"✅ {name}: ({coords[0]}, {coords[1]})")
            else:
                print(f"❌ {name}: No prediction")
        except Exception as e:
            print(f"❌ {name}: ERROR - {str(e)}")
            results.append({"model_name": name, "coords": None, "error": str(e)})
        finally:
            # Always release the model's memory, even after a failure.
            try:
                await wrapper.unload_model()
                print(f"🗑️  Unloaded {name}")
            except Exception as e:
                print(f"⚠️  Error unloading {name}: {e}")

    return results


def print_header():
    """Print the interactive tool header."""
    banner = "=" * 60
    lines = [
        banner,
        "🖱️  Interactive Click Prediction Tool",
        banner,
        "Commands:",
        "  • Type an instruction to test models on last screenshot",
        "  • 'screenshot' - Take a new screenshot",
        "  • 'models' - List available models",
        "  • 'quit' or 'exit' - Exit the tool",
        banner,
        "💡 Tip: Take a screenshot first, then send instructions to test models!",
    ]
    print("\n".join(lines))


def print_models(models):
    """Print the numbered list of available models."""
    print("\n📋 Available Models:")
    for idx, entry in enumerate(models, 1):
        # Strings are model names; anything else is a model class instance.
        label = entry if isinstance(entry, str) else f"models.{entry.__class__.__name__}"
        print(f"  {idx}. {label}")


async def main():
    """
    Main interactive loop.

    Repeatedly reads a line from stdin. The commands 'screenshot', 'models',
    and 'quit'/'exit'/'q' are handled inline; any other non-empty input is
    treated as a click instruction and run against every available model
    using the most recent screenshot. Screenshots and per-session
    visualizations are saved under ``interactive_output/``.
    """
    print_header()

    # Get available models
    models = get_available_models()
    print_models(models)

    # Create output directory for visualizations
    output_dir = "interactive_output"
    os.makedirs(output_dir, exist_ok=True)

    # Session state: the last screenshot is reused for every instruction
    # until a new one is taken with the 'screenshot' command.
    session_count = 0
    last_screenshot = None
    screenshot_timestamp = None

    while True:
        try:
            # Get user input
            print(f"\n{'='*40}")
            user_input = input("🎯 Enter instruction (or command): ").strip()

            if not user_input:
                continue

            # Handle commands
            if user_input.lower() in ["quit", "exit", "q"]:
                print("👋 Goodbye!")
                break

            elif user_input.lower() == "models":
                print_models(models)
                continue

            elif user_input.lower() == "screenshot":
                print("📸 Taking screenshot...")
                try:
                    last_screenshot = take_screenshot()
                    screenshot_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    screenshot_path = os.path.join(
                        output_dir, f"screenshot_{screenshot_timestamp}.png"
                    )
                    last_screenshot.save(screenshot_path)
                    print(f"✅ Screenshot captured and saved to: {screenshot_path}")
                    print(f"📝 Ready for instructions! Screenshot size: {last_screenshot.size}")
                except Exception as e:
                    print(f"❌ Error taking screenshot: {e}")
                continue

            # Handle instruction input — requires a screenshot to already exist.
            if last_screenshot is None:
                print(
                    "⚠️  No screenshot available! Please take a screenshot first using 'screenshot' command."
                )
                continue

            session_count += 1
            print(f"\n🎯 Session {session_count}: '{user_input}'")
            print(f"📷 Using screenshot from: {screenshot_timestamp}")

            # Predict with all models using last screenshot
            print(f"\n🤖 Testing {len(models)} models on screenshot...")
            predictions = await predict_with_all_models(last_screenshot, user_input, models)

            # Display results summary
            print("\n📊 Results Summary:")
            print("-" * 50)
            for pred in predictions:
                if pred["coords"]:
                    print(f"✅ {pred['model_name']}: ({pred['coords'][0]}, {pred['coords'][1]})")
                elif pred["error"]:
                    print(f"❌ {pred['model_name']}: ERROR - {pred['error']}")
                else:
                    print(f"❌ {pred['model_name']}: No prediction")

            # Save visualization (best-effort: a failure here does not abort the session)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            vis_filename = f"session_{session_count:03d}_{timestamp}.png"
            vis_path = os.path.join(output_dir, vis_filename)

            try:
                save_prediction_visualization(last_screenshot, user_input, predictions, vis_path)
                print(f"\n💾 Visualization saved to: {vis_path}")
            except Exception as e:
                print(f"⚠️  Error saving visualization: {e}")

            print(f"\n✨ Session {session_count} completed!")

        except KeyboardInterrupt:
            print("\n\n👋 Interrupted by user. Goodbye!")
            break
        except Exception as e:
            # Keep the loop alive on unexpected errors so one bad session
            # doesn't end the tool.
            print(f"\n❌ Unexpected error: {e}")
            print("Continuing...")


if __name__ == "__main__":
    # Entry point: drive the async interactive loop, exiting quietly on Ctrl+C
    # and reporting (rather than tracebacking on) any other fatal error.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"❌ Fatal error: {e}")

```

--------------------------------------------------------------------------------
/.github/workflows/pypi-publish-agent.yml:
--------------------------------------------------------------------------------

```yaml
name: Publish Agent Package

on:
  push:
    tags:
      - "agent-v*"
  workflow_dispatch:
    inputs:
      version:
        description: "Version to publish (without v prefix)"
        required: true
        default: "0.1.0"
  workflow_call:
    inputs:
      version:
        description: "Version to publish"
        required: true
        type: string

# Adding permissions at workflow level
permissions:
  contents: write

jobs:
  prepare:
    runs-on: macos-latest
    outputs:
      version: ${{ steps.get-version.outputs.version }}
      computer_version: ${{ steps.update-deps.outputs.computer_version }}
      som_version: ${{ steps.update-deps.outputs.som_version }}
      core_version: ${{ steps.update-deps.outputs.core_version }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: main
          fetch-depth: 0

      - name: Ensure latest main branch
        run: |
          git fetch origin main
          git reset --hard origin/main
          echo "Current HEAD commit:"
          git log -1 --oneline

      - name: Determine version
        id: get-version
        run: |
          # Check inputs.version first (works for workflow_call regardless of event_name)
          if [ -n "${{ inputs.version }}" ]; then
            VERSION=${{ inputs.version }}
          elif [ "${{ github.event_name }}" == "push" ]; then
            # Extract version from tag (for package-specific tags)
            if [[ "${{ github.ref }}" =~ ^refs/tags/agent-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
              VERSION=${BASH_REMATCH[1]}
            else
              echo "ERROR: Invalid tag format for agent"
              exit 1
            fi
          elif [ -n "${{ github.event.inputs.version }}" ]; then
            VERSION=${{ github.event.inputs.version }}
          else
            echo "ERROR: No version found (inputs.version, event.inputs.version, and tag all empty)"
            exit 1
          fi

          echo "Agent version: $VERSION"
          echo "version=$VERSION" >> $GITHUB_OUTPUT

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      - name: Update dependencies to latest versions
        id: update-deps
        run: |
          cd libs/python/agent

          # Install required package for PyPI API access
          pip install requests

          # Create a more robust Python script for PyPI version checking
          cat > get_latest_versions.py << 'EOF'
          import requests
          import json
          import sys

          def get_package_version(package_name, fallback="0.1.0"):
              try:
                  response = requests.get(f'https://pypi.org/pypi/{package_name}/json')
                  print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr)
                  
                  if response.status_code != 200:
                      print(f"API request failed for {package_name}, using fallback version", file=sys.stderr)
                      return fallback
                  
                  data = json.loads(response.text)
                  
                  if 'info' not in data:
                      print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr)
                      return fallback
                      
                  return data['info']['version']
              except Exception as e:
                  print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr)
                  return fallback

          # Get latest versions
          print(get_package_version('cua-computer'))
          print(get_package_version('cua-som'))
          print(get_package_version('cua-core'))
          EOF

          # Execute the script to get the versions
          VERSIONS=($(python get_latest_versions.py))
          LATEST_COMPUTER=${VERSIONS[0]}
          LATEST_SOM=${VERSIONS[1]}
          LATEST_CORE=${VERSIONS[2]}

          echo "Latest cua-computer version: $LATEST_COMPUTER"
          echo "Latest cua-som version: $LATEST_SOM"
          echo "Latest cua-core version: $LATEST_CORE"

          # Output the versions for the next job
          echo "computer_version=$LATEST_COMPUTER" >> $GITHUB_OUTPUT
          echo "som_version=$LATEST_SOM" >> $GITHUB_OUTPUT
          echo "core_version=$LATEST_CORE" >> $GITHUB_OUTPUT

          # Determine major version for version constraint
          COMPUTER_MAJOR=$(echo $LATEST_COMPUTER | cut -d. -f1)
          SOM_MAJOR=$(echo $LATEST_SOM | cut -d. -f1)
          CORE_MAJOR=$(echo $LATEST_CORE | cut -d. -f1)

          NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1))
          NEXT_SOM_MAJOR=$((SOM_MAJOR + 1))
          NEXT_CORE_MAJOR=$((CORE_MAJOR + 1))

          # Update dependencies in pyproject.toml
          if [[ "$OSTYPE" == "darwin"* ]]; then
            # macOS version of sed needs an empty string for -i
            sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
            sed -i '' "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml
            sed -i '' "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml
          else
            # Linux version
            sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
            sed -i "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml
            sed -i "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml
          fi

          # Display the updated dependencies
          echo "Updated dependencies in pyproject.toml:"
          grep -E "cua-computer|cua-som|cua-core" pyproject.toml

  publish:
    needs: prepare
    uses: ./.github/workflows/pypi-reusable-publish.yml
    with:
      package_name: "agent"
      package_dir: "libs/python/agent"
      version: ${{ needs.prepare.outputs.version }}
      is_lume_package: false
      base_package_name: "cua-agent"
    secrets:
      PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}

  set-env-variables:
    needs: [prepare, publish]
    runs-on: macos-latest
    steps:
      - name: Set environment variables for use in other jobs
        run: |
          echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> $GITHUB_ENV
          echo "SOM_VERSION=${{ needs.prepare.outputs.som_version }}" >> $GITHUB_ENV
          echo "CORE_VERSION=${{ needs.prepare.outputs.core_version }}" >> $GITHUB_ENV

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py:
--------------------------------------------------------------------------------

```python
import asyncio
import functools
import warnings
from concurrent.futures import ThreadPoolExecutor
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional

from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm.types.utils import GenericStreamingChunk, ModelResponse

# Try to import HuggingFace dependencies
try:
    import torch
    from transformers import AutoModelForImageTextToText, AutoProcessor

    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False

from .models import load_model as load_model_handler


class HuggingFaceLocalAdapter(CustomLLM):
    """HuggingFace Local Adapter for running vision-language models locally.

    Bridges litellm's CustomLLM interface to locally-loaded HuggingFace
    models. Model handlers are created lazily per model name and cached;
    generation runs on a single-worker thread pool so the async entry points
    do not block the event loop.
    """

    def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
        """Initialize the adapter.

        Args:
            device: Device to load model on ("auto", "cuda", "cpu", etc.)
            trust_remote_code: Whether to trust remote code
            **kwargs: Additional arguments (currently unused)
        """
        super().__init__()
        self.device = device
        self.trust_remote_code = trust_remote_code
        # Cache for model handlers keyed by model_name
        self._handlers: Dict[str, Any] = {}
        # Single worker: serializes generate() calls, which are not
        # presumed safe to run concurrently on one model.
        self._executor = ThreadPoolExecutor(max_workers=1)

    def _get_handler(self, model_name: str):
        """Get or create (and cache) the model handler for *model_name*."""
        if model_name not in self._handlers:
            self._handlers[model_name] = load_model_handler(
                model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code
            )
        return self._handlers[model_name]

    def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert OpenAI format messages to HuggingFace format.

        Args:
            messages: Messages in OpenAI format

        Returns:
            Messages in HuggingFace format
        """
        converted_messages = []

        for message in messages:
            converted_message: Dict[str, Any] = {"role": message["role"], "content": []}

            content = message.get("content", [])
            if isinstance(content, str):
                # Simple text content
                converted_message["content"].append({"type": "text", "text": content})
            elif isinstance(content, list):
                # Multi-modal content
                for item in content:
                    if item.get("type") == "text":
                        converted_message["content"].append(
                            {"type": "text", "text": item.get("text", "")}
                        )
                    elif item.get("type") == "image_url":
                        # Convert image_url format to HF's {"type": "image"} format
                        image_url = item.get("image_url", {}).get("url", "")
                        converted_message["content"].append({"type": "image", "image": image_url})

            converted_messages.append(converted_message)

        return converted_messages

    def _generate(self, **kwargs) -> str:
        """Generate response using the local HuggingFace model.

        Args:
            **kwargs: Keyword arguments containing messages and model info

        Returns:
            Generated text response

        Raises:
            ImportError: If the HuggingFace transformers stack is not installed.
        """
        if not HF_AVAILABLE:
            raise ImportError(
                "HuggingFace transformers dependencies not found. "
                'Please install with: pip install "cua-agent[uitars-hf]"'
            )

        # Extract messages and model from kwargs
        messages = kwargs.get("messages", [])
        model_name = kwargs.get("model", "ByteDance-Seed/UI-TARS-1.5-7B")
        max_new_tokens = kwargs.get("max_tokens", 128)

        # Warn about ignored kwargs so callers know options were dropped.
        ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
        if ignored_kwargs:
            warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")

        # Convert messages to HuggingFace format
        hf_messages = self._convert_messages(messages)

        # Delegate to model handler
        handler = self._get_handler(model_name)
        generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
        return generated_text

    def completion(self, *args, **kwargs) -> ModelResponse:
        """Synchronous completion method.

        Returns:
            ModelResponse with generated text
        """
        generated_text = self._generate(**kwargs)

        return completion(
            model=f"huggingface-local/{kwargs['model']}",
            mock_response=generated_text,
        )

    async def acompletion(self, *args, **kwargs) -> ModelResponse:
        """Asynchronous completion method.

        Returns:
            ModelResponse with generated text
        """
        # Run _generate in the thread pool to avoid blocking the event loop.
        # get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() is deprecated here since Python 3.10.
        loop = asyncio.get_running_loop()
        generated_text = await loop.run_in_executor(
            self._executor, functools.partial(self._generate, **kwargs)
        )

        return await acompletion(
            model=f"huggingface-local/{kwargs['model']}",
            mock_response=generated_text,
        )

    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
        """Synchronous streaming method.

        Note: generation is not incremental; the full response is produced
        and yielded as a single terminal chunk.

        Returns:
            Iterator of GenericStreamingChunk
        """
        generated_text = self._generate(**kwargs)

        generic_streaming_chunk: GenericStreamingChunk = {
            "finish_reason": "stop",
            "index": 0,
            "is_finished": True,
            "text": generated_text,
            "tool_use": None,
            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
        }

        yield generic_streaming_chunk

    async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
        """Asynchronous streaming method.

        Note: generation is not incremental; the full response is produced
        and yielded as a single terminal chunk.

        Returns:
            AsyncIterator of GenericStreamingChunk
        """
        # Run _generate in the thread pool to avoid blocking the event loop
        # (see acompletion for why get_running_loop is used).
        loop = asyncio.get_running_loop()
        generated_text = await loop.run_in_executor(
            self._executor, functools.partial(self._generate, **kwargs)
        )

        generic_streaming_chunk: GenericStreamingChunk = {
            "finish_reason": "stop",
            "index": 0,
            "is_finished": True,
            "text": generated_text,
            "tool_use": None,
            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
        }

        yield generic_streaming_chunk

```
Page 6/20FirstPrevNextLast