This is page 8 of 28. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── scripts
│ │ ├── get_pyproject_version.py
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── README.md
│ │ └── test_get_pyproject_version.py
│ └── workflows
│ ├── bump-version.yml
│ ├── ci-lume.yml
│ ├── docker-publish-cua-linux.yml
│ ├── docker-publish-cua-windows.yml
│ ├── docker-publish-kasm.yml
│ ├── docker-publish-xfce.yml
│ ├── docker-reusable-publish.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── npm-publish-cli.yml
│ ├── npm-publish-computer.yml
│ ├── npm-publish-core.yml
│ ├── publish-lume.yml
│ ├── pypi-publish-agent.yml
│ ├── pypi-publish-computer-server.yml
│ ├── pypi-publish-computer.yml
│ ├── pypi-publish-core.yml
│ ├── pypi-publish-mcp-server.yml
│ ├── pypi-publish-som.yml
│ ├── pypi-reusable-publish.yml
│ ├── python-tests.yml
│ ├── test-cua-models.yml
│ └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│ ├── docs.code-workspace
│ ├── extensions.json
│ ├── launch.json
│ ├── libs-ts.code-workspace
│ ├── lume.code-workspace
│ ├── lumier.code-workspace
│ ├── py.code-workspace
│ └── settings.json
├── blog
│ ├── app-use.md
│ ├── assets
│ │ ├── composite-agents.png
│ │ ├── docker-ubuntu-support.png
│ │ ├── hack-booth.png
│ │ ├── hack-closing-ceremony.jpg
│ │ ├── hack-cua-ollama-hud.jpeg
│ │ ├── hack-leaderboard.png
│ │ ├── hack-the-north.png
│ │ ├── hack-winners.jpeg
│ │ ├── hack-workshop.jpeg
│ │ ├── hud-agent-evals.png
│ │ └── trajectory-viewer.jpeg
│ ├── bringing-computer-use-to-the-web.md
│ ├── build-your-own-operator-on-macos-1.md
│ ├── build-your-own-operator-on-macos-2.md
│ ├── cloud-windows-ga-macos-preview.md
│ ├── composite-agents.md
│ ├── computer-use-agents-for-growth-hacking.md
│ ├── cua-hackathon.md
│ ├── cua-playground-preview.md
│ ├── cua-vlm-router.md
│ ├── hack-the-north.md
│ ├── hud-agent-evals.md
│ ├── human-in-the-loop.md
│ ├── introducing-cua-cli.md
│ ├── introducing-cua-cloud-containers.md
│ ├── lume-to-containerization.md
│ ├── neurips-2025-cua-papers.md
│ ├── sandboxed-python-execution.md
│ ├── training-computer-use-models-trajectories-1.md
│ ├── trajectory-viewer.md
│ ├── ubuntu-docker-support.md
│ └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│ ├── .env.example
│ ├── .gitignore
│ ├── content
│ │ └── docs
│ │ ├── agent-sdk
│ │ │ ├── agent-loops.mdx
│ │ │ ├── benchmarks
│ │ │ │ ├── index.mdx
│ │ │ │ ├── interactive.mdx
│ │ │ │ ├── introduction.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── osworld-verified.mdx
│ │ │ │ ├── screenspot-pro.mdx
│ │ │ │ └── screenspot-v2.mdx
│ │ │ ├── callbacks
│ │ │ │ ├── agent-lifecycle.mdx
│ │ │ │ ├── cost-saving.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── pii-anonymization.mdx
│ │ │ │ └── trajectories.mdx
│ │ │ ├── chat-history.mdx
│ │ │ ├── custom-tools.mdx
│ │ │ ├── customizing-computeragent.mdx
│ │ │ ├── integrations
│ │ │ │ ├── hud.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── observability.mdx
│ │ │ ├── mcp-server
│ │ │ │ ├── client-integrations.mdx
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── llm-integrations.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── tools.mdx
│ │ │ │ └── usage.mdx
│ │ │ ├── message-format.mdx
│ │ │ ├── meta.json
│ │ │ ├── migration-guide.mdx
│ │ │ ├── prompt-caching.mdx
│ │ │ ├── supported-agents
│ │ │ │ ├── composed-agents.mdx
│ │ │ │ ├── computer-use-agents.mdx
│ │ │ │ ├── grounding-models.mdx
│ │ │ │ ├── human-in-the-loop.mdx
│ │ │ │ └── meta.json
│ │ │ ├── supported-model-providers
│ │ │ │ ├── cua-vlm-router.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ └── local-models.mdx
│ │ │ ├── telemetry.mdx
│ │ │ └── usage-tracking.mdx
│ │ ├── cli-playbook
│ │ │ ├── commands.mdx
│ │ │ ├── index.mdx
│ │ │ └── meta.json
│ │ ├── computer-sdk
│ │ │ ├── cloud-vm-management.mdx
│ │ │ ├── commands.mdx
│ │ │ ├── computer-server
│ │ │ │ ├── Commands.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── REST-API.mdx
│ │ │ │ └── WebSocket-API.mdx
│ │ │ ├── computer-ui.mdx
│ │ │ ├── computers.mdx
│ │ │ ├── custom-computer-handlers.mdx
│ │ │ ├── meta.json
│ │ │ ├── sandboxed-python.mdx
│ │ │ └── tracing-api.mdx
│ │ ├── example-usecases
│ │ │ ├── form-filling.mdx
│ │ │ ├── gemini-complex-ui-navigation.mdx
│ │ │ ├── meta.json
│ │ │ ├── post-event-contact-export.mdx
│ │ │ └── windows-app-behind-vpn.mdx
│ │ ├── get-started
│ │ │ ├── meta.json
│ │ │ └── quickstart.mdx
│ │ ├── index.mdx
│ │ ├── macos-vm-cli-playbook
│ │ │ ├── lume
│ │ │ │ ├── cli-reference.mdx
│ │ │ │ ├── faq.md
│ │ │ │ ├── http-api.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── prebuilt-images.mdx
│ │ │ ├── lumier
│ │ │ │ ├── building-lumier.mdx
│ │ │ │ ├── docker-compose.mdx
│ │ │ │ ├── docker.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ └── meta.json
│ │ │ └── meta.json
│ │ └── meta.json
│ ├── next.config.mjs
│ ├── package-lock.json
│ ├── package.json
│ ├── pnpm-lock.yaml
│ ├── postcss.config.mjs
│ ├── public
│ │ └── img
│ │ ├── agent_gradio_ui.png
│ │ ├── agent.png
│ │ ├── bg-dark.jpg
│ │ ├── bg-light.jpg
│ │ ├── cli.png
│ │ ├── computer.png
│ │ ├── grounding-with-gemini3.gif
│ │ ├── hero.png
│ │ ├── laminar_trace_example.png
│ │ ├── som_box_threshold.png
│ │ └── som_iou_threshold.png
│ ├── README.md
│ ├── source.config.ts
│ ├── src
│ │ ├── app
│ │ │ ├── (home)
│ │ │ │ ├── [[...slug]]
│ │ │ │ │ └── page.tsx
│ │ │ │ └── layout.tsx
│ │ │ ├── api
│ │ │ │ ├── posthog
│ │ │ │ │ └── [...path]
│ │ │ │ │ └── route.ts
│ │ │ │ └── search
│ │ │ │ └── route.ts
│ │ │ ├── favicon.ico
│ │ │ ├── global.css
│ │ │ ├── layout.config.tsx
│ │ │ ├── layout.tsx
│ │ │ ├── llms.mdx
│ │ │ │ └── [[...slug]]
│ │ │ │ └── route.ts
│ │ │ ├── llms.txt
│ │ │ │ └── route.ts
│ │ │ ├── robots.ts
│ │ │ └── sitemap.ts
│ │ ├── assets
│ │ │ ├── discord-black.svg
│ │ │ ├── discord-white.svg
│ │ │ ├── logo-black.svg
│ │ │ └── logo-white.svg
│ │ ├── components
│ │ │ ├── analytics-tracker.tsx
│ │ │ ├── cookie-consent.tsx
│ │ │ ├── doc-actions-menu.tsx
│ │ │ ├── editable-code-block.tsx
│ │ │ ├── footer.tsx
│ │ │ ├── hero.tsx
│ │ │ ├── iou.tsx
│ │ │ ├── mermaid.tsx
│ │ │ └── page-feedback.tsx
│ │ ├── lib
│ │ │ ├── llms.ts
│ │ │ └── source.ts
│ │ ├── mdx-components.tsx
│ │ └── providers
│ │ └── posthog-provider.tsx
│ └── tsconfig.json
├── examples
│ ├── agent_examples.py
│ ├── agent_ui_examples.py
│ ├── browser_tool_example.py
│ ├── cloud_api_examples.py
│ ├── computer_examples_windows.py
│ ├── computer_examples.py
│ ├── computer_ui_examples.py
│ ├── computer-example-ts
│ │ ├── .env.example
│ │ ├── .gitignore
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── README.md
│ │ ├── src
│ │ │ ├── helpers.ts
│ │ │ └── index.ts
│ │ └── tsconfig.json
│ ├── docker_examples.py
│ ├── evals
│ │ ├── hud_eval_examples.py
│ │ └── wikipedia_most_linked.txt
│ ├── pylume_examples.py
│ ├── sandboxed_functions_examples.py
│ ├── som_examples.py
│ ├── tracing_examples.py
│ ├── utils.py
│ └── winsandbox_example.py
├── img
│ ├── agent_gradio_ui.png
│ ├── agent.png
│ ├── cli.png
│ ├── computer.png
│ ├── logo_black.png
│ └── logo_white.png
├── libs
│ ├── kasm
│ │ ├── Dockerfile
│ │ ├── LICENSE
│ │ ├── README.md
│ │ └── src
│ │ └── ubuntu
│ │ └── install
│ │ └── firefox
│ │ ├── custom_startup.sh
│ │ ├── firefox.desktop
│ │ └── install_firefox.sh
│ ├── lume
│ │ ├── .cursorignore
│ │ ├── CONTRIBUTING.md
│ │ ├── Development.md
│ │ ├── img
│ │ │ └── cli.png
│ │ ├── Package.resolved
│ │ ├── Package.swift
│ │ ├── README.md
│ │ ├── resources
│ │ │ └── lume.entitlements
│ │ ├── scripts
│ │ │ ├── build
│ │ │ │ ├── build-debug.sh
│ │ │ │ ├── build-release-notarized.sh
│ │ │ │ └── build-release.sh
│ │ │ └── install.sh
│ │ ├── src
│ │ │ ├── Commands
│ │ │ │ ├── Clone.swift
│ │ │ │ ├── Config.swift
│ │ │ │ ├── Create.swift
│ │ │ │ ├── Delete.swift
│ │ │ │ ├── Get.swift
│ │ │ │ ├── Images.swift
│ │ │ │ ├── IPSW.swift
│ │ │ │ ├── List.swift
│ │ │ │ ├── Logs.swift
│ │ │ │ ├── Options
│ │ │ │ │ └── FormatOption.swift
│ │ │ │ ├── Prune.swift
│ │ │ │ ├── Pull.swift
│ │ │ │ ├── Push.swift
│ │ │ │ ├── Run.swift
│ │ │ │ ├── Serve.swift
│ │ │ │ ├── Set.swift
│ │ │ │ └── Stop.swift
│ │ │ ├── ContainerRegistry
│ │ │ │ ├── ImageContainerRegistry.swift
│ │ │ │ ├── ImageList.swift
│ │ │ │ └── ImagesPrinter.swift
│ │ │ ├── Errors
│ │ │ │ └── Errors.swift
│ │ │ ├── FileSystem
│ │ │ │ ├── Home.swift
│ │ │ │ ├── Settings.swift
│ │ │ │ ├── VMConfig.swift
│ │ │ │ ├── VMDirectory.swift
│ │ │ │ └── VMLocation.swift
│ │ │ ├── LumeController.swift
│ │ │ ├── Main.swift
│ │ │ ├── Server
│ │ │ │ ├── Handlers.swift
│ │ │ │ ├── HTTP.swift
│ │ │ │ ├── Requests.swift
│ │ │ │ ├── Responses.swift
│ │ │ │ └── Server.swift
│ │ │ ├── Utils
│ │ │ │ ├── CommandRegistry.swift
│ │ │ │ ├── CommandUtils.swift
│ │ │ │ ├── Logger.swift
│ │ │ │ ├── NetworkUtils.swift
│ │ │ │ ├── Path.swift
│ │ │ │ ├── ProcessRunner.swift
│ │ │ │ ├── ProgressLogger.swift
│ │ │ │ ├── String.swift
│ │ │ │ └── Utils.swift
│ │ │ ├── Virtualization
│ │ │ │ ├── DarwinImageLoader.swift
│ │ │ │ ├── DHCPLeaseParser.swift
│ │ │ │ ├── ImageLoaderFactory.swift
│ │ │ │ └── VMVirtualizationService.swift
│ │ │ ├── VM
│ │ │ │ ├── DarwinVM.swift
│ │ │ │ ├── LinuxVM.swift
│ │ │ │ ├── VM.swift
│ │ │ │ ├── VMDetails.swift
│ │ │ │ ├── VMDetailsPrinter.swift
│ │ │ │ ├── VMDisplayResolution.swift
│ │ │ │ └── VMFactory.swift
│ │ │ └── VNC
│ │ │ ├── PassphraseGenerator.swift
│ │ │ └── VNCService.swift
│ │ └── tests
│ │ ├── Mocks
│ │ │ ├── MockVM.swift
│ │ │ ├── MockVMVirtualizationService.swift
│ │ │ └── MockVNCService.swift
│ │ ├── VM
│ │ │ └── VMDetailsPrinterTests.swift
│ │ ├── VMTests.swift
│ │ ├── VMVirtualizationServiceTests.swift
│ │ └── VNCServiceTests.swift
│ ├── lumier
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── bin
│ │ │ └── entry.sh
│ │ ├── config
│ │ │ └── constants.sh
│ │ ├── hooks
│ │ │ └── on-logon.sh
│ │ └── lib
│ │ ├── utils.sh
│ │ └── vm.sh
│ ├── python
│ │ ├── agent
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── agent
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── adapters
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cua_adapter.py
│ │ │ │ │ ├── huggingfacelocal_adapter.py
│ │ │ │ │ ├── human_adapter.py
│ │ │ │ │ ├── mlxvlm_adapter.py
│ │ │ │ │ └── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ └── qwen2_5_vl.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── callbacks
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── budget_manager.py
│ │ │ │ │ ├── image_retention.py
│ │ │ │ │ ├── logging.py
│ │ │ │ │ ├── operator_validator.py
│ │ │ │ │ ├── pii_anonymization.py
│ │ │ │ │ ├── prompt_instructions.py
│ │ │ │ │ ├── telemetry.py
│ │ │ │ │ └── trajectory_saver.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── computers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cua.py
│ │ │ │ │ └── custom.py
│ │ │ │ ├── decorators.py
│ │ │ │ ├── human_tool
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── server.py
│ │ │ │ │ └── ui.py
│ │ │ │ ├── integrations
│ │ │ │ │ └── hud
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── agent.py
│ │ │ │ │ └── proxy.py
│ │ │ │ ├── loops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── anthropic.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── composed_grounded.py
│ │ │ │ │ ├── gelato.py
│ │ │ │ │ ├── gemini.py
│ │ │ │ │ ├── generic_vlm.py
│ │ │ │ │ ├── glm45v.py
│ │ │ │ │ ├── gta1.py
│ │ │ │ │ ├── holo.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── model_types.csv
│ │ │ │ │ ├── moondream3.py
│ │ │ │ │ ├── omniparser.py
│ │ │ │ │ ├── openai.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ ├── uiins.py
│ │ │ │ │ ├── uitars.py
│ │ │ │ │ └── uitars2.py
│ │ │ │ ├── proxy
│ │ │ │ │ ├── examples.py
│ │ │ │ │ └── handlers.py
│ │ │ │ ├── responses.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── browser_tool.py
│ │ │ │ ├── types.py
│ │ │ │ └── ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ └── gradio
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ └── ui_components.py
│ │ │ ├── benchmarks
│ │ │ │ ├── .gitignore
│ │ │ │ ├── contrib.md
│ │ │ │ ├── interactive.py
│ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── gta1.py
│ │ │ │ ├── README.md
│ │ │ │ ├── ss-pro.py
│ │ │ │ ├── ss-v2.py
│ │ │ │ └── utils.py
│ │ │ ├── example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer_agent.py
│ │ ├── bench-ui
│ │ │ ├── bench_ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ └── child.py
│ │ │ ├── examples
│ │ │ │ ├── folder_example.py
│ │ │ │ ├── gui
│ │ │ │ │ ├── index.html
│ │ │ │ │ ├── logo.svg
│ │ │ │ │ └── styles.css
│ │ │ │ ├── output_overlay.png
│ │ │ │ └── simple_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ └── test_port_detection.py
│ │ ├── computer
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── computer.py
│ │ │ │ ├── diorama_computer.py
│ │ │ │ ├── helpers.py
│ │ │ │ ├── interface
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── models.py
│ │ │ │ ├── providers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cloud
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── docker
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── lume
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── lume_api.py
│ │ │ │ │ ├── lumier
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── types.py
│ │ │ │ │ └── winsandbox
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── provider.py
│ │ │ │ │ └── setup_script.ps1
│ │ │ │ ├── tracing_wrapper.py
│ │ │ │ ├── tracing.py
│ │ │ │ ├── ui
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ └── gradio
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── app.py
│ │ │ │ └── utils.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer.py
│ │ ├── computer-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── browser.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── diorama
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── diorama_computer.py
│ │ │ │ │ ├── diorama.py
│ │ │ │ │ ├── draw.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── safezone.py
│ │ │ │ ├── handlers
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── main.py
│ │ │ │ ├── server.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wallpaper.py
│ │ │ │ └── watchdog.py
│ │ │ ├── examples
│ │ │ │ ├── __init__.py
│ │ │ │ └── usage_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ ├── run_server.py
│ │ │ ├── test_connection.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_server.py
│ │ ├── core
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── core
│ │ │ │ ├── __init__.py
│ │ │ │ └── telemetry
│ │ │ │ ├── __init__.py
│ │ │ │ └── posthog.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_telemetry.py
│ │ ├── mcp-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── build-extension.py
│ │ │ ├── CONCURRENT_SESSIONS.md
│ │ │ ├── desktop-extension
│ │ │ │ ├── cua-extension.mcpb
│ │ │ │ ├── desktop_extension.png
│ │ │ │ ├── manifest.json
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_server.sh
│ │ │ │ └── setup.py
│ │ │ ├── mcp_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── server.py
│ │ │ │ └── session_manager.py
│ │ │ ├── pdm.lock
│ │ │ ├── pyproject.toml
│ │ │ ├── QUICK_TEST_COMMANDS.sh
│ │ │ ├── quick_test_local_option.py
│ │ │ ├── README.md
│ │ │ ├── scripts
│ │ │ │ ├── install_mcp_server.sh
│ │ │ │ └── start_mcp_server.sh
│ │ │ ├── test_mcp_server_local_option.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_mcp_server.py
│ │ ├── pylume
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_pylume.py
│ │ └── som
│ │ ├── .bumpversion.cfg
│ │ ├── LICENSE
│ │ ├── poetry.toml
│ │ ├── pyproject.toml
│ │ ├── README.md
│ │ ├── som
│ │ │ ├── __init__.py
│ │ │ ├── detect.py
│ │ │ ├── detection.py
│ │ │ ├── models.py
│ │ │ ├── ocr.py
│ │ │ ├── util
│ │ │ │ └── utils.py
│ │ │ └── visualization.py
│ │ └── tests
│ │ ├── conftest.py
│ │ └── test_omniparser.py
│ ├── qemu-docker
│ │ ├── linux
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── src
│ │ │ ├── entry.sh
│ │ │ └── vm
│ │ │ ├── image
│ │ │ │ └── README.md
│ │ │ └── setup
│ │ │ ├── install.sh
│ │ │ ├── setup-cua-server.sh
│ │ │ └── setup.sh
│ │ ├── README.md
│ │ └── windows
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── entry.sh
│ │ └── vm
│ │ ├── image
│ │ │ └── README.md
│ │ └── setup
│ │ ├── install.bat
│ │ ├── on-logon.ps1
│ │ ├── setup-cua-server.ps1
│ │ ├── setup-utils.psm1
│ │ └── setup.ps1
│ ├── typescript
│ │ ├── .gitignore
│ │ ├── .nvmrc
│ │ ├── agent
│ │ │ ├── examples
│ │ │ │ ├── playground-example.html
│ │ │ │ └── README.md
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── client.ts
│ │ │ │ ├── index.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ └── client.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── computer
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── computer
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── providers
│ │ │ │ │ │ ├── base.ts
│ │ │ │ │ │ ├── cloud.ts
│ │ │ │ │ │ └── index.ts
│ │ │ │ │ └── types.ts
│ │ │ │ ├── index.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── base.ts
│ │ │ │ │ ├── factory.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── linux.ts
│ │ │ │ │ ├── macos.ts
│ │ │ │ │ └── windows.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ ├── computer
│ │ │ │ │ └── cloud.test.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── factory.test.ts
│ │ │ │ │ ├── index.test.ts
│ │ │ │ │ ├── linux.test.ts
│ │ │ │ │ ├── macos.test.ts
│ │ │ │ │ └── windows.test.ts
│ │ │ │ └── setup.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── core
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── index.ts
│ │ │ │ └── telemetry
│ │ │ │ ├── clients
│ │ │ │ │ ├── index.ts
│ │ │ │ │ └── posthog.ts
│ │ │ │ └── index.ts
│ │ │ ├── tests
│ │ │ │ └── telemetry.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── cua-cli
│ │ │ ├── .gitignore
│ │ │ ├── .prettierrc
│ │ │ ├── bun.lock
│ │ │ ├── CLAUDE.md
│ │ │ ├── index.ts
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── auth.ts
│ │ │ │ ├── cli.ts
│ │ │ │ ├── commands
│ │ │ │ │ ├── auth.ts
│ │ │ │ │ └── sandbox.ts
│ │ │ │ ├── config.ts
│ │ │ │ ├── http.ts
│ │ │ │ ├── storage.ts
│ │ │ │ └── util.ts
│ │ │ └── tsconfig.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── pnpm-workspace.yaml
│ │ └── README.md
│ └── xfce
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Development.md
│ ├── Dockerfile
│ ├── Dockerfile.dev
│ ├── README.md
│ └── src
│ ├── scripts
│ │ ├── resize-display.sh
│ │ ├── start-computer-server.sh
│ │ ├── start-novnc.sh
│ │ ├── start-vnc.sh
│ │ └── xstartup.sh
│ ├── supervisor
│ │ └── supervisord.conf
│ └── xfce-config
│ ├── helpers.rc
│ ├── xfce4-power-manager.xml
│ └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│ ├── agent_nb.ipynb
│ ├── blog
│ │ ├── build-your-own-operator-on-macos-1.ipynb
│ │ └── build-your-own-operator-on-macos-2.ipynb
│ ├── composite_agents_docker_nb.ipynb
│ ├── computer_nb.ipynb
│ ├── computer_server_nb.ipynb
│ ├── customizing_computeragent.ipynb
│ ├── eval_osworld.ipynb
│ ├── ollama_nb.ipynb
│ ├── README.md
│ ├── sota_hackathon_cloud.ipynb
│ └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── install-cli.ps1
│ ├── install-cli.sh
│ ├── playground-docker.sh
│ ├── playground.sh
│ ├── run-docker-dev.sh
│ └── typescript-typecheck.js
├── TESTING.md
├── tests
│ ├── agent_loop_testing
│ │ ├── agent_test.py
│ │ └── README.md
│ ├── pytest.ini
│ ├── shell_cmd.py
│ ├── test_files.py
│ ├── test_mcp_server_session_management.py
│ ├── test_mcp_server_streaming.py
│ ├── test_shell_bash.py
│ ├── test_telemetry.py
│ ├── test_tracing.py
│ ├── test_venv.py
│ └── test_watchdog.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/libs/python/agent/agent/integrations/hud/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """HUD integration: dataset runners and MCP-based computer agent export.
2 |
3 | This module exposes helpers to evaluate HUD-compatible datasets and exports
4 | the MCP-compatible computer agent implementation.
5 |
6 | Exports:
7 | - run_single_task(dataset, ...)
8 | - run_full_dataset(dataset, ...)
9 | - MCPComputerAgent
10 | """
11 |
12 | import time
13 | from typing import Any, Optional
14 |
15 | from agent.computers import is_agent_computer
16 | from datasets import Dataset, load_dataset
17 | from hud import trace
18 | from hud.datasets import Task, run_dataset
19 |
20 | from .agent import MCPComputerAgent
21 |
22 | # ---------------------------------------------------------------------------
23 | # Single-task runner
24 | # ---------------------------------------------------------------------------
25 |
26 |
async def run_single_task(
    dataset: str | Dataset | list[dict[str, Any]],
    *,
    task_id: int = 0,
    model: str | None = None,
    allowed_tools: list[str] | None = None,
    # === ComputerAgent kwargs ===
    tools: list[Any] | None = None,
    custom_loop: Any | None = None,
    only_n_most_recent_images: int | None = None,
    callbacks: list[Any] | None = None,
    instructions: str | None = None,
    verbosity: int | None = None,
    trajectory_dir: str | dict | None = None,
    max_retries: int | None = 3,
    screenshot_delay: float | int = 0.5,
    use_prompt_caching: bool | None = False,
    max_trajectory_budget: float | dict | None = None,
    telemetry_enabled: bool | None = True,
) -> None:
    """Load one task from the dataset and execute it with MCPComputerAgent.

    Args:
        dataset: A dataset identifier string (loaded via ``load_dataset`` with
            ``split="train"``), an already-loaded ``Dataset``, a list of task
            dicts, or any other container indexable by the split name
            ``"train"`` (presumably a ``datasets.DatasetDict`` - confirm with
            callers).
        task_id: Index of the sample to run within the resolved dataset.
        model: Model name; falls back to ``"computer-use-preview"`` when falsy.
        allowed_tools: Tool names handed to the agent; falls back to
            ``["openai_computer"]`` when falsy.
        tools, custom_loop, only_n_most_recent_images, callbacks, instructions,
        verbosity, trajectory_dir, max_retries, screenshot_delay,
        use_prompt_caching, max_trajectory_budget, telemetry_enabled:
            Forwarded unchanged to ``MCPComputerAgent`` (after computer tools
            have been filtered out of ``tools``).

    Returns:
        None. The task prompt and the resulting reward are printed.
    """

    # Resolve the input into something indexable by ``task_id``.
    if isinstance(dataset, str):
        dataset = load_dataset(dataset, split="train")  # type: ignore[arg-type]
    elif not isinstance(dataset, (list, Dataset)):
        # Split-keyed container: pick the "train" split. A plain ``Dataset`` or
        # a list is already a sequence of samples and must be used as-is;
        # indexing a ``Dataset`` with a string would select a column instead.
        dataset = dataset["train"]

    sample_task = dataset[task_id]  # type: ignore[index]
    task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]

    # Filter any existing Computer tools
    # The eval framework will add its own Computer tool per task
    if tools:
        tools = [tool for tool in tools if not is_agent_computer(tool)]

    with trace(name=task_prompt):
        task = Task(**sample_task)  # type: ignore[arg-type]

        agent = MCPComputerAgent(
            model=model or "computer-use-preview",
            allowed_tools=allowed_tools or ["openai_computer"],
            # === ComputerAgent kwargs passthrough ===
            tools=tools,
            custom_loop=custom_loop,
            only_n_most_recent_images=only_n_most_recent_images,
            callbacks=callbacks,
            instructions=instructions,
            verbosity=verbosity,
            trajectory_dir=trajectory_dir,
            max_retries=max_retries,
            screenshot_delay=screenshot_delay,
            use_prompt_caching=use_prompt_caching,
            max_trajectory_budget=max_trajectory_budget,
            telemetry_enabled=telemetry_enabled,
        )
        print(f"Running: {task_prompt}")
        # Single-task runs are capped at 10 agent steps.
        result = await agent.run(task, max_steps=10)
        print(f"✅ Reward: {result.reward}")
88 |
89 |
90 | # ---------------------------------------------------------------------------
91 | # Full-dataset runner
92 | # ---------------------------------------------------------------------------
93 |
94 |
async def run_full_dataset(
    dataset: str | Dataset | list[dict[str, Any]],
    *,
    job_name: Optional[str] = None,
    model: str | None = None,
    allowed_tools: list[str] | None = None,
    max_concurrent: int = 30,
    max_steps: int = 50,
    split: str = "train",
    trajectory_dir: str | dict | None = None,
    # === ComputerAgent kwargs ===
    tools: list[Any] | None = None,
    custom_loop: Any | None = None,
    only_n_most_recent_images: int | None = 5,
    callbacks: list[Any] | None = None,
    instructions: str | None = None,
    verbosity: int | None = None,
    max_retries: int | None = 3,
    screenshot_delay: float | int = 0.5,
    use_prompt_caching: bool | None = False,
    max_trajectory_budget: float | dict | None = None,
    telemetry_enabled: bool | None = True,
) -> list[Any]:
    """Evaluate every task of a dataset through ``hud.datasets.run_dataset``.

    A string ``dataset`` is treated as an identifier and loaded with
    ``load_dataset(dataset, split=split)``; anything else is handed to
    ``run_dataset`` untouched and labelled ``"custom"`` in the job metadata.
    When ``job_name`` is not given, one is derived from the dataset name (for
    identifiers) or from the current local time (for in-memory datasets).

    All remaining keyword arguments are bundled into the ``agent_config``
    passed to ``run_dataset`` together with ``MCPComputerAgent`` as the agent
    class. Computer tools found in ``tools`` are dropped first.

    Returns:
        Whatever ``run_dataset`` returns.
    """

    if isinstance(dataset, str):
        # Use the last path component of the identifier as the display name.
        dataset_name = dataset.rsplit("/", 1)[-1]
        if not job_name:
            job_name = f"Evaluation {dataset_name}"
        dataset = load_dataset(dataset, split=split)  # type: ignore[arg-type]
    else:
        dataset_name = "custom"
        if not job_name:
            job_name = f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"

    # The eval framework attaches its own Computer tool per task, so strip
    # any computer tools the caller supplied.
    if tools:
        tools = [candidate for candidate in tools if not is_agent_computer(candidate)]

    # Constructor arguments for each MCPComputerAgent created by run_dataset.
    agent_config = dict(
        model=model,
        allowed_tools=allowed_tools,
        trajectory_dir=trajectory_dir,
        # === ComputerAgent kwargs passthrough ===
        tools=tools,
        custom_loop=custom_loop,
        only_n_most_recent_images=only_n_most_recent_images,
        callbacks=callbacks,
        instructions=instructions,
        verbosity=verbosity,
        max_retries=max_retries,
        screenshot_delay=screenshot_delay,
        use_prompt_caching=use_prompt_caching,
        max_trajectory_budget=max_trajectory_budget,
        telemetry_enabled=telemetry_enabled,
    )

    results = await run_dataset(
        name=job_name,
        dataset=dataset,
        agent_class=MCPComputerAgent,
        agent_config=agent_config,
        max_concurrent=max_concurrent,
        metadata={"dataset": dataset_name},
        max_steps=max_steps,
        auto_respond=True,
    )
    return results
161 |
162 |
163 | __all__ = [
164 | "run_single_task",
165 | "run_full_dataset",
166 | "MCPComputerAgent",
167 | ]
168 |
```
--------------------------------------------------------------------------------
/libs/lume/tests/VMTests.swift:
--------------------------------------------------------------------------------
```swift
1 | import Foundation
2 | import Testing
3 |
4 | @testable import lume
5 |
/// Test double for `ProcessRunner` that records each requested invocation
/// instead of launching a process.
class MockProcessRunner: ProcessRunner {
    /// Every `run` request received so far, in call order.
    var runCalls: [(executable: String, arguments: [String])] = []

    /// Records the requested executable and arguments; nothing is executed
    /// and no error is ever thrown.
    func run(executable: String, arguments: [String]) throws {
        let invocation = (executable: executable, arguments: arguments)
        runCalls.append(invocation)
    }
}
13 |
/// Fills `tempDir` with the files the tests expect in a VM directory:
/// a zeroed disk image, a zeroed NVRAM file, a saved config and an
/// `.initialized` marker. Returns the `VMDirectory` wrapping that path.
private func setupVMDirectory(_ tempDir: URL) throws -> VMDirectory {
    let directory = VMDirectory(Path(tempDir.path))

    // 1 MB of zeros stands in for the disk image.
    let mockDisk = Data(repeating: 0, count: 1024 * 1024)
    try mockDisk.write(to: directory.diskPath.url)

    // 1 KB of zeros stands in for the NVRAM.
    let mockNvram = Data(repeating: 0, count: 1024)
    try mockNvram.write(to: directory.nvramPath.url)

    // Persist an initial configuration with a fixed MAC address.
    var initialConfig = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    initialConfig.setMacAddress("00:11:22:33:44:55")
    try directory.saveConfig(initialConfig)

    // An empty `.initialized` file marks the VM as initialized.
    let marker = directory.dir.file(".initialized")
    try Data().write(to: marker.url)

    return directory
}
44 |
/// Verifies that a freshly constructed mock VM reports the expected initial
/// details: its directory name, the configured OS, a "stopped" status and no
/// VNC URL.
@MainActor
@Test("VM initialization and configuration")
func testVMInitialization() async throws {
    let tempDir = try createTempDirectory()
    // Remove the scratch directory when the test ends so repeated runs do not
    // accumulate mock disk images in the system temp directory.
    defer { try? FileManager.default.removeItem(at: tempDir) }

    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")  // Set MAC address to avoid nil
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    // Test initial state
    let details = vm.details
    #expect(details.name == vmDir.name)
    #expect(details.os == "mock-os")
    #expect(details.status == "stopped")
    #expect(details.vncUrl == nil)
}
74 |
/// Starts a mock VM in a background task, waits one second, then stops it
/// and cancels the background task. The test passes when neither the sleep
/// nor `stop()` throws.
@MainActor
@Test("VM run and stop operations")
func testVMRunAndStop() async throws {
    let scratchDir = try createTempDirectory()
    let vmDirectory = try setupVMDirectory(scratchDir)

    var vmConfig = try VMConfig(
        os: "mock-os",
        cpuCount: 2,
        memorySize: 2048,
        diskSize: 1024,
        display: "1024x768"
    )
    vmConfig.setMacAddress("00:11:22:33:44:55")

    let context = VMDirContext(
        dir: vmDirectory,
        config: vmConfig,
        home: Home(fileManager: FileManager.default),
        storage: nil
    )
    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    // Launch the VM without blocking the test body.
    let backgroundRun = Task {
        try await vm.run(
            noDisplay: false,
            sharedDirectories: [],
            mount: nil as Path?,
            vncPort: 0,
            recoveryMode: false
        )
    }

    // Give the VM one second to start.
    let startupDelay: UInt64 = 1_000_000_000
    try await Task.sleep(nanoseconds: startupDelay)

    // Stop the VM, then cancel the task that was driving it.
    try await vm.stop()
    backgroundRun.cancel()
}
111 |
/// Verifies that the CPU count, memory size and MAC address setters on the
/// VM are reflected in `vmDirContext.config`.
@MainActor
@Test("VM configuration updates")
func testVMConfigurationUpdates() async throws {
    let tempDir = try createTempDirectory()
    // Remove the scratch directory when the test ends so repeated runs do not
    // accumulate mock disk images in the system temp directory.
    defer { try? FileManager.default.removeItem(at: tempDir) }

    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    // Test CPU count update
    try vm.setCpuCount(4)
    #expect(vm.vmDirContext.config.cpuCount == 4)

    // Test memory size update
    try vm.setMemorySize(4096)
    #expect(vm.vmDirContext.config.memorySize == 4096)

    // Test MAC address update
    try vm.setMacAddress("00:11:22:33:44:66")
    #expect(vm.vmDirContext.config.macAddress == "00:11:22:33:44:66")
}
146 |
/// Runs `setup` on a mock VM and verifies that the resulting config carries
/// the requested CPU count, memory size and disk size, and that the MAC
/// address set before setup is still present.
@MainActor
@Test("VM setup process")
func testVMSetup() async throws {
    let tempDir = try createTempDirectory()
    // Remove the scratch directory when the test ends so repeated runs do not
    // accumulate mock disk images in the system temp directory.
    defer { try? FileManager.default.removeItem(at: tempDir) }

    let vmDir = try setupVMDirectory(tempDir)
    var config = try VMConfig(
        os: "mock-os",
        cpuCount: 1,
        memorySize: 1024,
        diskSize: 1024,
        display: "1024x768"
    )
    config.setMacAddress("00:11:22:33:44:55")
    let home = Home(fileManager: FileManager.default)
    let context = VMDirContext(dir: vmDir, config: config, home: home, storage: nil)

    let vm = MockVM(
        vmDirContext: context,
        virtualizationServiceFactory: { _ in MockVMVirtualizationService() },
        vncServiceFactory: { MockVNCService(vmDirectory: $0) }
    )

    let expectedDiskSize: UInt64 = 64 * 1024 * 1024 * 1024  // 64 GB

    try await vm.setup(
        ipswPath: "/path/to/mock.ipsw",
        cpuCount: 2,
        memorySize: 2048,
        diskSize: expectedDiskSize,
        display: "1024x768"
    )

    #expect(vm.vmDirContext.config.cpuCount == 2)
    #expect(vm.vmDirContext.config.memorySize == 2048)
    // diskSize is optional on the config; treat a missing value as 0 so the
    // comparison below fails with a readable message instead of unwrapping nil.
    let actualDiskSize = vm.vmDirContext.config.diskSize ?? 0
    #expect(
        actualDiskSize == expectedDiskSize,
        "Expected disk size \(expectedDiskSize), but got \(actualDiskSize)")
    #expect(vm.vmDirContext.config.macAddress == "00:11:22:33:44:55")
}
187 |
/// Creates a uniquely named directory under the system temporary directory
/// and returns its URL.
private func createTempDirectory() throws -> URL {
    let fileManager = FileManager.default
    let uniqueName = UUID().uuidString
    let directoryURL = fileManager.temporaryDirectory.appendingPathComponent(uniqueName)
    try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true)
    return directoryURL
}
193 |
```
--------------------------------------------------------------------------------
/tests/agent_loop_testing/agent_test.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Simple CUA Agent Test
4 |
5 | Tests the actual CUA ComputerAgent SDK with a mock computer.
6 | Only provides screenshot functionality - no complex computer actions.
7 | """
8 |
9 | import asyncio
10 | import base64
11 | import sys
12 | from io import BytesIO
13 | from pathlib import Path
14 |
15 | from PIL import Image, ImageDraw
16 |
17 | # Add project root to path
18 | project_root = Path(__file__).parent.parent.parent
19 | sys.path.insert(0, str(project_root))
20 |
21 |
class MockComputer:
    """Stand-in computer whose only real capability is returning a screenshot.

    The screenshot is rendered once at construction time. Every input action
    is a short sleep with no other effect.
    """

    _SCREEN_SIZE = (1920, 1080)
    _ACTION_DELAY = 0.1  # seconds each no-op action sleeps

    def __init__(self):
        self.action_count = 0
        self._image = self._create_image()

    def _create_image(self) -> str:
        """Draw the fake desktop and return it as a base64-encoded PNG string."""
        canvas = Image.new("RGB", self._SCREEN_SIZE, color="lightblue")
        pen = ImageDraw.Draw(canvas)

        # (left edge, fill colour, label) for the Safari and Terminal icons
        for left, fill, label in ((100, "blue", "Safari"), (200, "green", "Terminal")):
            pen.rectangle([left, 950, left + 50, 1000], fill=fill, outline="black", width=2)
            pen.text((left + 10, 960), label, fill="white")

        png = BytesIO()
        canvas.save(png, format="PNG")
        return base64.b64encode(png.getvalue()).decode("utf-8")

    async def _pause(self) -> None:
        """Sleep for the fixed no-op action delay."""
        await asyncio.sleep(self._ACTION_DELAY)

    async def screenshot(self) -> str:
        """Return the pre-rendered screenshot and count the call."""
        self.action_count += 1
        return self._image

    async def get_dimensions(self) -> tuple[int, int]:
        return self._SCREEN_SIZE

    # ---- no-op actions (present only to satisfy the computer interface) ----

    async def click(self, x: int, y: int, button: str = "left") -> None:
        await self._pause()

    async def double_click(self, x: int, y: int) -> None:
        await self._pause()

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        await self._pause()

    async def type(self, text: str) -> None:
        await self._pause()

    async def wait(self, ms: int = 1000) -> None:
        # Unlike the other actions, the delay here is caller-controlled.
        await asyncio.sleep(ms / 1000.0)

    async def move(self, x: int, y: int) -> None:
        await self._pause()

    async def keypress(self, keys) -> None:
        await self._pause()

    async def drag(self, path) -> None:
        await self._pause()

    async def get_current_url(self) -> str:
        return "desktop://mock"

    async def get_environment(self) -> str:
        return "mac"

    # ---- low-level mouse / keyboard events, also no-ops ----

    async def left_mouse_down(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def left_mouse_up(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def right_mouse_down(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def right_mouse_up(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def mouse_move(self, x: int, y: int) -> None:
        await self._pause()

    async def key_down(self, key: str) -> None:
        await self._pause()

    async def key_up(self, key: str) -> None:
        await self._pause()

    async def type_text(self, text: str) -> None:
        await self._pause()
109 |
110 |
111 | async def test_cua_agent(model_name: str):
112 | """Test CUA agent with mock computer."""
113 | print(f"🤖 Testing CUA Agent: {model_name}")
114 | print("=" * 50)
115 |
116 | try:
117 | # Import the real CUA agent
118 | from agent import ComputerAgent
119 |
120 | # Create mock computer
121 | mock_computer = MockComputer()
122 |
123 | # Create the real CUA ComputerAgent
124 | agent = ComputerAgent(model=model_name, tools=[mock_computer], max_trajectory_budget=5.0)
125 |
126 | print("✅ CUA Agent created")
127 | print("✅ Mock computer ready")
128 | print("🚀 Running agent...")
129 | print()
130 |
131 | # Run the agent with a specific task
132 | message = "Open Safari browser"
133 |
134 | iteration = 0
135 | async for result in agent.run([{"role": "user", "content": message}]):
136 | iteration += 1
137 | print(f"Iteration {iteration}:")
138 |
139 | # Print agent output
140 | output_items = result.get("output", [])
141 | if not output_items:
142 | print(" (No output from agent)")
143 | else:
144 | for item in output_items:
145 | if item["type"] == "message":
146 | print(f" Agent: {item['content'][0]['text']}")
147 | elif item["type"] == "tool_call":
148 | print(f" Tool: {item.get('tool_name')} {item.get('arguments')}")
149 | else:
150 | print(f" Unknown output type: {item}")
151 |
152 | # Debug: print full result for empty iterations
153 | if not output_items:
154 | print(f" Debug - Full result: {result}")
155 |
156 | # Let the agent decide when to stop (it should try to complete the task)
157 | # Only stop after 5 iterations to prevent infinite loops
158 | if iteration >= 5:
159 | print("🏁 Stopping after 5 iterations (safety limit)")
160 | break
161 |
162 | print()
163 | print("=" * 50)
164 | print("🎉 TEST COMPLETE!")
165 | print("=" * 50)
166 | print(f"✅ Model: {model_name}")
167 | print(f"✅ Iterations: {iteration}")
168 | print(f"✅ Screenshots: {mock_computer.action_count}")
169 | print("✅ Agent executed successfully")
170 |
171 | return True
172 |
173 | except ImportError as e:
174 | print(f"❌ Import error: {e}")
175 | print("💡 Install CUA: pip install -e libs/python/agent -e libs/python/computer")
176 | return False
177 | except Exception as e:
178 | print(f"❌ Test failed: {e}")
179 | return False
180 |
181 |
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command-line entry point: pick the model, run once, exit 0 on success and 1 on failure.
    cli = ArgumentParser(description="Test CUA Agent with mock computer")
    cli.add_argument(
        "--model", default="anthropic/claude-sonnet-4-5-20250929", help="CUA model to test"
    )
    options = cli.parse_args()

    passed = asyncio.run(test_cua_agent(options.model))
    sys.exit(0 if passed else 1)
193 |
```
--------------------------------------------------------------------------------
/libs/python/bench-ui/bench_ui/api.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import os
3 | import subprocess
4 | import sys
5 | import tempfile
6 | import time
7 | from pathlib import Path
8 | from typing import Any, Dict, Optional
9 | from urllib import request
10 | from urllib.error import HTTPError, URLError
11 |
12 | import psutil
13 |
14 | # Map child PID -> listening port
15 | _pid_to_port: Dict[int, int] = {}
16 |
17 |
18 | def _post_json(url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
19 | data = json.dumps(payload).encode("utf-8")
20 | req = request.Request(
21 | url, data=data, headers={"Content-Type": "application/json"}, method="POST"
22 | )
23 | try:
24 | with request.urlopen(req, timeout=5) as resp:
25 | text = resp.read().decode("utf-8")
26 | return json.loads(text)
27 | except HTTPError as e:
28 | try:
29 | body = (e.read() or b"").decode("utf-8", errors="ignore")
30 | return json.loads(body)
31 | except Exception:
32 | return {"error": "http_error", "status": getattr(e, "code", None)}
33 | except URLError as e:
34 | return {"error": "url_error", "reason": str(e.reason)}
35 |
36 |
37 | def _detect_port_for_pid(pid: int) -> int:
38 | """Detect a listening local TCP port for the given PID using psutil.
39 |
40 | Fails fast if psutil is unavailable or if no suitable port is found.
41 | """
42 | if psutil is None:
43 | raise RuntimeError("psutil is required for PID->port detection. Please install psutil.")
44 |
45 | # Scan system-wide connections and filter by PID
46 | for c in psutil.net_connections(kind="tcp"):
47 | if getattr(c, "pid", None) != pid:
48 | continue
49 | laddr = getattr(c, "laddr", None)
50 | status = str(getattr(c, "status", ""))
51 | if not laddr or not isinstance(laddr, tuple) or len(laddr) < 2:
52 | continue
53 | lip, lport = laddr[0], int(laddr[1])
54 | if status.upper() != "LISTEN":
55 | continue
56 | if lip in ("127.0.0.1", "::1", "0.0.0.0", "::"):
57 | return lport
58 |
59 | raise RuntimeError(f"Could not detect listening port for pid {pid}")
60 |
61 |
def launch_window(
    url: Optional[str] = None,
    *,
    html: Optional[str] = None,
    folder: Optional[str] = None,
    title: str = "Window",
    x: Optional[int] = None,
    y: Optional[int] = None,
    width: int = 600,
    height: int = 400,
    icon: Optional[str] = None,
    use_inner_size: bool = False,
    title_bar_style: str = "default",
) -> int:
    """Create a pywebview window in a child process and return its PID.

    Preferred input is a URL via the positional `url` parameter.
    To load inline HTML instead, pass `html=...`.
    To serve a static folder, pass `folder=...` (path to directory).

    Spawns `python -m bench_ui.child` with a JSON config passed via a temp file.
    The child is expected to print a JSON line: {"pid": <pid>, "port": <port>}.
    Because the child's stderr is merged into stdout, any lines printed before
    that JSON line (warnings, log output) are skipped rather than treated as
    the startup message. The pid->port mapping is cached for subsequent
    control calls like get_element_rect.

    Raises:
        ValueError: if none of `url`, `html` or `folder` is given.
        RuntimeError: if the child closes its output without reporting a port;
            the message includes whatever the child printed.
    """
    if not url and not html and not folder:
        raise ValueError("launch_window requires either a url, html, or folder")

    config = {
        "url": url,
        "html": html,
        "folder": folder,
        "title": title,
        "x": x,
        "y": y,
        "width": width,
        "height": height,
        "icon": icon,
        "use_inner_size": use_inner_size,
        "title_bar_style": title_bar_style,
    }

    with tempfile.NamedTemporaryFile("w", delete=False, suffix=".json") as f:
        json.dump(config, f)
        cfg_path = f.name

    try:
        # Launch child process
        proc = subprocess.Popen(
            [sys.executable, "-m", "bench_ui.child", cfg_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
        assert proc.stdout is not None

        # Read lines until one parses as the startup JSON object.
        info = None
        skipped = []
        for raw in iter(proc.stdout.readline, ""):
            line = raw.strip()
            if not line:
                continue
            try:
                candidate = json.loads(line)
            except ValueError:
                skipped.append(line)
                continue
            if isinstance(candidate, dict) and "port" in candidate:
                info = candidate
                break
            skipped.append(line)

        if info is None:
            # Output ended without a startup message: make sure no half-started
            # child is left behind, then report what it printed.
            if proc.poll() is None:
                proc.kill()
            raise RuntimeError(
                "bench_ui.child did not report a listening port; output was: "
                + ("\n".join(skipped) or "<empty>")
            )

        pid = int(info["pid"]) if "pid" in info else proc.pid
        port = int(info["port"])  # required
        _pid_to_port[pid] = port
        return pid
    finally:
        # The config file is only needed for child startup; removal is best-effort.
        try:
            os.unlink(cfg_path)
        except Exception:
            pass
128 |
129 |
def get_element_rect(pid: int, selector: str, *, space: str = "window"):
    """Ask the child window process for the client rect of the element matching ``selector``.

    The child's port is taken from the pid->port cache, or detected and cached
    on first use. The request is retried up to 30 times, 0.1 s apart, until the
    reply carries a non-null ``"rect"``.

    Returns:
        The ``"rect"`` value from the child's reply (a dict like
        {"x": float, "y": float, "width": float, "height": float}).

    Raises:
        RuntimeError: if no rect was obtained after all attempts; the message
            includes the last reply.
    """
    try:
        port = _pid_to_port[pid]
    except KeyError:
        port = _pid_to_port[pid] = _detect_port_for_pid(pid)

    endpoint = f"http://127.0.0.1:{port}/rect"
    reply: Dict[str, Any] = {}
    attempts_left = 30
    while attempts_left > 0:
        attempts_left -= 1
        reply = _post_json(endpoint, {"selector": selector, "space": space}) or {}
        if isinstance(reply, dict):
            rect = reply.get("rect")
            if rect is not None:
                return rect
        # Any other outcome (error reply or missing rect) is retried after a short pause.
        time.sleep(0.1)
    raise RuntimeError(f"Failed to get element rect: {reply}")
156 |
157 |
def execute_javascript(pid: int, javascript: str):
    """Run ``javascript`` inside the child window and return the reported result.

    The child's port is taken from the pid->port cache, or detected and cached
    on first use. The request is retried up to 30 times, 0.1 s apart, until the
    reply contains a ``"result"`` key.

    Raises:
        RuntimeError: if no result was obtained after all attempts; the message
            includes the last reply.
    """
    try:
        port = _pid_to_port[pid]
    except KeyError:
        port = _pid_to_port[pid] = _detect_port_for_pid(pid)

    endpoint = f"http://127.0.0.1:{port}/eval"
    reply: Dict[str, Any] = {}
    for _attempt in range(30):
        reply = _post_json(endpoint, {"javascript": javascript}) or {}
        if isinstance(reply, dict) and "result" in reply:
            return reply["result"]
        # Error replies and replies without a result are both retried after a short pause.
        time.sleep(0.1)
    raise RuntimeError(f"Failed to execute JavaScript: {reply}")
182 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/uiins.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | UI-Ins agent loop implementation for click prediction using litellm.acompletion
3 | Paper: https://arxiv.org/pdf/2510.20286
4 | Code: https://github.com/alibaba/UI-Ins
5 | """
6 |
7 | import asyncio
8 | import base64
9 | import json
10 | import math
11 | import re
12 | import uuid
13 | from io import BytesIO
14 | from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
15 |
16 | import litellm
17 | from PIL import Image
18 |
19 | from ..decorators import register_agent
20 | from ..loops.base import AsyncAgentConfig
21 | from ..types import AgentCapability, AgentResponse, Messages, Tools
22 |
23 | SYSTEM_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\nReturn a json object with a reasoning process in tags, a function name and arguments within XML tags:\n```\n\n...\n\n\n{"name": "grounding", "arguments": }\n\n```\n represents the following item of the action space:\n## Action Space{"action": "click", "coordinate": [x, y]}\nYour task is to accurately locate a UI element based on the instruction. You should first analyze instruction in tags and finally output the function in tags.\n"""
24 |
25 |
def parse_coordinates(raw_string: str) -> tuple[int, int]:
    """Return the first ``[x, y]`` pair of non-negative integers in ``raw_string``.

    Returns ``(-1, -1)`` when the text contains no such pair.
    """
    first_pair = re.search(r"\[(\d+),\s*(\d+)\]", raw_string)
    if first_pair is None:
        return -1, -1
    x_text, y_text = first_pair.groups()
    return int(x_text), int(y_text)
31 |
32 |
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 3136,
    max_pixels: int = 8847360,
) -> Tuple[int, int]:
    """Pick image dimensions that are multiples of ``factor`` within a pixel budget.

    If ``height * width`` lies outside ``[min_pixels, max_pixels]`` both sides
    are first scaled by the same ratio to bring the area to the nearer bound.
    Each side is then rounded *down* to a multiple of ``factor`` and clamped to
    at least ``factor``, so a thin image never collapses to a zero-sized side.

    Args:
        height: Source image height in pixels (must be positive).
        width: Source image width in pixels (must be positive).
        factor: Granularity both output sides must be a multiple of.
        min_pixels: Lower bound on the area before rounding.
        max_pixels: Upper bound on the area before rounding.

    Returns:
        ``(new_height, new_width)``.

    Raises:
        ValueError: if ``height`` or ``width`` is not positive.
    """
    if height <= 0 or width <= 0:
        raise ValueError(f"height and width must be positive, got {height}x{width}")

    total_pixels = height * width

    # Only rescale when the area is outside the allowed range.
    if total_pixels > max_pixels:
        scale = (max_pixels / total_pixels) ** 0.5
    elif total_pixels < min_pixels:
        scale = (min_pixels / total_pixels) ** 0.5
    else:
        scale = None

    if scale is not None:
        height = int(height * scale)
        width = int(width * scale)

    # Round down to a multiple of factor; the clamp applies on every path so
    # a side shorter than factor becomes factor instead of 0.
    new_height = max((height // factor) * factor, factor)
    new_width = max((width // factor) * factor, factor)

    return new_height, new_width
70 |
71 |
@register_agent(models=r".*UI-Ins.*")
class UIInsConfig(AsyncAgentConfig):
    """UI-Ins agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Not supported by this config; it only implements ``predict_click``.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[float, float]]:
        """
        Predict click coordinates using UI-Ins model via litellm.acompletion.

        The screenshot is resized with ``smart_resize`` before being sent, and
        the coordinates parsed from the reply are scaled back to the original
        image size.

        Args:
            model: The UI-Ins model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
            **kwargs: Extra arguments forwarded to ``litellm.acompletion``

        Returns:
            Tuple of (x, y) coordinates in the original image, or None if the
            reply is empty or contains no ``[x, y]`` pair
        """
        # Decode base64 image
        image_data = base64.b64decode(image_b64)
        image = Image.open(BytesIO(image_data))
        width, height = image.width, image.height

        # Smart resize the image (similar to qwen_vl_utils)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=28,  # Default factor for Qwen models
            min_pixels=3136,
            max_pixels=4096 * 2160,
        )
        resized_image = image.resize((resized_width, resized_height))
        # Factors that map a point in the resized image back to the original.
        scale_x, scale_y = width / resized_width, height / resized_height

        # Convert resized image back to base64
        buffered = BytesIO()
        resized_image.save(buffered, format="PNG")
        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()

        # Prepare system and user messages
        system_message = {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful assistant."},
                {"type": "text", "text": SYSTEM_PROMPT},
            ],
        }

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
                {"type": "text", "text": instruction},
            ],
        }

        # Prepare API call kwargs; caller-supplied kwargs override the defaults.
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
            **kwargs,
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Extract response text
        output_text = response.choices[0].message.content  # type: ignore
        if not output_text:
            return None

        # parse_coordinates signals "nothing found" with (-1, -1); report that
        # as None instead of rescaling the sentinel into a bogus click point.
        pred_x, pred_y = parse_coordinates(output_text)  # type: ignore
        if pred_x < 0 or pred_y < 0:
            return None

        return (math.floor(pred_x * scale_x), math.floor(pred_y * scale_y))

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]
176 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/proxy/examples.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Example usage of the proxy server and client requests.
3 | """
4 |
5 | import dotenv
6 |
7 | dotenv.load_dotenv()
8 |
9 | import asyncio
10 | import json
11 | import os
12 | from typing import Any, Dict
13 |
14 | import aiohttp
15 |
16 |
async def test_http_endpoint():
    """Send the sample requests to the proxy's HTTP /responses endpoint and print each reply.

    Requires the ANTHROPIC_API_KEY and CUA_API_KEY environment variables.
    Errors while sending or decoding a request are printed, not raised.
    """
    model_id = "anthropic/claude-sonnet-4-5-20250929"

    anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
    assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"

    def with_env(body: Dict[str, Any]) -> Dict[str, Any]:
        """Append the per-request environment carrying the Anthropic key."""
        return {**body, "env": {"ANTHROPIC_API_KEY": anthropic_api_key}}

    # Example 1: Simple text request
    simple_request = with_env(
        {
            "model": model_id,
            "input": "Tell me a three sentence bedtime story about a unicorn.",
        }
    )

    # Example 2: Multi-modal request with image (not sent by default, see below)
    multimodal_request = with_env(
        {
            "model": model_id,
            "input": [
                {
                    "role": "user",
                    "content": [
                        {"type": "input_text", "text": "what is in this image?"},
                        {
                            "type": "input_image",
                            "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
                        },
                    ],
                }
            ],
        }
    )

    # Example 3: Request with custom agent and computer kwargs
    custom_request = with_env(
        {
            "model": model_id,
            "input": "Take a screenshot and tell me what you see",
        }
    )

    payloads = [
        simple_request,
        # multimodal_request,
        custom_request,
    ]

    # Test requests
    base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
    # base_url = "http://localhost:8000"
    api_key = os.getenv("CUA_API_KEY")
    assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"

    endpoint = f"{base_url}/responses"
    headers = {"Content-Type": "application/json", "X-API-Key": api_key}

    async with aiohttp.ClientSession() as session:
        for index, payload in enumerate(payloads, start=1):
            print(f"\n--- Test {index} ---")
            print(f"Request: {json.dumps(payload, indent=2)}")

            try:
                print(f"Sending request to {endpoint}")
                async with session.post(endpoint, json=payload, headers=headers) as response:
                    body = await response.json()
                    print(f"Status: {response.status}")
                    print(f"Response: {json.dumps(body, indent=2)}")

            except Exception as e:
                print(f"Error: {e}")
86 |
87 |
def curl_examples():
    """Print three sample curl commands for the proxy's /responses endpoint."""
    examples = (
        (
            "1. Simple text request:",
            """curl http://localhost:8000/responses \\
-H "Content-Type: application/json" \\
-d '{
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": "Tell me a three sentence bedtime story about a unicorn."
}'""",
        ),
        (
            "\n2. Multi-modal request with image:",
            """curl http://localhost:8000/responses \\
-H "Content-Type: application/json" \\
-d '{
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": [
{
"role": "user",
"content": [
{"type": "input_text", "text": "what is in this image?"},
{
"type": "input_image",
"image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
}
]
}
]
}'""",
        ),
        (
            "\n3. Request with custom configuration:",
            """curl http://localhost:8000/responses \\
-H "Content-Type: application/json" \\
-d '{
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": "Take a screenshot and tell me what you see",
"agent_kwargs": {
"save_trajectory": true,
"verbosity": 20
},
"computer_kwargs": {
"os_type": "linux",
"provider_type": "cloud"
}
}'""",
        ),
    )

    print("=== CURL Examples ===\n")
    # Each entry is printed as its heading followed by the command text.
    for heading, command in examples:
        print(heading)
        print(command)
141 |
142 |
async def test_p2p_client():
    """Example P2P client using peerjs-python.

    Starts a peer, connects to the "computer-agent-proxy" peer, sends one
    request when the connection opens and destroys the peer after the first
    response. Missing P2P packages and any other failure are printed rather
    than raised.
    """
    try:
        from aiortc import RTCConfiguration, RTCIceServer
        from peerjs import ConnectionEventType, Peer, PeerOptions

        # Set up client peer
        ice_servers = [RTCIceServer(urls="stun:stun.l.google.com:19302")]
        peer_options = PeerOptions(
            host="0.peerjs.com",
            port=443,
            secure=True,
            config=RTCConfiguration(iceServers=ice_servers),
        )
        peer = Peer(id="test-client", peer_options=peer_options)
        await peer.start()

        # Connect to proxy server
        link = peer.connect("computer-agent-proxy")

        async def connection_open():
            print("Connected to proxy server")

            # Send a test request
            payload = {
                "model": "anthropic/claude-sonnet-4-5-20250929",
                "input": "Hello from P2P client!",
            }
            await link.send(json.dumps(payload))

        async def connection_data(data):
            print(f"Received response: {data}")
            await peer.destroy()

        # Register the handlers in the same order: open first, then data.
        link.on(ConnectionEventType.Open)(connection_open)
        link.on(ConnectionEventType.Data)(connection_data)

        # Wait for connection
        await asyncio.sleep(10)

    except ImportError:
        print("P2P dependencies not available. Install peerjs-python for P2P testing.")
    except Exception as e:
        print(f"P2P test error: {e}")
186 |
187 |
if __name__ == "__main__":
    import sys

    # First CLI argument selects the example: "curl", "p2p", or (default) the HTTP test.
    mode = sys.argv[1] if len(sys.argv) > 1 else None
    if mode == "curl":
        curl_examples()
    elif mode == "p2p":
        asyncio.run(test_p2p_client())
    else:
        asyncio.run(test_http_endpoint())
197 |
```
--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/diorama/safezone.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | UI Safezone Helper - A utility to get accurate bounds for macOS UI elements
4 |
5 | This module provides helper functions to get accurate bounds for macOS UI elements
6 | like the menubar and dock, which are needed for proper screenshot composition.
7 | """
8 |
9 | import sys
10 | import time
11 | from typing import Any, Dict, Optional, Tuple
12 |
13 | # Import Objective-C bridge libraries
14 | try:
15 | import AppKit
16 | import Foundation
17 | from AppKit import NSRunningApplication, NSWorkspace
18 | from ApplicationServices import (
19 | AXUIElementCopyAttributeValue,
20 | AXUIElementCopyAttributeValues,
21 | AXUIElementCreateApplication,
22 | AXUIElementCreateSystemWide,
23 | AXUIElementGetTypeID,
24 | AXValueGetType,
25 | AXValueGetValue,
26 | kAXChildrenAttribute,
27 | kAXErrorSuccess,
28 | kAXMenuBarAttribute,
29 | kAXPositionAttribute,
30 | kAXRoleAttribute,
31 | kAXSizeAttribute,
32 | kAXTitleAttribute,
33 | kAXValueCGPointType,
34 | kAXValueCGSizeType,
35 | )
36 | except ImportError:
37 | print("Error: This script requires PyObjC to be installed.")
38 | print("Please install it with: pip install pyobjc")
39 | sys.exit(1)
40 |
41 | # Constants for accessibility API
42 | kAXErrorSuccess = 0
43 | kAXRoleAttribute = "AXRole"
44 | kAXSubroleAttribute = "AXSubrole"
45 | kAXTitleAttribute = "AXTitle"
46 | kAXPositionAttribute = "AXPosition"
47 | kAXSizeAttribute = "AXSize"
48 | kAXChildrenAttribute = "AXChildren"
49 | kAXMenuBarAttribute = "AXMenuBar"
50 |
51 |
def element_attribute(element, attribute):
    """Read ``attribute`` from an accessibility element, or return None on failure.

    The children attribute is first fetched with the multi-value call (up to
    999 entries) and an NSArray result is converted to a Python list. If that
    call fails, the single-value call is tried like for any other attribute.
    """
    if attribute == kAXChildrenAttribute:
        status, items = AXUIElementCopyAttributeValues(element, attribute, 0, 999, None)
        if status == kAXErrorSuccess:
            return list(items) if isinstance(items, Foundation.NSArray) else items

    status, result = AXUIElementCopyAttributeValue(element, attribute, None)
    return result if status == kAXErrorSuccess else None
65 |
66 |
def element_value(element, type):
    """Unpack an AXValue via AXValueGetValue; return None when the call does not report success."""
    ok, unpacked = AXValueGetValue(element, type, None)
    # The original equality comparison is kept on purpose (not an identity check).
    return unpacked if ok == True else None  # noqa: E712
73 |
74 |
def get_element_bounds(element):
    """Return ``{"x", "y", "width", "height"}`` for an accessibility element.

    Any component whose attribute cannot be read or unpacked stays 0.
    """
    bounds = {"x": 0, "y": 0, "width": 0, "height": 0}

    # Position: attribute lookup first, then unpack the value as a point.
    raw_position = element_attribute(element, kAXPositionAttribute)
    point = element_value(raw_position, kAXValueCGPointType) if raw_position else None
    if point:
        bounds["x"], bounds["y"] = point.x, point.y

    # Size: same two-step read, unpacked as a size.
    raw_size = element_attribute(element, kAXSizeAttribute)
    size = element_value(raw_size, kAXValueCGSizeType) if raw_size else None
    if size:
        bounds["width"], bounds["height"] = size.width, size.height

    return bounds
96 |
97 |
def find_dock_process():
    """Return the process identifier of the running Dock application, or None if absent."""
    apps = NSWorkspace.sharedWorkspace().runningApplications()
    dock = next(
        (
            candidate
            for candidate in apps
            if candidate.localizedName() == "Dock"
            and candidate.bundleIdentifier() == "com.apple.dock"
        ),
        None,
    )
    return dock.processIdentifier() if dock is not None else None
105 |
106 |
def get_menubar_bounds():
    """Get the bounds of the macOS menubar

    The menubar is looked up on the system-wide accessibility element first and,
    failing that, on the frontmost application.

    Returns:
        Dictionary with x, y, width, height of the menubar; a fixed
        1800x24 fallback at the origin when the menubar cannot be found
    """
    menubar = element_attribute(AXUIElementCreateSystemWide(), kAXMenuBarAttribute)

    if menubar is None:
        # Fall back to the menubar of whichever application is frontmost.
        front_app = NSWorkspace.sharedWorkspace().frontmostApplication()
        if front_app:
            front_element = AXUIElementCreateApplication(front_app.processIdentifier())
            menubar = element_attribute(front_element, kAXMenuBarAttribute)

    if menubar is not None:
        return get_element_bounds(menubar)

    print("Error: Could not get menubar")
    return {"x": 0, "y": 0, "width": 1800, "height": 24}
133 |
134 |
def get_dock_bounds():
    """Get the bounds of the macOS Dock

    Returns:
        Dictionary with x, y, width, height of the Dock's list element; all
        zeros (after printing an error) when any lookup step fails
    """

    def _give_up(message):
        """Print the error and return the empty-bounds fallback."""
        print(message)
        return {"x": 0, "y": 0, "width": 0, "height": 0}

    dock_pid = find_dock_process()
    if dock_pid is None:
        return _give_up("Error: Could not find Dock process")

    # Create an accessibility element for the Dock
    dock_element = AXUIElementCreateApplication(dock_pid)
    if dock_element is None:
        return _give_up(f"Error: Could not create accessibility element for Dock (PID {dock_pid})")

    children = element_attribute(dock_element, kAXChildrenAttribute)
    if not children or len(children) == 0:
        return _give_up("Error: Could not get Dock children")

    # The dock's icons live in the first child whose role is AXList.
    dock_list = next(
        (child for child in children if element_attribute(child, kAXRoleAttribute) == "AXList"),
        None,
    )
    if dock_list is None:
        return _give_up("Error: Could not find Dock list")

    return get_element_bounds(dock_list)
173 |
174 |
def get_ui_element_bounds():
    """Get the bounds of important UI elements like menubar and dock

    Returns:
        Dictionary with "menubar" and "dock" keys, each holding a bounds dict
    """
    # Dict literals evaluate in order, so the menubar is still queried before the dock.
    return {"menubar": get_menubar_bounds(), "dock": get_dock_bounds()}
185 |
186 |
if __name__ == "__main__":
    # Example usage: print the bounds of each tracked UI element.
    ui_bounds = get_ui_element_bounds()
    for label, key in (("Menubar bounds:", "menubar"), ("Dock bounds:", "dock")):
        print(label, ui_bounds[key])
192 |
```
--------------------------------------------------------------------------------
/docs/content/docs/macos-vm-cli-playbook/lume/cli-reference.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Lume CLI Reference
3 | description: Command Line Interface reference for Lume
4 | ---
5 |
6 | import { Callout } from 'fumadocs-ui/components/callout';
7 |
8 | Once installed, you can start using Lume with these common workflows:
9 |
10 | ### Run a Prebuilt VM
11 |
12 | ```bash
13 | # Run a macOS Sequoia VM
14 | lume run macos-sequoia-vanilla:latest
15 |
16 | # Run an Ubuntu VM
17 | lume run ubuntu-noble-vanilla:latest
18 | ```
19 |
20 | <Callout>
21 | We provide [prebuilt VM images](../lume/prebuilt-images) in our [ghcr
22 | registry](https://github.com/orgs/trycua/packages).
23 | </Callout>
24 |
25 | ### Create a Custom VM
26 |
27 | ```bash
28 | # Create a new macOS VM
29 | lume create my-macos-vm --cpu 4 --memory 8GB --disk-size 50GB
30 |
31 | # Create a Linux VM
32 | lume create my-linux-vm --os linux --cpu 2 --memory 4GB
33 | ```
34 |
35 | <Callout title="Disk Space">
36 | The actual disk space used by sparse images will be much lower than the logical size listed. You can resize VM disks after creation using `lume set <name> --disk-size <size>`.
37 | </Callout>
38 |
39 | ## VM Management
40 |
41 | ### lume create <name>
42 | Create a new macOS or Linux virtual machine.
43 |
44 | **Options:**
45 |
46 | - `--os <os>` - Operating system to install (macOS or linux, default: macOS)
47 | - `--cpu <cores>` - Number of CPU cores (default: 4)
48 | - `--memory <size>` - Memory size, e.g., 8GB (default: 4GB)
49 | - `--disk-size <size>` - Disk size, e.g., 50GB (default: 40GB)
50 | - `--display <res>` - Display resolution (default: 1024x768)
51 | - `--ipsw <path>` - Path to IPSW file or 'latest' for macOS VMs
52 | - `--storage <name>` - VM storage location to use
53 |
54 | **Examples:**
55 |
56 | ```bash
57 | # Create macOS VM with custom specs
58 | lume create my-mac --cpu 6 --memory 16GB --disk-size 100GB
59 |
60 | # Create Linux VM
61 | lume create my-ubuntu --os linux --cpu 2 --memory 8GB
62 |
63 | # Create macOS VM with latest IPSW
64 | lume create my-sequoia --ipsw latest
65 | ```
66 |
67 | ### lume run <name>
68 | Start and run a virtual machine.
69 |
70 | **Options:**
71 |
72 | - `--no-display` - Do not start the VNC client app
73 | - `--shared-dir <dir>` - Share directory with VM (format: path[:ro|rw])
74 | - `--mount <path>` - For Linux VMs only, attach a read-only disk image
75 | - `--registry <url>` - Container registry URL (default: ghcr.io)
76 | - `--organization <org>` - Organization to pull from (default: trycua)
77 | - `--vnc-port <port>` - Port to use for the VNC server (default: 0 for auto-assign)
78 | - `--recovery-mode <boolean>` - For macOS VMs only, start VM in recovery mode (default: false)
79 | - `--storage <name>` - VM storage location to use
80 |
81 | **Examples:**
82 |
83 | ```bash
84 | # Run VM with shared directory
85 | lume run my-vm --shared-dir /path/to/share:rw
86 |
87 | # Run VM without display (headless)
88 | lume run my-vm --no-display
89 |
90 | # Run macOS VM in recovery mode
91 | lume run my-mac --recovery-mode true
92 | ```
93 |
94 | ### lume stop <name>
95 | Stop a running virtual machine.
96 |
97 | **Options:**
98 |
99 | - `--storage <name>` - VM storage location to use
100 |
101 | ### lume delete <name>
102 |
103 | Delete a virtual machine and its associated files.
104 |
105 | **Options:**
106 |
107 | - `--force` - Force deletion without confirmation
108 | - `--storage <name>` - VM storage location to use
109 |
110 | ### lume clone <name> <new-name>
111 |
112 | Create a copy of an existing virtual machine.
113 |
114 | **Options:**
115 |
116 | - `--source-storage <name>` - Source VM storage location
117 | - `--dest-storage <name>` - Destination VM storage location
118 |
119 | ## VM Information and Configuration
120 |
121 | ### lume ls
122 |
123 | List all virtual machines and their status.
124 |
125 | ### lume get <name>
126 |
127 | Get detailed information about a specific virtual machine.
128 |
129 | **Options:**
130 |
131 | - `-f, --format <format>` - Output format (json|text)
132 | - `--storage <name>` - VM storage location to use
133 |
134 | ### lume set <name>
135 |
136 | Modify virtual machine configuration.
137 |
138 | **Options:**
139 |
140 | - `--cpu <cores>` - New number of CPU cores (e.g., 4)
141 | - `--memory <size>` - New memory size (e.g., 8192MB or 8GB)
142 | - `--disk-size <size>` - New disk size (e.g., 40960MB or 40GB)
143 | - `--display <res>` - New display resolution in format WIDTHxHEIGHT (e.g., 1024x768)
144 | - `--storage <name>` - VM storage location to use
145 |
146 | **Examples:**
147 |
148 | ```bash
149 | # Increase VM memory
150 | lume set my-vm --memory 16GB
151 |
152 | # Change display resolution
153 | lume set my-vm --display 1920x1080
154 |
155 | # Add more CPU cores
156 | lume set my-vm --cpu 8
157 | ```
158 |
159 | ## Image Management
160 |
161 | ### lume images
162 |
163 | List available macOS images in local cache.
164 |
165 | ### lume pull <image>
166 |
167 | Download a VM image from a container registry.
168 |
169 | **Options:**
170 |
171 | - `--registry <url>` - Container registry URL (default: ghcr.io)
172 | - `--organization <org>` - Organization to pull from (default: trycua)
173 | - `--storage <name>` - VM storage location to use
174 |
175 | ### lume push <name> <image:tag>
176 |
177 | Upload a VM image to a container registry.
178 |
179 | **Options:**
180 |
181 | - `--additional-tags <tags...>` - Additional tags to push the same image to
182 | - `--registry <url>` - Container registry URL (default: ghcr.io)
183 | - `--organization <org>` - Organization/user to push to (default: trycua)
184 | - `--storage <name>` - VM storage location to use
185 | - `--chunk-size-mb <size>` - Chunk size for disk image upload in MB (default: 512)
186 | - `--verbose` - Enable verbose logging
187 | - `--dry-run` - Prepare files and show plan without uploading
188 | - `--reassemble` - Verify integrity by reassembling chunks (requires --dry-run)
189 |
190 | ### lume ipsw
191 |
192 | Get the latest macOS restore image URL.
193 |
194 | ### lume prune
195 |
196 | Remove cached images to free up disk space.
197 |
198 | ## Configuration
199 |
200 | ### lume config
201 |
202 | Manage Lume configuration settings.
203 |
204 | **Subcommands:**
205 |
206 | ##### Storage Management
207 |
208 | - `lume config storage add <name> <path>` - Add a new VM storage location
209 | - `lume config storage remove <name>` - Remove a VM storage location
210 | - `lume config storage list` - List all VM storage locations
211 | - `lume config storage default <name>` - Set the default VM storage location
212 |
213 | ##### Cache Management
214 |
215 | - `lume config cache get` - Get current cache directory
216 | - `lume config cache set <path>` - Set cache directory
217 |
218 | ##### Image Caching
219 |
220 | - `lume config caching get` - Show current caching status
221 | - `lume config caching set <boolean>` - Enable or disable image caching
222 |
223 | ## API Server
224 |
225 | ### lume serve
226 |
227 | Start the Lume API server for programmatic access.
228 |
229 | **Options:**
230 |
231 | - `--port <port>` - Port to listen on (default: 7777)
232 |
233 | ## Global Options
234 |
235 | These options are available for all commands:
236 |
237 | - `--help` - Show help information
238 | - `--version` - Show version number
239 |
```
--------------------------------------------------------------------------------
/libs/lumier/src/lib/utils.sh:
--------------------------------------------------------------------------------
```bash
1 | #!/usr/bin/env bash
2 |
# Wait until a password-authenticated SSH login to a host succeeds.
#
# Arguments:
#   $1 - host IP or name to connect to
#   $2 - SSH user
#   $3 - SSH password (handed to sshpass)
#   $4 - seconds to sleep between attempts (default: 5)
#   $5 - maximum number of attempts, 0 for unlimited (default: 20)
# Outputs:
#   The final success/failure line always; waiting and retry lines only when
#   LUMIER_DEBUG=1.
# Returns:
#   0 once SSH answers, 1 when the retry limit is reached.
wait_for_ssh() {
    local host_ip=$1
    local user=$2
    local password=$3
    local retry_interval=${4:-5}
    local max_retries=${5:-20}
    local retry_count=0

    # Only show waiting message in debug mode
    if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
        echo "Waiting for SSH to become available on $host_ip..."
    fi

    while true; do
        # Probe inside `if` so a failed attempt is an ordinary condition and cannot
        # abort a caller that runs under `set -e`. -q plus the stderr redirect keep
        # the probe completely silent.
        if sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host_ip" "exit" 2>/dev/null; then
            echo "SSH is ready on $host_ip!"
            return 0
        fi

        # Plain assignment instead of ((retry_count++)): the arithmetic command
        # returns status 1 when the pre-increment value is 0, which also trips `set -e`.
        retry_count=$((retry_count + 1))

        # Give up once the retry budget is spent (0 means retry forever)
        if [ "$max_retries" -ne 0 ] && [ "$retry_count" -ge "$max_retries" ]; then
            echo "Maximum retries reached. SSH is not available."
            return 1
        fi

        # Only show retry messages in debug mode
        if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
            echo "SSH not ready. Retrying in $retry_interval seconds... (Attempt $retry_count)"
        fi
        sleep "$retry_interval"
    done
}
44 |
# Execute a local script on a remote host over SSH (password auth via sshpass).
#
# A preamble is prepended to the script before it is streamed to `bash -s` on
# the remote side: a shebang line, `export VNC_PASSWORD=...`,
# `export SHARED_FOLDER_PATH=...` (only when a data folder is given) and
# `export VNC_DEBUG=...`. The VNC password and data folder are also passed as
# positional arguments to the remote bash.
#
# Arguments:
#   $1 - remote host
#   $2 - SSH user
#   $3 - SSH password
#   $4 - path of the local script to run remotely
#   $5 - VNC password exported to the remote script
#   $6 - data folder (optional); when set, SHARED_FOLDER_PATH is exported
# Returns:
#   0 on success, 1 on missing arguments or when the SSH command fails.
execute_remote_script() {
    local host="$1"
    local user="$2"
    local password="$3"
    local script_path="$4"
    local vnc_password="$5"
    local data_folder="$6"
    # A real newline: "\n" inside double quotes stays a literal backslash-n, which
    # would glue the whole preamble onto the shebang comment line.
    local nl=$'\n'
    local ssh_status=0

    # Check if all required arguments are provided
    if [ -z "$host" ] || [ -z "$user" ] || [ -z "$password" ] || [ -z "$script_path" ] || [ -z "$vnc_password" ]; then
        echo "Usage: execute_remote_script <host> <user> <password> <script_path> <vnc_password> [data_folder]"
        return 1
    fi

    # Only show VNC info in debug mode
    # NOTE(review): this prints the VNC password in clear text to the log.
    if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
        echo "VNC password exported to VM: $vnc_password"
    fi

    # Set the shared folder path for the VM
    if [ -n "$data_folder" ]; then
        # VM always sees shared folders at this path, regardless of container path
        shared_folder_path="/Volumes/My Shared Files"

        # Only show path in debug mode
        if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
            echo "Data folder path in VM: $shared_folder_path"
        fi
    else
        shared_folder_path=""
    fi

    # Build the preamble, one statement per line
    script_content="#!/usr/bin/env bash${nl}"
    # Always export VNC_PASSWORD
    script_content+="export VNC_PASSWORD='$vnc_password'${nl}"
    # Export SHARED_FOLDER_PATH only if we have a data folder path
    if [ -n "$shared_folder_path" ]; then
        script_content+="export SHARED_FOLDER_PATH='$shared_folder_path'${nl}"
    fi
    # Pass debug setting to the VM
    script_content+="export VNC_DEBUG='${LUMIER_DEBUG:-0}'${nl}"

    # Add debug messages only if debug mode is enabled
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        script_content+="echo \"[DEBUG] Starting on-logon script execution...\"${nl}"
    fi

    # Add the original script content
    script_content+="$(<"$script_path")"

    # Add debug messages only if debug mode is enabled
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        script_content+="${nl}echo \"[DEBUG] Finished executing on-logon script.\"${nl}"
    fi

    # Print debug info only when debug mode is enabled
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        echo "[DEBUG] Executing remote script with content length: $(echo -n "$script_content" | wc -c) bytes"
        echo "[DEBUG] Script path: $script_path"
    fi

    # Stream the script over stdin with a here-string. The exit status is recorded
    # right at the call site, because any later command would overwrite $?.
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        # Debug mode: surface the remote stderr together with stdout
        echo "[DEBUG] Connecting to $user@$host to execute script..."
        sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host" "bash -s -- '$vnc_password' '$data_folder'" 2>&1 <<<"$script_content" || ssh_status=$?
    else
        # Otherwise run quietly
        sshpass -p "$password" ssh -q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR "$user@$host" "bash -s -- '$vnc_password' '$data_folder'" 2>/dev/null <<<"$script_content" || ssh_status=$?
    fi

    # Print completion message only in debug mode
    if [[ "${LUMIER_DEBUG:-0}" == "1" ]]; then
        echo "[DEBUG] Script execution completed."
    fi

    # Report a failed remote execution to the caller
    if [ "$ssh_status" -ne 0 ]; then
        echo "Failed to execute script on remote host $host."
        return 1
    fi
}
133 |
# Print the value of a field found in a JSON text.
#
# Arguments:
#   $1 - field name (interpolated into the jq filter as `.<name>`)
#   $2 - JSON text
# Outputs:
#   The field value on stdout, or an empty line when nothing is found.
# Returns:
#   Always 0.
#
# jq is tried first when installed. If jq is missing, fails, or yields nothing
# (its `// empty` filter also discards null and false), a grep/sed scan of the
# text is used instead.
extract_json_field() {
    local field_name=$1
    local input=$2
    local result=""
    local flat=""

    # First attempt with jq if available (most reliable JSON parsing).
    # printf is used instead of echo so the data can never be taken as echo options.
    if command -v jq &> /dev/null; then
        result=$(printf '%s\n' "$input" | jq -r ".$field_name // empty" 2>/dev/null)
        if [[ -n "$result" ]]; then
            printf '%s\n' "$result"
            return 0
        fi
    fi

    # Fallback: scan the text with newlines removed.
    # [[:space:]] is the POSIX spelling; `\s` is a GNU extension not every grep/sed accepts.
    flat=$(printf '%s' "$input" | tr -d '\n')

    # First try for quoted string values
    result=$(printf '%s\n' "$flat" | grep -o "\"$field_name\"[[:space:]]*:[[:space:]]*\"[^\"]*\"" | sed -E 's/.*":[[:space:]]*"(.*)"$/\1/')
    if [[ -n "$result" ]]; then
        printf '%s\n' "$result"
        return 0
    fi

    # Try for non-quoted values (numbers, true, false, null)
    result=$(printf '%s\n' "$flat" | grep -o "\"$field_name\"[[:space:]]*:[[:space:]]*[^,}]*" | sed -E 's/.*":[[:space:]]*(.*)$/\1/')
    if [[ -n "$result" ]]; then
        printf '%s\n' "$result"
        return 0
    fi

    # Return empty string if field not found
    echo ""
}
167 |
# Read a JSON file and print the value of one field from it.
#   $1 - field name
#   $2 - path of the JSON file
# Delegates the lookup to extract_json_field.
extract_json_field_from_file() {
    local field_name=$1 json_file=$2
    local json_text
    json_text=$(<"$json_file")
    extract_json_field "$field_name" "$json_text"
}
175 |
# Print the value of one field from a JSON string.
#   $1 - field name
#   $2 - JSON text
# Thin alias that forwards both arguments to extract_json_field.
extract_json_field_from_text() {
    local field_name=$1 json_text=$2
    extract_json_field "$field_name" "$json_text"
}
181 |
```
--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/ss-pro.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | ScreenSpot-Pro Benchmark Script
4 |
5 | Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
6 | Supports both ComputerAgent model strings and custom model classes.
7 | """
8 |
9 | import argparse
10 | import asyncio
11 | import random
12 | import statistics
13 | import time
14 | from typing import Optional
15 |
16 | from datasets import load_dataset
17 | from tqdm import tqdm
18 | from utils import (
19 | ModelWrapper,
20 | get_available_models,
21 | get_gpu_memory,
22 | is_click_in_bbox,
23 | save_results_to_markdown,
24 | save_visualizations,
25 | )
26 |
27 |
async def evaluate_model(
    model_wrapper: ModelWrapper, dataset, max_samples: Optional[int] = None
) -> dict:
    """
    Evaluate a model on the ScreenSpot-Pro dataset.

    A prediction that raises is recorded as a failed sample (counted in
    ``failed_predictions`` and excluded from the timing statistics) instead of
    aborting the whole run. The model is unloaded even when evaluation stops early.

    Args:
        model_wrapper: ModelWrapper instance
        dataset: ScreenSpot-Pro dataset (list of samples)
        max_samples: Maximum number of samples to evaluate (None for all)

    Returns:
        Dictionary with evaluation results: sample counts, accuracy and failure
        rate, prediction-time statistics, VRAM statistics, and the per-sample
        ``results`` list.
    """
    print(f"\nEvaluating model: {model_wrapper.model_name}")

    # Load model
    await model_wrapper.load_model()

    total_samples = len(dataset)
    if max_samples is not None:
        # Clamp at zero so a negative limit cannot produce a negative sample count.
        total_samples = max(0, min(max_samples, total_samples))

    correct_predictions = 0
    error_predictions = 0
    results = []

    try:
        for i in tqdm(range(total_samples), desc=f"Evaluating {model_wrapper.model_name}"):
            sample = dataset[i]

            # Extract sample data
            image = sample["image"]
            instruction = sample["instruction"]
            bbox = sample["bbox"]  # [x1, y1, x2, y2]
            sample_id = sample["img_filename"]

            # Predict click coordinates with timing
            start_time = time.time()
            try:
                click_coords = await model_wrapper.predict_click(image, instruction)
            except Exception as exc:
                # Keep the benchmark going; this sample counts as a failure.
                print(f"\nError predicting sample {sample_id}: {exc}")
                error_predictions += 1
                results.append(
                    {
                        "id": sample_id,
                        "instruction": instruction,
                        "bbox": bbox,
                        "predicted_coords": None,
                        "is_correct": False,
                        "failed": True,
                        "prediction_time": time.time() - start_time,
                    }
                )
                continue
            prediction_time = time.time() - start_time

            # Check if prediction is correct
            is_correct = is_click_in_bbox(click_coords, bbox)

            if is_correct:
                correct_predictions += 1

            results.append(
                {
                    "id": sample_id,
                    "instruction": instruction,
                    "bbox": bbox,
                    "predicted_coords": click_coords,
                    "is_correct": is_correct,
                    "failed": False,
                    "prediction_time": prediction_time,
                }
            )
    finally:
        # Unload model, also when the loop is interrupted
        await model_wrapper.unload_model()

    # Calculate metrics
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0
    error_rate = error_predictions / total_samples if total_samples > 0 else 0.0

    # Calculate timing statistics over successful predictions only
    successful_times = [r["prediction_time"] for r in results if not r["failed"]]
    avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
    median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
    min_prediction_time = min(successful_times) if successful_times else 0.0
    max_prediction_time = max(successful_times) if successful_times else 0.0

    # Get VRAM statistics
    vram_stats = model_wrapper.get_vram_stats()

    return {
        "model_name": model_wrapper.model_name,
        "total_samples": total_samples,
        "correct_predictions": correct_predictions,
        "failed_predictions": error_predictions,
        "accuracy": accuracy,
        "failure_rate": error_rate,
        "avg_prediction_time": avg_prediction_time,
        "median_prediction_time": median_prediction_time,
        "min_prediction_time": min_prediction_time,
        "max_prediction_time": max_prediction_time,
        "vram_max_mb": vram_stats["max_mb"],
        "vram_avg_mb": vram_stats["avg_mb"],
        "results": results,
    }
119 |
120 |
async def main():
    """
    Entry point of the benchmark.

    Parses ``--samples`` and ``--seed``, loads and shuffles the ScreenSpot-Pro
    "train" split, evaluates every model returned by ``get_available_models()``,
    prints a per-model summary, and writes the markdown report and visualizations.
    """
    cli = argparse.ArgumentParser(description="ScreenSpot-Pro Benchmark Script")
    cli.add_argument(
        "--samples", type=int, default=300, help="Number of samples to evaluate (default: 300)"
    )
    cli.add_argument(
        "--seed", type=int, default=42, help="Random seed for shuffling (default: 42)"
    )
    options = cli.parse_args()

    # Seed the RNG that drives the shuffle below
    random.seed(options.seed)

    print("Loading ScreenSpot-Pro dataset...")
    ds = load_dataset("lmms-lab/ScreenSpot-Pro")
    # Materialise as a list so it can be shuffled in place and indexed
    dataset_list = list(ds["train"])  # type: ignore
    print(f"Dataset loaded: {len(dataset_list)} samples")

    random.shuffle(dataset_list)
    print(f"Dataset shuffled with seed {options.seed}")

    models = get_available_models()
    sample_limit = options.samples

    all_results = []
    for model in models:
        summary = await evaluate_model(ModelWrapper(model), dataset_list, sample_limit)
        all_results.append(summary)

        # Per-model summary
        print(f"\n{summary['model_name']} Results:")
        print(f" Accuracy: {summary['accuracy']*100:.2f}%")
        print(f" Correct: {summary['correct_predictions']}/{summary['total_samples']}")
        print(f" Errors: {summary['failed_predictions']}")
        print(f" Error Rate: {summary['failure_rate']*100:.2f}%")
        print(f" Avg Time: {summary['avg_prediction_time']:.2f}s")
        print(f" Median Time: {summary['median_prediction_time']:.2f}s")
        print(
            f" Time Range: {summary['min_prediction_time']:.2f}s - {summary['max_prediction_time']:.2f}s"
        )
        print(f" VRAM Max: {summary['vram_max_mb']:.1f}MB")
        print(f" VRAM Avg: {summary['vram_avg_mb']:.1f}MB")

        # GPU memory line is printed only when a positive first reading is available
        gpu_memory = get_gpu_memory()
        if gpu_memory and gpu_memory[0] > 0:
            print(f" GPU Free Memory: {gpu_memory[0]:.1f}MB")

    if not all_results:
        print("\nNo successful evaluations completed.")
        return

    save_results_to_markdown(all_results)
    save_visualizations(all_results, dataset_list)
    print("\nBenchmark completed successfully!")
191 |
# Run the async benchmark entry point when the file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
194 |
```
--------------------------------------------------------------------------------
/libs/lume/src/FileSystem/VMDirectory.swift:
--------------------------------------------------------------------------------
```swift
1 | import Foundation
2 |
3 | // MARK: - VMDirectory
4 |
5 | /// Manages a virtual machine's directory structure and files
6 | /// Responsible for:
7 | /// - Managing VM configuration files
8 | /// - Handling disk operations
9 | /// - Managing VM state and locking
10 | /// - Providing access to VM-related paths
struct VMDirectory: Sendable {
    // MARK: - Constants

    /// Names of the files kept inside a VM directory.
    private enum FileNames {
        static let nvram = "nvram.bin"
        static let disk = "disk.img"
        static let config = "config.json"
        static let sessions = "sessions.json"
    }

    // MARK: - Properties

    /// Base directory of the VM.
    let dir: Path
    /// Location of the NVRAM file inside `dir`.
    let nvramPath: Path
    /// Location of the disk image inside `dir`.
    let diskPath: Path
    /// Location of the configuration file inside `dir`.
    let configPath: Path
    /// Location of the sessions file inside `dir`.
    let sessionsPath: Path

    /// The name of the VM directory
    var name: String {
        return dir.name
    }

    // MARK: - Initialization

    /// Creates a new VMDirectory instance and derives every file path from `dir`.
    /// - Parameters:
    ///   - dir: The base directory path for the VM
    init(_ dir: Path) {
        self.dir = dir
        nvramPath = dir.file(FileNames.nvram)
        diskPath = dir.file(FileNames.disk)
        configPath = dir.file(FileNames.config)
        sessionsPath = dir.file(FileNames.sessions)
    }
}
45 |
46 | // MARK: - VM State Management
47 |
extension VMDirectory {
    /// Checks if the VM directory is fully initialized, i.e. the config file,
    /// the disk image, and the NVRAM file all exist.
    func initialized() -> Bool {
        // Every path is probed (config, disk, nvram, in that order) before combining.
        let present = [configPath, diskPath, nvramPath].map { $0.exists() }
        return present.allSatisfy { $0 }
    }

    /// Checks if the VM directory exists
    func exists() -> Bool {
        return dir.exists()
    }
}
77 |
78 | // MARK: - Disk Management
79 |
extension VMDirectory {
    /// Resizes the VM's disk image to the specified size, creating the file
    /// first when it does not exist yet.
    /// - Parameter size: The new size in bytes
    /// - Throws: `VMDirectoryError.fileCreationFailed` if the image cannot be
    ///   created, opened for writing, or truncated
    func setDisk(_ size: UInt64) throws {
        do {
            if !diskPath.exists() {
                guard FileManager.default.createFile(atPath: diskPath.path, contents: nil) else {
                    throw VMDirectoryError.fileCreationFailed(diskPath.path)
                }
            }

            let handle = try FileHandle(forWritingTo: diskPath.url)
            defer { try? handle.close() }

            try handle.truncate(atOffset: size)
        } catch let error as VMDirectoryError {
            // Already a domain error; pass it through untouched.
            throw error
        } catch {
            // Failures used to be silently dropped here, so callers believed the
            // resize had succeeded. Surface them as a domain error instead.
            throw VMDirectoryError.fileCreationFailed(diskPath.path)
        }
    }
}
100 |
101 | // MARK: - Configuration Management
102 |
extension VMDirectory {
    /// Saves the VM configuration to disk as pretty-printed JSON
    /// - Parameter config: The configuration to save
    /// - Throws: `VMDirectoryError.invalidConfigData` if the configuration cannot be
    ///   encoded, `VMDirectoryError.fileCreationFailed` if the file cannot be written
    func saveConfig(_ config: VMConfig) throws {
        let encoder = JSONEncoder()
        encoder.outputFormatting = .prettyPrinted

        // Only the encoding step is mapped to invalidConfigData. The write below
        // stays outside the do/catch so its error is not rewritten as a data error.
        let data: Data
        do {
            data = try encoder.encode(config)
        } catch {
            throw VMDirectoryError.invalidConfigData
        }

        guard FileManager.default.createFile(atPath: configPath.path, contents: data) else {
            throw VMDirectoryError.fileCreationFailed(configPath.path)
        }
    }

    /// Loads the VM configuration from disk
    /// - Returns: The loaded configuration
    /// - Throws: `VMDirectoryError.configNotFound` if the file cannot be read,
    ///   `VMDirectoryError.invalidConfigData` if it cannot be decoded
    func loadConfig() throws -> VMConfig {
        guard let data = FileManager.default.contents(atPath: configPath.path) else {
            throw VMDirectoryError.configNotFound
        }

        do {
            let decoder = JSONDecoder()
            return try decoder.decode(VMConfig.self, from: data)
        } catch {
            throw VMDirectoryError.invalidConfigData
        }
    }
}
137 |
138 | // MARK: - VNC Session Management
139 |
/// Codable record of a VNC session, stored in the VM directory's sessions file.
struct VNCSession: Codable {
    /// URL of the VNC session.
    let url: String
    /// Shared directories recorded with the session, if any.
    let sharedDirectories: [SharedDirectory]?

    /// Creates a session record.
    /// - Parameters:
    ///   - url: URL of the VNC session
    ///   - sharedDirectories: Shared directories to record (defaults to `nil`)
    init(url: String, sharedDirectories: [SharedDirectory]? = nil) {
        self.url = url
        self.sharedDirectories = sharedDirectories
    }
}
149 |
extension VMDirectory {
    /// Saves VNC session information to disk as pretty-printed JSON
    /// - Parameter session: The VNC session to save (its shared directories are
    ///   stored as part of the session)
    /// - Throws: `VMDirectoryError.invalidSessionData` if the session cannot be
    ///   encoded, `VMDirectoryError.fileCreationFailed` if the file cannot be written
    func saveSession(_ session: VNCSession) throws {
        let encoder = JSONEncoder()
        encoder.outputFormatting = .prettyPrinted

        // Only the encoding step is mapped to invalidSessionData. The write below
        // stays outside the do/catch so its error is not rewritten as a data error.
        let data: Data
        do {
            data = try encoder.encode(session)
        } catch {
            throw VMDirectoryError.invalidSessionData
        }

        guard FileManager.default.createFile(atPath: sessionsPath.path, contents: data) else {
            throw VMDirectoryError.fileCreationFailed(sessionsPath.path)
        }
    }

    /// Loads the VNC session information from disk
    /// - Returns: The loaded VNC session
    /// - Throws: `VMDirectoryError.sessionNotFound` if the file cannot be read,
    ///   `VMDirectoryError.invalidSessionData` if it cannot be decoded
    func loadSession() throws -> VNCSession {
        guard let data = FileManager.default.contents(atPath: sessionsPath.path) else {
            throw VMDirectoryError.sessionNotFound
        }

        do {
            let decoder = JSONDecoder()
            return try decoder.decode(VNCSession.self, from: data)
        } catch {
            throw VMDirectoryError.invalidSessionData
        }
    }

    /// Removes the VNC session information from disk.
    /// Best effort: a removal error (for example, no session file) is ignored.
    func clearSession() {
        try? FileManager.default.removeItem(atPath: sessionsPath.path)
    }
}
191 |
192 | // MARK: - CustomStringConvertible
extension VMDirectory: CustomStringConvertible {
    /// Textual form used when the value is printed or interpolated: the type
    /// name followed by the base directory path.
    var description: String {
        "VMDirectory(path: \(dir.path))"
    }
}
198 |
extension VMDirectory {
    /// Deletes the VM's base directory from disk.
    /// - Throws: The error raised by `FileManager.removeItem(atPath:)` when removal fails
    func delete() throws {
        try FileManager.default.removeItem(atPath: dir.path)
    }
}
204 |
```
--------------------------------------------------------------------------------
/.github/workflows/npm-publish-cli.yml:
--------------------------------------------------------------------------------
```yaml
1 | name: Publish @trycua/cli
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | version:
7 | description: "Version to publish (default: from package.json)"
8 | required: false
9 | default: ""
10 |
11 | jobs:
12 | build-and-publish:
13 | permissions:
14 | id-token: write
15 | contents: write
16 | packages: write
17 |
18 | strategy:
19 | matrix:
20 | include:
21 | - target: bun-linux-x64
22 | ext: ""
23 | binary_name: cua-linux-x64
24 | - target: bun-darwin-x64
25 | ext: ""
26 | binary_name: cua-darwin-x64
27 | - target: bun-darwin-arm64
28 | ext: ""
29 | binary_name: cua-darwin-arm64
30 | - target: bun-windows-x64
31 | ext: ".exe"
32 | binary_name: cua-windows-x64
33 |
34 | runs-on: ubuntu-latest
35 |
36 | steps:
37 | - name: Checkout code
38 | uses: actions/checkout@v4
39 | with:
40 | fetch-depth: 0
41 |
42 | - name: Setup Bun
43 | uses: oven-sh/setup-bun@v2
44 | with:
45 | bun-version: latest
46 |
47 | - name: Get version
48 | id: version
49 | run: |
50 | if [ -n "${{ github.event.inputs.version }}" ]; then
51 | echo "version=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT
52 | else
53 | VERSION=$(bun -p "require('./libs/typescript/cua-cli/package.json').version")
54 | echo "version=${VERSION}" >> $GITHUB_OUTPUT
55 | fi
56 |
57 | - name: Install dependencies
58 | working-directory: ./libs/typescript/cua-cli
59 | run: bun install --frozen-lockfile
60 |
61 | - name: Build binary
62 | working-directory: ./libs/typescript/cua-cli
63 | run: |
64 | bun build --compile --minify --sourcemap --target=${{ matrix.target }} index.ts --outfile ${{ matrix.binary_name }}${{ matrix.ext }}
65 | mkdir -p ../../../dist
66 | mv ${{ matrix.binary_name }}${{ matrix.ext }}* ../../../dist/
67 |
68 | - name: Upload artifacts
69 | uses: actions/upload-artifact@v4
70 | with:
71 | name: cua-binary-${{ matrix.target }}
72 | path: dist/
73 | if-no-files-found: error
74 | retention-days: 1
75 |
  # Publishes the package to npm once every binary build has finished.
  # Runs only for the main branch or refs tagged cua-v*.
  publish-npm:
    needs: build-and-publish
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/cua-v')
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      - name: Install dependencies
        working-directory: ./libs/typescript/cua-cli
        run: bun install --frozen-lockfile

      - name: Publish to npm
        working-directory: ./libs/typescript/cua-cli
        env:
          # Registry token taken from the NPM_TOKEN repository secret
          NPM_CONFIG_TOKEN: ${{ secrets.NPM_TOKEN }}
        run: bun publish --production --access public --tolerate-republish
98 |
  # Creates the GitHub release tagged cua-v<version> and attaches the four
  # binaries produced by the build job.
  # NOTE(review): the version and tag are always read from package.json here; the
  # workflow_dispatch `version` input is not consulted in this job — confirm intended.
  # NOTE(review): actions/create-release and actions/upload-release-asset appear to
  # be unmaintained upstream — verify and consider migrating.
  create-release:
    needs: [build-and-publish, publish-npm]
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      - name: Get version
        id: version
        run: |
          VERSION=$(bun -p "require('./libs/typescript/cua-cli/package.json').version")
          echo "version=${VERSION}" >> $GITHUB_OUTPUT
          echo "tag=cua-v${VERSION}" >> $GITHUB_OUTPUT

      # merge-multiple flattens the per-target artifacts into a single dist/ folder
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: dist
          merge-multiple: true

      - name: Create Release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ steps.version.outputs.tag }}
          release_name: cua-cli v${{ steps.version.outputs.version }}
          body: |
            # cua-cli v${{ steps.version.outputs.version }}

            ## Installation

            ### Using install script (recommended)
            ```bash
            # For Linux/macOS
            curl -fsSL https://cua.ai/cli/install.sh | sh

            # For Windows (PowerShell)
            irm https://cua.ai/cli/install.ps1 | iex
            ```

            ### Using npm/bun
            ```bash
            # Using bun
            bun add -g @trycua/cli

            # Or using npm
            npm install -g @trycua/cli
            ```

            ### From source
            ```bash
            git clone -b ${{ steps.version.outputs.tag }} https://github.com/trycua/cua.git
            cd cua/libs/typescript/cua-cli
            bun install
            bun link
            bun link cua-cli
            ```

            ## Release Assets
            - `cua-darwin-arm64`: macOS (Apple Silicon)
            - `cua-darwin-x64`: macOS (Intel)
            - `cua-linux-x64`: Linux (x86_64)
            - `cua-windows-x64.exe`: Windows (x86_64)
          draft: false
          prerelease: false

      # One upload step per binary; asset names match the build matrix binary_name values
      - name: Upload Linux Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-linux-x64
          asset_name: cua-linux-x64
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload macOS Intel Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-darwin-x64
          asset_name: cua-darwin-x64
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload macOS Apple Silicon Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-darwin-arm64
          asset_name: cua-darwin-arm64
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload Windows Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-windows-x64.exe
          asset_name: cua-windows-x64.exe
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
213 |
```
--------------------------------------------------------------------------------
/libs/xfce/Dockerfile:
--------------------------------------------------------------------------------
```dockerfile
# CUA Docker XFCE Container
# Vanilla XFCE desktop with noVNC and computer-server

FROM ubuntu:22.04

# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive

# Set environment variables (home directory, X display, service ports, VNC geometry)
ENV HOME=/home/cua
ENV DISPLAY=:1
ENV VNC_PORT=5901
ENV NOVNC_PORT=6901
ENV API_PORT=8000
ENV VNC_RESOLUTION=1024x768
ENV VNC_COL_DEPTH=24

# Install system dependencies first (including sudo)
# NOTE: comment lines inside a multi-line RUN are stripped by the Dockerfile parser.
RUN apt-get update && apt-get install -y \
    # System utilities
    sudo \
    unzip \
    zip \
    xdg-utils \
    gcc \
    # Qt/XCB runtime deps for PyQt5 (libqxcb.so)
    libxcb-icccm4 \
    libxcb-image0 \
    libxcb-keysyms1 \
    libxcb-render-util0 \
    libxcb-xinerama0 \
    libxcb-shape0 \
    libxcb-randr0 \
    libxcb-xfixes0 \
    libxcb-sync1 \
    libxcb-util1 \
    libxcb-cursor0 \
    libxkbcommon-x11-0 \
    # Desktop environment
    xfce4 \
    xfce4-terminal \
    dbus-x11 \
    # VNC server
    tigervnc-standalone-server \
    tigervnc-common \
    # noVNC dependencies
    # python will be installed via deadsnakes as 3.12
    git \
    net-tools \
    netcat \
    supervisor \
    # Computer-server dependencies
    # python-tk/dev for 3.12 will be installed later
    gnome-screenshot \
    wmctrl \
    ffmpeg \
    socat \
    xclip \
    # Browser
    wget \
    software-properties-common \
    # Build tools
    build-essential \
    libncursesw5-dev \
    libssl-dev \
    libsqlite3-dev \
    tk-dev \
    libgl1-mesa-dev \
    libgdbm-dev \
    libc6-dev \
    libbz2-dev \
    libffi-dev \
    zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python 3.12 from deadsnakes (keep system python3 for apt)
RUN add-apt-repository -y ppa:deadsnakes/ppa && \
    apt-get update && apt-get install -y \
    python3.12 python3.12-venv python3.12-dev python3.12-tk \
    && \
    python3.12 -m ensurepip --upgrade && \
    python3.12 -m pip install --upgrade pip setuptools wheel && \
    rm -rf /var/lib/apt/lists/*

# Ensure 'python' points to Python 3.12
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 2

# Remove screensavers and power manager to avoid popups and lock screens
# ('|| true' keeps the build going when some of these packages are not installed)
RUN apt-get remove -y \
    xfce4-power-manager \
    xfce4-power-manager-data \
    xfce4-power-manager-plugins \
    xfce4-screensaver \
    light-locker \
    xscreensaver \
    xscreensaver-data || true

# Create user after sudo is installed
RUN useradd -m -s /bin/bash -G sudo cua && \
    echo "cua:cua" | chpasswd && \
    echo "cua ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Install Firefox from Mozilla PPA (snap-free) - inline to avoid script issues
# NOTE: the '\n' sequences below rely on /bin/sh's builtin echo expanding backslash escapes.
RUN apt-get update && \
    add-apt-repository -y ppa:mozillateam/ppa && \
    echo 'Package: *\nPin: release o=LP-PPA-mozillateam\nPin-Priority: 1001' > /etc/apt/preferences.d/mozilla-firefox && \
    apt-get update && \
    apt-get install -y firefox && \
    echo 'pref("datareporting.policy.firstRunURL", "");\npref("datareporting.policy.dataSubmissionEnabled", false);\npref("datareporting.healthreport.service.enabled", false);\npref("datareporting.healthreport.uploadEnabled", false);\npref("trailhead.firstrun.branches", "nofirstrun-empty");\npref("browser.aboutwelcome.enabled", false);' > /usr/lib/firefox/browser/defaults/preferences/firefox.js && \
    update-alternatives --install /usr/bin/x-www-browser x-www-browser /usr/bin/firefox 100 && \
    update-alternatives --install /usr/bin/gnome-www-browser gnome-www-browser /usr/bin/firefox 100 && \
    rm -rf /var/lib/apt/lists/*

# Install noVNC (websockify is cloned into the location noVNC's launcher expects)
RUN git clone https://github.com/novnc/noVNC.git /opt/noVNC && \
    git clone https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html

# Pre-create cache directory with correct ownership before pip install
RUN mkdir -p /home/cua/.cache && \
    chown -R cua:cua /home/cua/.cache

# Install computer-server using Python 3.12 pip
RUN python3.12 -m pip install cua-computer-server

# Install GTK and WebKit dependencies for pywebview
RUN apt-get update && apt-get install -y \
    python3-gi \
    python3-gi-cairo \
    gir1.2-gtk-3.0 \
    gir1.2-webkit2-4.1 \
    libgirepository1.0-dev \
    libcairo2-dev \
    pkg-config \
    gobject-introspection \
    && rm -rf /var/lib/apt/lists/*

# Install pywebview with GTK backend, used by cua-bench for web UIs
RUN python3.12 -m pip install "pywebview[gtk]"
# The requirement specifier must be quoted: unquoted, the shell parses '>=0.7.0'
# as a redirect of stdout to a file named '=0.7.0' and pip installs an
# unconstrained 'cua-bench-ui'.
RUN python3.12 -m pip install "cua-bench-ui>=0.7.0" --no-cache-dir

# Install playwright and Firefox dependencies
RUN python3.12 -m pip install playwright && \
    python3.12 -m playwright install --with-deps firefox

# Fix any cache files created by pip
RUN chown -R cua:cua /home/cua/.cache

# Copy startup scripts
COPY src/supervisor/ /etc/supervisor/conf.d/
COPY src/scripts/ /usr/local/bin/

# Make scripts executable
RUN chmod +x /usr/local/bin/*.sh

# Setup VNC
RUN chown -R cua:cua /home/cua
USER cua
WORKDIR /home/cua

# Create VNC directory (no password needed with SecurityTypes None)
RUN mkdir -p $HOME/.vnc

# Configure XFCE for first start
RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart

# Copy XFCE config to disable browser launching and welcome screens
COPY --chown=cua:cua src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc
COPY --chown=cua:cua src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml
COPY --chown=cua:cua src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml

# Disable autostart for screensaver, lock screen, and power manager
# (a desktop entry with Hidden=true masks the system-wide autostart entry of the same name)
RUN echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \
    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-screensaver.desktop && \
    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/light-locker.desktop && \
    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-power-manager.desktop && \
    chown -R cua:cua $HOME/.config

# Create storage and shared directories, and Firefox cache directory
RUN mkdir -p $HOME/storage $HOME/shared $HOME/.cache/dconf $HOME/.mozilla/firefox && \
    chown -R cua:cua $HOME/storage $HOME/shared $HOME/.cache $HOME/.mozilla $HOME/.vnc

# Switch back to root so the container's default command (supervisord) runs as root
USER root

# Expose ports
EXPOSE $VNC_PORT $NOVNC_PORT $API_PORT

# Start services via supervisor
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
190 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/callbacks/operator_validator.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | OperatorValidatorCallback
3 |
4 | Ensures agent output actions conform to expected schemas by fixing common issues:
5 | - click: add default button='left' if missing
6 | - keypress: wrap keys string into a list
7 | - etc.
8 |
9 | This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
10 | The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
11 | """
12 |
13 | from __future__ import annotations
14 |
15 | from typing import Any, Dict, List
16 |
17 | from .base import AsyncCallbackHandler
18 |
19 |
class OperatorNormalizerCallback(AsyncCallbackHandler):
    """Normalizes common computer call hallucinations / errors in computer call syntax.

    ``on_llm_end`` walks the LLM output list and, for every ``computer_call`` item
    whose ``action`` is a dict, rewrites that action in place: type aliases are
    renamed, a missing ``type`` is inferred from the fields present, defaults are
    filled in, and fields that do not belong to the resolved action type are
    removed.
    """

    # "<button>_click" action types are collapsed into type "click" + button.
    _CLICK_BUTTONS = ("left", "right", "wheel", "back", "forward")
    # Action types treated as synonyms of "keypress".
    _KEYPRESS_TYPE_ALIASES = ("hotkey", "key", "press", "key_press")
    # Field names accepted as synonyms of "keys" on a keypress action.
    _KEYS_FIELD_ALIASES = ("keypress", "key", "press", "key_press", "text")
    # Fields retained per action type; everything else is deleted.
    # Action types missing from this table are left untouched.
    _ALLOWED_KEYS_BY_TYPE: Dict[str, List[str]] = {
        # OpenAI actions
        "click": ["type", "button", "x", "y"],
        "double_click": ["type", "x", "y"],
        "drag": ["type", "path"],
        "keypress": ["type", "keys"],
        "move": ["type", "x", "y"],
        "screenshot": ["type"],
        "scroll": ["type", "scroll_x", "scroll_y", "x", "y"],
        "type": ["type", "text"],
        "wait": ["type"],
        # Anthropic actions
        "left_mouse_down": ["type", "x", "y"],
        "left_mouse_up": ["type", "x", "y"],
        "triple_click": ["type", "button", "x", "y"],
    }

    @staticmethod
    def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]) -> None:
        """Delete every key of ``action`` that is not listed in ``keys_to_keep``.

        Only removes keys; it never adds a missing one.
        """
        for key in list(action.keys()):
            if key not in keys_to_keep:
                del action[key]

    @classmethod
    def _normalize_action(cls, action: Dict[str, Any]) -> None:
        """Normalize a single computer-call ``action`` dict in place."""
        # rename mouse click actions to "click"
        current_type = action.get("type", "")
        for mouse_btn in cls._CLICK_BUTTONS:
            if current_type == f"{mouse_btn}_click":
                action["type"] = "click"
                action["button"] = mouse_btn
                break
        # rename hotkey actions to "keypress"
        if action.get("type", "") in cls._KEYPRESS_TYPE_ALIASES:
            action["type"] = "keypress"
        # infer a missing type from the fields present (first match wins)
        if "type" not in action:
            if "button" in action or "click" in action:
                action["type"] = "click"
            elif "scroll_x" in action or "scroll_y" in action:
                action["type"] = "scroll"
            elif "text" in action:
                action["type"] = "type"

        action_type = action.get("type")

        # rename "coordinate" to "x", "y"; a malformed coordinate (not a sequence
        # of at least two items) is left in place instead of raising
        coordinate = action.get("coordinate")
        if isinstance(coordinate, (list, tuple)) and len(coordinate) >= 2:
            action["x"] = coordinate[0]
            action["y"] = coordinate[1]
            del action["coordinate"]

        if action_type == "click":
            # convert "click" to "button"
            if "button" not in action and "click" in action:
                action["button"] = action["click"]
                del action["click"]
            # default button to "left"
            action["button"] = action.get("button", "left")

        # add default scroll x, y if missing
        if action_type == "scroll":
            action["scroll_x"] = action.get("scroll_x", 0)
            action["scroll_y"] = action.get("scroll_y", 0)

        # ensure keys arg is a list (normalize aliases first)
        if action_type == "keypress":
            for keys_alias in cls._KEYS_FIELD_ALIASES:
                if keys_alias in action:
                    # a later alias overrides an earlier one and any existing "keys"
                    action["keys"] = action.pop(keys_alias)
            keys = action.get("keys")
            if isinstance(keys, str):
                # "ctrl-c" / "ctrl+c" -> ["ctrl", "c"]; a single character
                # (including "+" or "-") is kept as one key
                action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]

        # strip fields that do not belong to the resolved action type; the
        # isinstance guard avoids a TypeError on an unhashable hallucinated type
        keep = cls._ALLOWED_KEYS_BY_TYPE.get(action_type) if isinstance(action_type, str) else None
        if keep:
            cls._keep_keys(action, keep)

    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Normalize every ``computer_call`` action in ``output``.

        Args:
            output: LLM output items as dicts; ``None`` or an empty list is accepted.

        Returns:
            The same ``output`` object; actions are mutated in place and the list
            is returned for chaining.
        """
        for item in output or []:
            # skip anything that is not a well-formed computer call
            if not isinstance(item, dict) or item.get("type") != "computer_call":
                continue
            action = item.get("action")
            if not isinstance(action, dict):
                continue
            self._normalize_action(action)

        # # Second pass: if an assistant message is immediately followed by a computer_call,
        # # replace the assistant message itself with a reasoning message with summary text.
        # if isinstance(output, list):
        #     for i, item in enumerate(output):
        #         # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
        #         if item.get("type") == "message" and item.get("role") == "assistant":
        #             next_idx = i + 1
        #             if next_idx >= len(output):
        #                 continue
        #             next_item = output[next_idx]
        #             if not isinstance(next_item, dict):
        #                 continue
        #             if next_item.get("type") != "computer_call":
        #                 continue
        #             contents = item.get("content") or []
        #             # Extract text from OutputContent[]
        #             text_parts: List[str] = []
        #             if isinstance(contents, list):
        #                 for c in contents:
        #                     if isinstance(c, dict) and c.get("type") == "output_text" and isinstance(c.get("text"), str):
        #                         text_parts.append(c["text"])
        #             text_content = "\n".join(text_parts).strip()
        #             # Replace assistant message with reasoning message
        #             output[i] = {
        #                 "type": "reasoning",
        #                 "summary": [
        #                     {
        #                         "type": "summary_text",
        #                         "text": text_content,
        #                     }
        #                 ],
        #             }

        return output
141 |
```
--------------------------------------------------------------------------------
/scripts/install-cli.ps1:
--------------------------------------------------------------------------------
```
1 | # CUA CLI Installation Script for Windows
2 | $ErrorActionPreference = "Stop"
3 |
function Install-WithBun {
    <#
    .SYNOPSIS
    Installs the CUA CLI as a global package with Bun, falling back to npm.

    .DESCRIPTION
    Installs Bun first when it is not on PATH, then runs 'bun add -g @trycua/cli'.
    If that fails, 'npm install -g @trycua/cli' is tried. After a successful
    install the version reported by the npm registry (or "unknown") is written
    to "$env:USERPROFILE\.cua\bin\.version".

    Output of native commands is piped to Out-Host so that it does not leak into
    this function's output stream; otherwise callers using 'if (Install-WithBun)'
    would receive an array of log lines plus the boolean, which is truthy even
    when the boolean is $false.

    .OUTPUTS
    System.Boolean. $true when the CLI was installed, $false otherwise.
    #>
    Write-Host "Installing CUA CLI using Bun..." -ForegroundColor Yellow

    # Looks up the published version and records it in the .version file.
    function Write-CuaVersionFile {
        try {
            $cliVersion = (npm view @trycua/cli version) 2>$null
            if (-not $cliVersion) { $cliVersion = "unknown" }
        } catch { $cliVersion = "unknown" }
        # Ensure install dir and write version file
        $installDir = "$env:USERPROFILE\.cua\bin"
        if (-not (Test-Path $installDir)) { New-Item -ItemType Directory -Path $installDir -Force | Out-Null }
        Set-Content -Path (Join-Path $installDir ".version") -Value $cliVersion -NoNewline
    }

    # Check if bun is already installed
    if (-not (Get-Command bun -ErrorAction SilentlyContinue)) {
        Write-Host "Installing Bun..." -ForegroundColor Yellow
        try {
            powershell -c "irm bun.sh/install.ps1|iex" | Out-Host

            # Refresh environment variables
            $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")

            # Add bun to PATH for this session if not already there
            $bunPath = "$env:USERPROFILE\.bun\bin"
            if ($env:Path -notlike "*$bunPath*") {
                $env:Path = "$bunPath;$env:Path"
            }
        } catch {
            Write-Host "Error: Failed to install Bun. Please install manually from https://bun.sh" -ForegroundColor Red
            return $false
        }
    }

    # Verify bun installation
    if (-not (Get-Command bun -ErrorAction SilentlyContinue)) {
        Write-Host "Error: Bun installation failed. Please install manually from https://bun.sh" -ForegroundColor Red
        return $false
    }

    try {
        bun add -g @trycua/cli | Out-Host
        # A non-zero exit code from a native command does not raise an exception
        # by itself, so convert it into one to reach the npm fallback below.
        if ($LASTEXITCODE -ne 0) { throw "bun add exited with code $LASTEXITCODE" }
        Write-CuaVersionFile
        return $true
    } catch {
        Write-Host "Warning: Failed to install with Bun, trying npm..." -ForegroundColor Yellow
        try {
            npm install -g @trycua/cli | Out-Host
            if ($LASTEXITCODE -ne 0) { throw "npm install exited with code $LASTEXITCODE" }
            Write-CuaVersionFile
            return $true
        } catch {
            Write-Host "Error: Installation failed with npm as well." -ForegroundColor Red
            return $false
        }
    }
}
65 |
# Shared fallback path: optionally prints a warning, tries the Bun/npm package
# install and terminates the script (exit 0 on success, exit 1 on failure).
function Invoke-BunFallback {
    param([string]$WarningMessage)

    if ($WarningMessage) {
        Write-Host $WarningMessage -ForegroundColor Yellow
    }
    if (Install-WithBun) {
        exit 0
    }
    Write-Host "Error: Installation failed. Please try installing manually:" -ForegroundColor Red
    Write-Host " irm https://cua.ai/install.ps1 | iex"
    exit 1
}

Write-Host "Installing CUA CLI..." -ForegroundColor Green

# Determine if this is a 64-bit system; only an x64 binary is published
$is64Bit = [Environment]::Is64BitOperatingSystem
if (-not $is64Bit) {
    Invoke-BunFallback "Warning: 32-bit Windows is not supported. Falling back to Bun installation..."
}

# Get the latest release version
try {
    $release = Invoke-RestMethod -Uri "https://api.github.com/repos/trycua/cua/releases/latest" -ErrorAction Stop
    # Release tags look like "cua-v<version>"; strip the prefix for display
    $version = $release.tag_name -replace '^cua-v', ''
    # Look for the windows binary in the release assets
    $windowsAsset = $release.assets | Where-Object { $_.name -eq 'cua-windows-x64.exe' }

    if (-not $windowsAsset) {
        throw "Windows binary not found in release assets"
    }

    $binaryUrl = $windowsAsset.browser_download_url
} catch {
    Invoke-BunFallback "Warning: Could not fetch latest release, falling back to Bun installation"
}

# Create installation directory
$installDir = "$env:USERPROFILE\.cua\bin"
if (-not (Test-Path $installDir)) {
    New-Item -ItemType Directory -Path $installDir -Force | Out-Null
}

$binaryPath = Join-Path $installDir "cua.exe"

# Download the binary
Write-Host "Downloading CUA CLI $version for Windows x64..." -ForegroundColor Cyan
try {
    Invoke-WebRequest -Uri $binaryUrl -OutFile $binaryPath -ErrorAction Stop
} catch {
    Invoke-BunFallback "Warning: Failed to download pre-built binary, falling back to Bun installation"
}

# Write version file for binary install
try {
    Set-Content -Path (Join-Path $installDir ".version") -Value $version -NoNewline
} catch {
    # Non-fatal
}

# Add to PATH if not already there
$currentPath = [Environment]::GetEnvironmentVariable("Path", "User")
if ($currentPath -notlike "*$installDir*") {
    # Avoid a leading or doubled ';' when the user PATH is empty or already ends with ';'
    if ([string]::IsNullOrEmpty($currentPath)) {
        $newUserPath = $installDir
    } else {
        $newUserPath = $currentPath.TrimEnd(';') + ";$installDir"
    }
    [Environment]::SetEnvironmentVariable("Path", $newUserPath, "User")
    $env:Path = "$env:Path;$installDir"
    Write-Host "Success: Added $installDir to your PATH" -ForegroundColor Green
}

# Verify installation
if (Test-Path $binaryPath) {
    Write-Host "Success: CUA CLI $version installed successfully to $binaryPath" -ForegroundColor Green
    Write-Host ""
    Write-Host "Get started with:" -ForegroundColor Cyan
    Write-Host " cua login"
    Write-Host " cua create --os linux --configuration small --region north-america"
    Write-Host ""
    Write-Host "For more help, visit: https://docs.cua.ai/libraries/cua-cli" -ForegroundColor Cyan

    # Offer to add to PATH if not already there
    if (-not ($env:Path -like "*$installDir*")) {
        Write-Host ""
        Write-Host "Note: Please restart your terminal or run the following command to use CUA CLI:" -ForegroundColor Yellow
        Write-Host " `$env:Path += ';$installDir'"
    }
} else {
    Write-Host "Error: Installation failed. Please try installing manually:" -ForegroundColor Red
    Write-Host " irm https://cua.ai/install.ps1 | iex"
    exit 1
}
```
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "configurations": [
3 | {
4 | "name": "Agent UI",
5 | "type": "debugpy",
6 | "request": "launch",
7 | "program": "examples/agent_ui_examples.py",
8 | "console": "integratedTerminal",
9 | "justMyCode": false,
10 | "python": "${workspaceFolder:cua-root}/.venv/bin/python",
11 | "cwd": "${workspaceFolder:cua-root}",
12 | "env": {
13 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
14 | }
15 | },
16 | {
17 | "name": "Computer UI",
18 | "type": "debugpy",
19 | "request": "launch",
20 | "program": "examples/computer_ui_examples.py",
21 | "console": "integratedTerminal",
22 | "justMyCode": false,
23 | "python": "${workspaceFolder:cua-root}/.venv/bin/python",
24 | "cwd": "${workspaceFolder:cua-root}",
25 | "env": {
26 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
27 | }
28 | },
29 | {
30 | "name": "Run Computer Examples",
31 | "type": "debugpy",
32 | "request": "launch",
33 | "program": "examples/computer_examples.py",
34 | "console": "integratedTerminal",
35 | "justMyCode": true,
36 | "python": "${workspaceFolder:cua-root}/.venv/bin/python",
37 | "cwd": "${workspaceFolder:cua-root}",
38 | "env": {
39 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
40 | }
41 | },
42 | {
43 | "name": "Run Agent Examples",
44 | "type": "debugpy",
45 | "request": "launch",
46 | "program": "examples/agent_examples.py",
47 | "console": "integratedTerminal",
48 | "justMyCode": false,
49 | "python": "${workspaceFolder:cua-root}/.venv/bin/python",
50 | "cwd": "${workspaceFolder:cua-root}",
51 | "env": {
52 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
53 | }
54 | },
55 | {
56 | "name": "SOM: Run Experiments (No OCR)",
57 | "type": "debugpy",
58 | "request": "launch",
59 | "program": "examples/som_examples.py",
60 | "args": [
61 | "examples/test_data",
62 | "--output-dir",
63 | "examples/output",
64 | "--ocr",
65 | "none",
66 | "--mode",
67 | "experiment"
68 | ],
69 | "console": "integratedTerminal",
70 | "justMyCode": false,
71 | "python": "${workspaceFolder:cua-root}/.venv/bin/python",
72 | "cwd": "${workspaceFolder:cua-root}",
73 | "env": {
74 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
75 | }
76 | },
77 | {
78 | "name": "SOM: Run Experiments (EasyOCR)",
79 | "type": "debugpy",
80 | "request": "launch",
81 | "program": "examples/som_examples.py",
82 | "args": [
83 | "examples/test_data",
84 | "--output-dir",
85 | "examples/output",
86 | "--ocr",
87 | "easyocr",
88 | "--mode",
89 | "experiment"
90 | ],
91 | "console": "integratedTerminal",
92 | "justMyCode": false,
93 | "python": "${workspaceFolder:cua-root}/.venv/bin/python",
94 | "cwd": "${workspaceFolder:cua-root}",
95 | "env": {
96 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
97 | }
98 | },
99 | {
100 | "name": "Run Computer Server",
101 | "type": "debugpy",
102 | "request": "launch",
103 | "program": "${workspaceFolder}/libs/python/computer-server/run_server.py",
104 | "console": "integratedTerminal",
105 | "justMyCode": true,
106 | "python": "${workspaceFolder:cua-root}/.venv/bin/python",
107 | "cwd": "${workspaceFolder:cua-root}",
108 | "env": {
109 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
110 | }
111 | },
112 | {
113 | "name": "Run Computer Server with Args",
114 | "type": "debugpy",
115 | "request": "launch",
116 | "program": "${workspaceFolder}/libs/python/computer-server/run_server.py",
117 | "args": [
118 | "--host",
119 | "0.0.0.0",
120 | "--port",
121 | "8000",
122 | "--log-level",
123 | "debug"
124 | ],
125 | "console": "integratedTerminal",
126 | "justMyCode": false,
127 | "python": "${workspaceFolder:cua-root}/.venv/bin/python",
128 | "cwd": "${workspaceFolder:cua-root}",
129 | "env": {
130 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer-server"
131 | }
132 | },
133 | {
134 | "type": "lldb",
135 | "request": "launch",
136 | "args": [],
137 | "cwd": "${workspaceFolder:cua-root}/libs/lume",
138 | "name": "Debug lume (libs/lume)",
139 | "program": "${workspaceFolder:cua-root}/libs/lume/.build/debug/lume",
140 | "preLaunchTask": "swift: Build Debug lume (libs/lume)"
141 | },
142 | {
143 | "type": "lldb",
144 | "request": "launch",
145 | "args": [],
146 | "cwd": "${workspaceFolder:cua-root}/libs/lume",
147 | "name": "Release lume (libs/lume)",
148 | "program": "${workspaceFolder:cua-root}/libs/lume/.build/release/lume",
149 | "preLaunchTask": "swift: Build Release lume (libs/lume)"
150 | }
151 | ]
152 | }
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/internvl.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | InternVL agent loop implementation for click prediction using litellm.acompletion.
3 |
4 | Implements the ScreenSpot InternVL grounding baseline behavior:
5 | - Uses the exact grounding prompt format with <image> and <ref> tags
6 | - Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]]
7 | - Converts to pixel coordinates relative to the original screenshot size
8 |
9 | Note: We do NOT manually load the InternVL model; acompletions (via HuggingFaceLocalAdapter)
10 | will handle loading based on the provided model name.
11 | """
12 |
13 | from __future__ import annotations
14 |
15 | import base64
16 | import math
17 | import re
18 | from io import BytesIO
19 | from typing import Any, Dict, List, Optional, Tuple
20 |
21 | import litellm
22 | from PIL import Image
23 |
24 | from ..decorators import register_agent
25 | from ..types import AgentCapability
26 | from .composed_grounded import ComposedGroundedConfig
27 |
28 | # Regex patterns for extracting coordinates
29 | # Accept optional whitespace and optional decimal fractions
30 | _NUM = r"(\d+(?:\.\d+)?)"
31 | _POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
32 | _BBOX_PATTERN = re.compile(
33 | r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
34 | )
35 |
36 |
37 | def _extract_first_point(text: str) -> Optional[Tuple[float, float]]:
38 | """Extract the first [[x,y]] as normalized (0-1000) floats."""
39 | m = _POINT_PATTERN.search(text)
40 | if not m:
41 | return None
42 | try:
43 | x = float(m.group(1))
44 | y = float(m.group(2))
45 | return x, y
46 | except Exception:
47 | return None
48 |
49 |
50 | def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]:
51 | """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats."""
52 | matches = list(_BBOX_PATTERN.finditer(text))
53 | if not matches:
54 | return None
55 | m = matches[-1]
56 | try:
57 | x1 = float(m.group(1))
58 | y1 = float(m.group(2))
59 | x2 = float(m.group(3))
60 | y2 = float(m.group(4))
61 | return x1, y1, x2, y2
62 | except Exception:
63 | return None
64 |
65 |
66 | def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]:
67 | """Scale 0-1000 normalized coordinates to pixel coordinates for given image size."""
68 | x_px = int(math.floor((x_norm / 1000.0) * width))
69 | y_px = int(math.floor((y_norm / 1000.0) * height))
70 | # Clamp to image bounds just in case
71 | x_px = max(0, min(width - 1, x_px))
72 | y_px = max(0, min(height - 1, y_px))
73 | return x_px, y_px
74 |
75 |
76 | @register_agent(models=r"(?i).*InternVL.*")
77 | class InternVLConfig(ComposedGroundedConfig):
78 | """InternVL agent configuration reusing ComposedGroundedConfig for steps and
79 | overriding predict_click to implement ScreenSpot InternVL grounding baseline."""
80 |
81 | async def predict_step(
82 | self,
83 | messages: List[Dict[str, Any]],
84 | model: str,
85 | tools: Optional[List[Dict[str, Any]]] = None,
86 | max_retries: Optional[int] = None,
87 | stream: bool = False,
88 | computer_handler=None,
89 | _on_api_start=None,
90 | _on_api_end=None,
91 | _on_usage=None,
92 | _on_screenshot=None,
93 | **kwargs,
94 | ) -> Dict[str, Any]:
95 | """Fallback to a self-composed model"""
96 | return await super().predict_step(
97 | messages=messages,
98 | model=f"{model}+{model}",
99 | tools=tools,
100 | max_retries=max_retries,
101 | stream=stream,
102 | computer_handler=computer_handler,
103 | _on_api_start=_on_api_start,
104 | _on_api_end=_on_api_end,
105 | _on_usage=_on_usage,
106 | _on_screenshot=_on_screenshot,
107 | **kwargs,
108 | )
109 |
async def predict_click(
    self, model: str, image_b64: str, instruction: str, **kwargs
) -> Optional[Tuple[int, int]]:
    """
    Ask an InternVL model (through litellm.acompletion) where to click.

    The model receives the screenshot plus a grounding prompt requesting a
    bounding box in the form [[x1, y1, x2, y2]]. The reply is parsed for a
    point first; when no point is found, the last bounding box is used and its
    center becomes the target. The parsed coordinates are then converted to
    pixel coordinates of the screenshot by `_scale_norm_to_pixels`.

    Args:
        model: Model identifier forwarded to litellm
        image_b64: Base64-encoded screenshot
        instruction: Description of the UI element to locate
        **kwargs: Optional `max_tokens` (default 256) and `temperature`
            (default 0.0) overrides for the completion call

    Returns:
        The (x, y) pixel coordinates, or None when the reply contains neither
        a point nor a bounding box
    """
    # The screenshot size is required to scale the model's output; fall back
    # to a 1920x1080 default when the image cannot be decoded.
    try:
        decoded = Image.open(BytesIO(base64.b64decode(image_b64)))
        screen_w, screen_h = decoded.size
    except Exception:
        screen_w, screen_h = 1920, 1080

    # Grounding prompt: instruction wrapped in <ref> tags, followed by the answer format
    prompt_text = (
        f"Please provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. "
        + "Answer in the format of [[x1, y1, x2, y2]]"
    )

    # Single user turn: the screenshot first, then the prompt
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
    }
    text_part = {"type": "text", "text": prompt_text}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    # Deterministic, short generation unless the caller overrides it
    response = await litellm.acompletion(
        model=model,
        messages=messages,
        max_tokens=kwargs.get("max_tokens", 256),
        temperature=kwargs.get("temperature", 0.0),
    )
    reply = (response.choices[0].message.content or "").strip()  # type: ignore

    # Prefer an explicit point; otherwise reduce the last bounding box to its center
    target = _extract_first_point(reply)
    if target is None:
        box = _extract_last_bbox(reply)
        if box is None:
            return None
        left, top, right, bottom = box
        target = ((left + right) / 2.0, (top + bottom) / 2.0)

    norm_x, norm_y = target
    pixel_x, pixel_y = _scale_norm_to_pixels(norm_x, norm_y, screen_w, screen_h)
    return (pixel_x, pixel_y)
178 |
def get_capabilities(self) -> List[AgentCapability]:
    """Return the capabilities this config advertises: "click" and "step"."""
    supported = ["click", "step"]
    return supported
181 |
```
--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/interactive.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Interactive Click Prediction Tool
4 |
5 | Takes screenshots and allows testing multiple models interactively.
6 | Models are loaded/unloaded one at a time to avoid memory issues.
7 | """
8 |
9 | import asyncio
10 | import os
11 | from datetime import datetime
12 | from typing import Any, Dict, List
13 |
14 | from utils import (
15 | ModelWrapper,
16 | get_available_models,
17 | save_prediction_visualization,
18 | take_screenshot,
19 | )
20 |
21 |
async def predict_with_all_models(image, instruction: str, models) -> List[Dict[str, Any]]:
    """
    Query every model, one after another, for a click prediction.

    Each entry of ``models`` is wrapped in a ``ModelWrapper``, loaded, asked for
    a prediction, and unloaded again before the next entry is processed. A
    failure in one model is recorded in its result and does not stop the run.

    Args:
        image: Image passed to each wrapper's ``predict_click``
        instruction: Instruction text describing the click target
        models: Iterable of models to wrap and evaluate

    Returns:
        One dict per model with the keys ``model_name``, ``coords`` and ``error``
    """
    results: List[Dict[str, Any]] = []

    for candidate in models:
        wrapper = ModelWrapper(candidate)
        print(f"\n🔄 Loading {wrapper.model_name}...")

        try:
            await wrapper.load_model()
            coords = await wrapper.predict_click(image, instruction)

            record = {"model_name": wrapper.model_name, "coords": coords, "error": None}
            results.append(record)

            if not coords:
                print(f"❌ {wrapper.model_name}: No prediction")
            else:
                print(f"✅ {wrapper.model_name}: ({coords[0]}, {coords[1]})")

        except Exception as exc:
            reason = str(exc)
            print(f"❌ {wrapper.model_name}: ERROR - {reason}")
            record = {"model_name": wrapper.model_name, "coords": None, "error": reason}
            results.append(record)

        finally:
            # Release the model whether or not prediction succeeded, so the
            # next one starts with the memory freed.
            try:
                await wrapper.unload_model()
                print(f"🗑️ Unloaded {wrapper.model_name}")
            except Exception as unload_exc:
                print(f"⚠️ Error unloading {wrapper.model_name}: {unload_exc}")

    return results
71 |
72 |
def print_header():
    """Show the tool banner followed by the list of supported commands."""
    rule = "=" * 60
    banner_lines = (
        rule,
        "🖱️ Interactive Click Prediction Tool",
        rule,
        "Commands:",
        " • Type an instruction to test models on last screenshot",
        " • 'screenshot' - Take a new screenshot",
        " • 'models' - List available models",
        " • 'quit' or 'exit' - Exit the tool",
        rule,
        "💡 Tip: Take a screenshot first, then send instructions to test models!",
    )
    # One print call per line, in display order
    for text in banner_lines:
        print(text)
85 |
86 |
def print_models(models):
    """
    Print a numbered list of the available models.

    String entries are shown as-is; any other entry is shown as
    ``models.<ClassName>`` using the class of the object.
    """
    print("\n📋 Available Models:")
    position = 0
    for entry in models:
        position += 1
        if not isinstance(entry, str):
            label = f"models.{entry.__class__.__name__}"
        else:
            label = entry
        print(f" {position}. {label}")
95 |
96 |
async def main():
    """
    Run the interactive prompt loop.

    Prints the header and the available models, creates the
    ``interactive_output`` directory, then repeatedly reads a line from stdin:

    - ``quit`` / ``exit`` / ``q``: leave the loop
    - ``models``: print the model list again
    - ``screenshot``: capture a screenshot, keep it as the current one and
      save it to the output directory
    - anything else: treat the text as an instruction, run all models on the
      current screenshot, print a summary and save a visualization

    The loop ends on Ctrl+C and when stdin reaches end-of-file. Any other
    exception is reported and the loop continues with the next prompt.
    """
    print_header()

    # Get available models
    models = get_available_models()
    print_models(models)

    # Create output directory for screenshots and visualizations
    output_dir = "interactive_output"
    os.makedirs(output_dir, exist_ok=True)

    session_count = 0
    last_screenshot = None
    screenshot_timestamp = None

    while True:
        try:
            # Get user input
            print(f"\n{'='*40}")
            user_input = input("🎯 Enter instruction (or command): ").strip()

            if not user_input:
                continue

            # Handle commands
            if user_input.lower() in ["quit", "exit", "q"]:
                print("👋 Goodbye!")
                break

            elif user_input.lower() == "models":
                print_models(models)
                continue

            elif user_input.lower() == "screenshot":
                print("📸 Taking screenshot...")
                try:
                    last_screenshot = take_screenshot()
                    screenshot_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    screenshot_path = os.path.join(
                        output_dir, f"screenshot_{screenshot_timestamp}.png"
                    )
                    last_screenshot.save(screenshot_path)
                    print(f"✅ Screenshot captured and saved to: {screenshot_path}")
                    print(f"📝 Ready for instructions! Screenshot size: {last_screenshot.size}")
                except Exception as e:
                    print(f"❌ Error taking screenshot: {e}")
                continue

            # Anything that is not a command is an instruction, which needs a screenshot
            if last_screenshot is None:
                print(
                    "⚠️ No screenshot available! Please take a screenshot first using 'screenshot' command."
                )
                continue

            session_count += 1
            print(f"\n🎯 Session {session_count}: '{user_input}'")
            print(f"📷 Using screenshot from: {screenshot_timestamp}")

            # Predict with all models using last screenshot
            print(f"\n🤖 Testing {len(models)} models on screenshot...")
            predictions = await predict_with_all_models(last_screenshot, user_input, models)

            # Display results summary
            print("\n📊 Results Summary:")
            print("-" * 50)
            for pred in predictions:
                if pred["coords"]:
                    print(f"✅ {pred['model_name']}: ({pred['coords'][0]}, {pred['coords'][1]})")
                elif pred["error"]:
                    print(f"❌ {pred['model_name']}: ERROR - {pred['error']}")
                else:
                    print(f"❌ {pred['model_name']}: No prediction")

            # Save visualization
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            vis_filename = f"session_{session_count:03d}_{timestamp}.png"
            vis_path = os.path.join(output_dir, vis_filename)

            try:
                save_prediction_visualization(last_screenshot, user_input, predictions, vis_path)
                print(f"\n💾 Visualization saved to: {vis_path}")
            except Exception as e:
                print(f"⚠️ Error saving visualization: {e}")

            print(f"\n✨ Session {session_count} completed!")

        except KeyboardInterrupt:
            print("\n\n👋 Interrupted by user. Goodbye!")
            break
        except EOFError:
            # stdin is closed (Ctrl+D or exhausted piped input). input() would
            # raise again on every iteration, so the generic handler below
            # would spin forever; exit the loop instead.
            print("\n\n👋 End of input. Goodbye!")
            break
        except Exception as e:
            print(f"\n❌ Unexpected error: {e}")
            print("Continuing...")
193 |
194 |
if __name__ == "__main__":
    # Script entry point: run the async loop until it returns.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C that arrives outside the loop's own handler
        print("\n👋 Goodbye!")
    except Exception as fatal:
        print(f"❌ Fatal error: {fatal}")
202 |
```
--------------------------------------------------------------------------------
/.github/workflows/pypi-publish-agent.yml:
--------------------------------------------------------------------------------
```yaml
# Workflow for publishing the agent package.
# Triggers: a pushed tag matching "agent-v*", a manual dispatch with a version
# input, or a call from another workflow that passes a version string.
name: Publish Agent Package

on:
  push:
    tags:
      - "agent-v*"
  workflow_dispatch:
    inputs:
      version:
        description: "Version to publish (without v prefix)"
        required: true
        default: "0.1.0"
  workflow_call:
    inputs:
      version:
        description: "Version to publish"
        required: true
        type: string

# Workflow-level permissions: write access to repository contents
permissions:
  contents: write
23 |
24 | jobs:
prepare:
  # Resolves the version to publish and rewrites the cua-computer / cua-som /
  # cua-core constraints in libs/python/agent/pyproject.toml to
  # ">=<latest on PyPI>,<<next major>.0.0". The resolved versions are exposed
  # as job outputs.
  runs-on: macos-latest
  outputs:
    version: ${{ steps.get-version.outputs.version }}
    computer_version: ${{ steps.update-deps.outputs.computer_version }}
    som_version: ${{ steps.update-deps.outputs.som_version }}
    core_version: ${{ steps.update-deps.outputs.core_version }}
  steps:
    - uses: actions/checkout@v4
      with:
        ref: main
        fetch-depth: 0

    - name: Ensure latest main branch
      run: |
        git fetch origin main
        git reset --hard origin/main
        echo "Current HEAD commit:"
        git log -1 --oneline

    - name: Determine version
      id: get-version
      # Context values are handed to the script through environment variables
      # instead of being expanded into the script text, so a crafted version
      # input or tag name cannot inject shell commands.
      env:
        REQUESTED_VERSION: ${{ inputs.version }}
        DISPATCH_VERSION: ${{ github.event.inputs.version }}
        EVENT_NAME: ${{ github.event_name }}
        GIT_REF: ${{ github.ref }}
      run: |
        # Check inputs.version first (works for workflow_call regardless of event_name)
        if [ -n "$REQUESTED_VERSION" ]; then
          VERSION="$REQUESTED_VERSION"
        elif [ "$EVENT_NAME" == "push" ]; then
          # Extract version from tag (for package-specific tags)
          if [[ "$GIT_REF" =~ ^refs/tags/agent-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
            VERSION="${BASH_REMATCH[1]}"
          else
            echo "ERROR: Invalid tag format for agent"
            exit 1
          fi
        elif [ -n "$DISPATCH_VERSION" ]; then
          VERSION="$DISPATCH_VERSION"
        else
          echo "ERROR: No version found (inputs.version, event.inputs.version, and tag all empty)"
          exit 1
        fi

        echo "Agent version: $VERSION"
        echo "version=$VERSION" >> "$GITHUB_OUTPUT"

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.11"

    - name: Update dependencies to latest versions
      id: update-deps
      run: |
        cd libs/python/agent

        # Install required package for PyPI API access
        pip install requests

        # Helper script: prints the latest PyPI version of each dependency,
        # one per line, falling back to 0.1.0 when the lookup fails
        cat > get_latest_versions.py << 'EOF'
        import requests
        import json
        import sys

        def get_package_version(package_name, fallback="0.1.0"):
            try:
                # Bounded wait so a stalled PyPI request cannot hang the job;
                # a timeout is handled like any other lookup failure below
                response = requests.get(f'https://pypi.org/pypi/{package_name}/json', timeout=30)
                print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr)

                if response.status_code != 200:
                    print(f"API request failed for {package_name}, using fallback version", file=sys.stderr)
                    return fallback

                data = json.loads(response.text)

                if 'info' not in data:
                    print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr)
                    return fallback

                return data['info']['version']
            except Exception as e:
                print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr)
                return fallback

        # Get latest versions
        print(get_package_version('cua-computer'))
        print(get_package_version('cua-som'))
        print(get_package_version('cua-core'))
        EOF

        # Execute the script to get the versions (stdout only; diagnostics go to stderr)
        VERSIONS=($(python get_latest_versions.py))
        LATEST_COMPUTER=${VERSIONS[0]}
        LATEST_SOM=${VERSIONS[1]}
        LATEST_CORE=${VERSIONS[2]}

        echo "Latest cua-computer version: $LATEST_COMPUTER"
        echo "Latest cua-som version: $LATEST_SOM"
        echo "Latest cua-core version: $LATEST_CORE"

        # Output the versions for the next job
        echo "computer_version=$LATEST_COMPUTER" >> "$GITHUB_OUTPUT"
        echo "som_version=$LATEST_SOM" >> "$GITHUB_OUTPUT"
        echo "core_version=$LATEST_CORE" >> "$GITHUB_OUTPUT"

        # Determine major version for version constraint
        COMPUTER_MAJOR=$(echo $LATEST_COMPUTER | cut -d. -f1)
        SOM_MAJOR=$(echo $LATEST_SOM | cut -d. -f1)
        CORE_MAJOR=$(echo $LATEST_CORE | cut -d. -f1)

        NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1))
        NEXT_SOM_MAJOR=$((SOM_MAJOR + 1))
        NEXT_CORE_MAJOR=$((CORE_MAJOR + 1))

        # Update dependencies in pyproject.toml
        if [[ "$OSTYPE" == "darwin"* ]]; then
          # macOS version of sed needs an empty string for -i
          sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
          sed -i '' "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml
          sed -i '' "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml
        else
          # Linux version
          sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml
          sed -i "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml
          sed -i "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml
        fi

        # Display the updated dependencies
        echo "Updated dependencies in pyproject.toml:"
        grep -E "cua-computer|cua-som|cua-core" pyproject.toml
154 |
publish:
  # Runs after prepare and calls the reusable publish workflow for the agent
  # package, passing the version that prepare resolved.
  needs: prepare
  uses: ./.github/workflows/pypi-reusable-publish.yml
  with:
    package_name: "agent"
    package_dir: "libs/python/agent"
    version: ${{ needs.prepare.outputs.version }}
    is_lume_package: false
    base_package_name: "cua-agent"
  secrets:
    PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
166 |
set-env-variables:
  # Runs after prepare and publish; appends the dependency versions reported
  # by prepare to this job's GITHUB_ENV file.
  # NOTE(review): GITHUB_ENV normally only affects later steps of the same job,
  # and this job has no later steps. Confirm whether any other job actually
  # receives these values; job outputs are the usual cross-job mechanism.
  needs: [prepare, publish]
  runs-on: macos-latest
  steps:
    - name: Set environment variables for use in other jobs
      run: |
        echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> $GITHUB_ENV
        echo "SOM_VERSION=${{ needs.prepare.outputs.som_version }}" >> $GITHUB_ENV
        echo "CORE_VERSION=${{ needs.prepare.outputs.core_version }}" >> $GITHUB_ENV
176 |
```