This is page 10 of 28. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── scripts
│ │ ├── get_pyproject_version.py
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── README.md
│ │ └── test_get_pyproject_version.py
│ └── workflows
│ ├── bump-version.yml
│ ├── ci-lume.yml
│ ├── docker-publish-cua-linux.yml
│ ├── docker-publish-cua-windows.yml
│ ├── docker-publish-kasm.yml
│ ├── docker-publish-xfce.yml
│ ├── docker-reusable-publish.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── npm-publish-cli.yml
│ ├── npm-publish-computer.yml
│ ├── npm-publish-core.yml
│ ├── publish-lume.yml
│ ├── pypi-publish-agent.yml
│ ├── pypi-publish-computer-server.yml
│ ├── pypi-publish-computer.yml
│ ├── pypi-publish-core.yml
│ ├── pypi-publish-mcp-server.yml
│ ├── pypi-publish-som.yml
│ ├── pypi-reusable-publish.yml
│ ├── python-tests.yml
│ ├── test-cua-models.yml
│ └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│ ├── docs.code-workspace
│ ├── extensions.json
│ ├── launch.json
│ ├── libs-ts.code-workspace
│ ├── lume.code-workspace
│ ├── lumier.code-workspace
│ ├── py.code-workspace
│ └── settings.json
├── blog
│ ├── app-use.md
│ ├── assets
│ │ ├── composite-agents.png
│ │ ├── docker-ubuntu-support.png
│ │ ├── hack-booth.png
│ │ ├── hack-closing-ceremony.jpg
│ │ ├── hack-cua-ollama-hud.jpeg
│ │ ├── hack-leaderboard.png
│ │ ├── hack-the-north.png
│ │ ├── hack-winners.jpeg
│ │ ├── hack-workshop.jpeg
│ │ ├── hud-agent-evals.png
│ │ └── trajectory-viewer.jpeg
│ ├── bringing-computer-use-to-the-web.md
│ ├── build-your-own-operator-on-macos-1.md
│ ├── build-your-own-operator-on-macos-2.md
│ ├── cloud-windows-ga-macos-preview.md
│ ├── composite-agents.md
│ ├── computer-use-agents-for-growth-hacking.md
│ ├── cua-hackathon.md
│ ├── cua-playground-preview.md
│ ├── cua-vlm-router.md
│ ├── hack-the-north.md
│ ├── hud-agent-evals.md
│ ├── human-in-the-loop.md
│ ├── introducing-cua-cli.md
│ ├── introducing-cua-cloud-containers.md
│ ├── lume-to-containerization.md
│ ├── neurips-2025-cua-papers.md
│ ├── sandboxed-python-execution.md
│ ├── training-computer-use-models-trajectories-1.md
│ ├── trajectory-viewer.md
│ ├── ubuntu-docker-support.md
│ └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│ ├── .env.example
│ ├── .gitignore
│ ├── content
│ │ └── docs
│ │ ├── agent-sdk
│ │ │ ├── agent-loops.mdx
│ │ │ ├── benchmarks
│ │ │ │ ├── index.mdx
│ │ │ │ ├── interactive.mdx
│ │ │ │ ├── introduction.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── osworld-verified.mdx
│ │ │ │ ├── screenspot-pro.mdx
│ │ │ │ └── screenspot-v2.mdx
│ │ │ ├── callbacks
│ │ │ │ ├── agent-lifecycle.mdx
│ │ │ │ ├── cost-saving.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── pii-anonymization.mdx
│ │ │ │ └── trajectories.mdx
│ │ │ ├── chat-history.mdx
│ │ │ ├── custom-tools.mdx
│ │ │ ├── customizing-computeragent.mdx
│ │ │ ├── integrations
│ │ │ │ ├── hud.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── observability.mdx
│ │ │ ├── mcp-server
│ │ │ │ ├── client-integrations.mdx
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── llm-integrations.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── tools.mdx
│ │ │ │ └── usage.mdx
│ │ │ ├── message-format.mdx
│ │ │ ├── meta.json
│ │ │ ├── migration-guide.mdx
│ │ │ ├── prompt-caching.mdx
│ │ │ ├── supported-agents
│ │ │ │ ├── composed-agents.mdx
│ │ │ │ ├── computer-use-agents.mdx
│ │ │ │ ├── grounding-models.mdx
│ │ │ │ ├── human-in-the-loop.mdx
│ │ │ │ └── meta.json
│ │ │ ├── supported-model-providers
│ │ │ │ ├── cua-vlm-router.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ └── local-models.mdx
│ │ │ ├── telemetry.mdx
│ │ │ └── usage-tracking.mdx
│ │ ├── cli-playbook
│ │ │ ├── commands.mdx
│ │ │ ├── index.mdx
│ │ │ └── meta.json
│ │ ├── computer-sdk
│ │ │ ├── cloud-vm-management.mdx
│ │ │ ├── commands.mdx
│ │ │ ├── computer-server
│ │ │ │ ├── Commands.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── REST-API.mdx
│ │ │ │ └── WebSocket-API.mdx
│ │ │ ├── computer-ui.mdx
│ │ │ ├── computers.mdx
│ │ │ ├── custom-computer-handlers.mdx
│ │ │ ├── meta.json
│ │ │ ├── sandboxed-python.mdx
│ │ │ └── tracing-api.mdx
│ │ ├── example-usecases
│ │ │ ├── form-filling.mdx
│ │ │ ├── gemini-complex-ui-navigation.mdx
│ │ │ ├── meta.json
│ │ │ ├── post-event-contact-export.mdx
│ │ │ └── windows-app-behind-vpn.mdx
│ │ ├── get-started
│ │ │ ├── meta.json
│ │ │ └── quickstart.mdx
│ │ ├── index.mdx
│ │ ├── macos-vm-cli-playbook
│ │ │ ├── lume
│ │ │ │ ├── cli-reference.mdx
│ │ │ │ ├── faq.md
│ │ │ │ ├── http-api.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── prebuilt-images.mdx
│ │ │ ├── lumier
│ │ │ │ ├── building-lumier.mdx
│ │ │ │ ├── docker-compose.mdx
│ │ │ │ ├── docker.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ └── meta.json
│ │ │ └── meta.json
│ │ └── meta.json
│ ├── next.config.mjs
│ ├── package-lock.json
│ ├── package.json
│ ├── pnpm-lock.yaml
│ ├── postcss.config.mjs
│ ├── public
│ │ └── img
│ │ ├── agent_gradio_ui.png
│ │ ├── agent.png
│ │ ├── bg-dark.jpg
│ │ ├── bg-light.jpg
│ │ ├── cli.png
│ │ ├── computer.png
│ │ ├── grounding-with-gemini3.gif
│ │ ├── hero.png
│ │ ├── laminar_trace_example.png
│ │ ├── som_box_threshold.png
│ │ └── som_iou_threshold.png
│ ├── README.md
│ ├── source.config.ts
│ ├── src
│ │ ├── app
│ │ │ ├── (home)
│ │ │ │ ├── [[...slug]]
│ │ │ │ │ └── page.tsx
│ │ │ │ └── layout.tsx
│ │ │ ├── api
│ │ │ │ ├── posthog
│ │ │ │ │ └── [...path]
│ │ │ │ │ └── route.ts
│ │ │ │ └── search
│ │ │ │ └── route.ts
│ │ │ ├── favicon.ico
│ │ │ ├── global.css
│ │ │ ├── layout.config.tsx
│ │ │ ├── layout.tsx
│ │ │ ├── llms.mdx
│ │ │ │ └── [[...slug]]
│ │ │ │ └── route.ts
│ │ │ ├── llms.txt
│ │ │ │ └── route.ts
│ │ │ ├── robots.ts
│ │ │ └── sitemap.ts
│ │ ├── assets
│ │ │ ├── discord-black.svg
│ │ │ ├── discord-white.svg
│ │ │ ├── logo-black.svg
│ │ │ └── logo-white.svg
│ │ ├── components
│ │ │ ├── analytics-tracker.tsx
│ │ │ ├── cookie-consent.tsx
│ │ │ ├── doc-actions-menu.tsx
│ │ │ ├── editable-code-block.tsx
│ │ │ ├── footer.tsx
│ │ │ ├── hero.tsx
│ │ │ ├── iou.tsx
│ │ │ ├── mermaid.tsx
│ │ │ └── page-feedback.tsx
│ │ ├── lib
│ │ │ ├── llms.ts
│ │ │ └── source.ts
│ │ ├── mdx-components.tsx
│ │ └── providers
│ │ └── posthog-provider.tsx
│ └── tsconfig.json
├── examples
│ ├── agent_examples.py
│ ├── agent_ui_examples.py
│ ├── browser_tool_example.py
│ ├── cloud_api_examples.py
│ ├── computer_examples_windows.py
│ ├── computer_examples.py
│ ├── computer_ui_examples.py
│ ├── computer-example-ts
│ │ ├── .env.example
│ │ ├── .gitignore
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── README.md
│ │ ├── src
│ │ │ ├── helpers.ts
│ │ │ └── index.ts
│ │ └── tsconfig.json
│ ├── docker_examples.py
│ ├── evals
│ │ ├── hud_eval_examples.py
│ │ └── wikipedia_most_linked.txt
│ ├── pylume_examples.py
│ ├── sandboxed_functions_examples.py
│ ├── som_examples.py
│ ├── tracing_examples.py
│ ├── utils.py
│ └── winsandbox_example.py
├── img
│ ├── agent_gradio_ui.png
│ ├── agent.png
│ ├── cli.png
│ ├── computer.png
│ ├── logo_black.png
│ └── logo_white.png
├── libs
│ ├── kasm
│ │ ├── Dockerfile
│ │ ├── LICENSE
│ │ ├── README.md
│ │ └── src
│ │ └── ubuntu
│ │ └── install
│ │ └── firefox
│ │ ├── custom_startup.sh
│ │ ├── firefox.desktop
│ │ └── install_firefox.sh
│ ├── lume
│ │ ├── .cursorignore
│ │ ├── CONTRIBUTING.md
│ │ ├── Development.md
│ │ ├── img
│ │ │ └── cli.png
│ │ ├── Package.resolved
│ │ ├── Package.swift
│ │ ├── README.md
│ │ ├── resources
│ │ │ └── lume.entitlements
│ │ ├── scripts
│ │ │ ├── build
│ │ │ │ ├── build-debug.sh
│ │ │ │ ├── build-release-notarized.sh
│ │ │ │ └── build-release.sh
│ │ │ └── install.sh
│ │ ├── src
│ │ │ ├── Commands
│ │ │ │ ├── Clone.swift
│ │ │ │ ├── Config.swift
│ │ │ │ ├── Create.swift
│ │ │ │ ├── Delete.swift
│ │ │ │ ├── Get.swift
│ │ │ │ ├── Images.swift
│ │ │ │ ├── IPSW.swift
│ │ │ │ ├── List.swift
│ │ │ │ ├── Logs.swift
│ │ │ │ ├── Options
│ │ │ │ │ └── FormatOption.swift
│ │ │ │ ├── Prune.swift
│ │ │ │ ├── Pull.swift
│ │ │ │ ├── Push.swift
│ │ │ │ ├── Run.swift
│ │ │ │ ├── Serve.swift
│ │ │ │ ├── Set.swift
│ │ │ │ └── Stop.swift
│ │ │ ├── ContainerRegistry
│ │ │ │ ├── ImageContainerRegistry.swift
│ │ │ │ ├── ImageList.swift
│ │ │ │ └── ImagesPrinter.swift
│ │ │ ├── Errors
│ │ │ │ └── Errors.swift
│ │ │ ├── FileSystem
│ │ │ │ ├── Home.swift
│ │ │ │ ├── Settings.swift
│ │ │ │ ├── VMConfig.swift
│ │ │ │ ├── VMDirectory.swift
│ │ │ │ └── VMLocation.swift
│ │ │ ├── LumeController.swift
│ │ │ ├── Main.swift
│ │ │ ├── Server
│ │ │ │ ├── Handlers.swift
│ │ │ │ ├── HTTP.swift
│ │ │ │ ├── Requests.swift
│ │ │ │ ├── Responses.swift
│ │ │ │ └── Server.swift
│ │ │ ├── Utils
│ │ │ │ ├── CommandRegistry.swift
│ │ │ │ ├── CommandUtils.swift
│ │ │ │ ├── Logger.swift
│ │ │ │ ├── NetworkUtils.swift
│ │ │ │ ├── Path.swift
│ │ │ │ ├── ProcessRunner.swift
│ │ │ │ ├── ProgressLogger.swift
│ │ │ │ ├── String.swift
│ │ │ │ └── Utils.swift
│ │ │ ├── Virtualization
│ │ │ │ ├── DarwinImageLoader.swift
│ │ │ │ ├── DHCPLeaseParser.swift
│ │ │ │ ├── ImageLoaderFactory.swift
│ │ │ │ └── VMVirtualizationService.swift
│ │ │ ├── VM
│ │ │ │ ├── DarwinVM.swift
│ │ │ │ ├── LinuxVM.swift
│ │ │ │ ├── VM.swift
│ │ │ │ ├── VMDetails.swift
│ │ │ │ ├── VMDetailsPrinter.swift
│ │ │ │ ├── VMDisplayResolution.swift
│ │ │ │ └── VMFactory.swift
│ │ │ └── VNC
│ │ │ ├── PassphraseGenerator.swift
│ │ │ └── VNCService.swift
│ │ └── tests
│ │ ├── Mocks
│ │ │ ├── MockVM.swift
│ │ │ ├── MockVMVirtualizationService.swift
│ │ │ └── MockVNCService.swift
│ │ ├── VM
│ │ │ └── VMDetailsPrinterTests.swift
│ │ ├── VMTests.swift
│ │ ├── VMVirtualizationServiceTests.swift
│ │ └── VNCServiceTests.swift
│ ├── lumier
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── bin
│ │ │ └── entry.sh
│ │ ├── config
│ │ │ └── constants.sh
│ │ ├── hooks
│ │ │ └── on-logon.sh
│ │ └── lib
│ │ ├── utils.sh
│ │ └── vm.sh
│ ├── python
│ │ ├── agent
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── agent
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── adapters
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cua_adapter.py
│ │ │ │ │ ├── huggingfacelocal_adapter.py
│ │ │ │ │ ├── human_adapter.py
│ │ │ │ │ ├── mlxvlm_adapter.py
│ │ │ │ │ └── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ └── qwen2_5_vl.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── callbacks
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── budget_manager.py
│ │ │ │ │ ├── image_retention.py
│ │ │ │ │ ├── logging.py
│ │ │ │ │ ├── operator_validator.py
│ │ │ │ │ ├── pii_anonymization.py
│ │ │ │ │ ├── prompt_instructions.py
│ │ │ │ │ ├── telemetry.py
│ │ │ │ │ └── trajectory_saver.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── computers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cua.py
│ │ │ │ │ └── custom.py
│ │ │ │ ├── decorators.py
│ │ │ │ ├── human_tool
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── server.py
│ │ │ │ │ └── ui.py
│ │ │ │ ├── integrations
│ │ │ │ │ └── hud
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── agent.py
│ │ │ │ │ └── proxy.py
│ │ │ │ ├── loops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── anthropic.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── composed_grounded.py
│ │ │ │ │ ├── gelato.py
│ │ │ │ │ ├── gemini.py
│ │ │ │ │ ├── generic_vlm.py
│ │ │ │ │ ├── glm45v.py
│ │ │ │ │ ├── gta1.py
│ │ │ │ │ ├── holo.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── model_types.csv
│ │ │ │ │ ├── moondream3.py
│ │ │ │ │ ├── omniparser.py
│ │ │ │ │ ├── openai.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ ├── uiins.py
│ │ │ │ │ ├── uitars.py
│ │ │ │ │ └── uitars2.py
│ │ │ │ ├── proxy
│ │ │ │ │ ├── examples.py
│ │ │ │ │ └── handlers.py
│ │ │ │ ├── responses.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── browser_tool.py
│ │ │ │ ├── types.py
│ │ │ │ └── ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ └── gradio
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ └── ui_components.py
│ │ │ ├── benchmarks
│ │ │ │ ├── .gitignore
│ │ │ │ ├── contrib.md
│ │ │ │ ├── interactive.py
│ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── gta1.py
│ │ │ │ ├── README.md
│ │ │ │ ├── ss-pro.py
│ │ │ │ ├── ss-v2.py
│ │ │ │ └── utils.py
│ │ │ ├── example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer_agent.py
│ │ ├── bench-ui
│ │ │ ├── bench_ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ └── child.py
│ │ │ ├── examples
│ │ │ │ ├── folder_example.py
│ │ │ │ ├── gui
│ │ │ │ │ ├── index.html
│ │ │ │ │ ├── logo.svg
│ │ │ │ │ └── styles.css
│ │ │ │ ├── output_overlay.png
│ │ │ │ └── simple_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ └── test_port_detection.py
│ │ ├── computer
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── computer.py
│ │ │ │ ├── diorama_computer.py
│ │ │ │ ├── helpers.py
│ │ │ │ ├── interface
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── models.py
│ │ │ │ ├── providers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cloud
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── docker
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── lume
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── lume_api.py
│ │ │ │ │ ├── lumier
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── types.py
│ │ │ │ │ └── winsandbox
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── provider.py
│ │ │ │ │ └── setup_script.ps1
│ │ │ │ ├── tracing_wrapper.py
│ │ │ │ ├── tracing.py
│ │ │ │ ├── ui
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ └── gradio
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── app.py
│ │ │ │ └── utils.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer.py
│ │ ├── computer-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── browser.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── diorama
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── diorama_computer.py
│ │ │ │ │ ├── diorama.py
│ │ │ │ │ ├── draw.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── safezone.py
│ │ │ │ ├── handlers
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── main.py
│ │ │ │ ├── server.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wallpaper.py
│ │ │ │ └── watchdog.py
│ │ │ ├── examples
│ │ │ │ ├── __init__.py
│ │ │ │ └── usage_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ ├── run_server.py
│ │ │ ├── test_connection.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_server.py
│ │ ├── core
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── core
│ │ │ │ ├── __init__.py
│ │ │ │ └── telemetry
│ │ │ │ ├── __init__.py
│ │ │ │ └── posthog.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_telemetry.py
│ │ ├── mcp-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── build-extension.py
│ │ │ ├── CONCURRENT_SESSIONS.md
│ │ │ ├── desktop-extension
│ │ │ │ ├── cua-extension.mcpb
│ │ │ │ ├── desktop_extension.png
│ │ │ │ ├── manifest.json
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_server.sh
│ │ │ │ └── setup.py
│ │ │ ├── mcp_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── server.py
│ │ │ │ └── session_manager.py
│ │ │ ├── pdm.lock
│ │ │ ├── pyproject.toml
│ │ │ ├── QUICK_TEST_COMMANDS.sh
│ │ │ ├── quick_test_local_option.py
│ │ │ ├── README.md
│ │ │ ├── scripts
│ │ │ │ ├── install_mcp_server.sh
│ │ │ │ └── start_mcp_server.sh
│ │ │ ├── test_mcp_server_local_option.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_mcp_server.py
│ │ ├── pylume
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_pylume.py
│ │ └── som
│ │ ├── .bumpversion.cfg
│ │ ├── LICENSE
│ │ ├── poetry.toml
│ │ ├── pyproject.toml
│ │ ├── README.md
│ │ ├── som
│ │ │ ├── __init__.py
│ │ │ ├── detect.py
│ │ │ ├── detection.py
│ │ │ ├── models.py
│ │ │ ├── ocr.py
│ │ │ ├── util
│ │ │ │ └── utils.py
│ │ │ └── visualization.py
│ │ └── tests
│ │ ├── conftest.py
│ │ └── test_omniparser.py
│ ├── qemu-docker
│ │ ├── linux
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── src
│ │ │ ├── entry.sh
│ │ │ └── vm
│ │ │ ├── image
│ │ │ │ └── README.md
│ │ │ └── setup
│ │ │ ├── install.sh
│ │ │ ├── setup-cua-server.sh
│ │ │ └── setup.sh
│ │ ├── README.md
│ │ └── windows
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── entry.sh
│ │ └── vm
│ │ ├── image
│ │ │ └── README.md
│ │ └── setup
│ │ ├── install.bat
│ │ ├── on-logon.ps1
│ │ ├── setup-cua-server.ps1
│ │ ├── setup-utils.psm1
│ │ └── setup.ps1
│ ├── typescript
│ │ ├── .gitignore
│ │ ├── .nvmrc
│ │ ├── agent
│ │ │ ├── examples
│ │ │ │ ├── playground-example.html
│ │ │ │ └── README.md
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── client.ts
│ │ │ │ ├── index.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ └── client.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── computer
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── computer
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── providers
│ │ │ │ │ │ ├── base.ts
│ │ │ │ │ │ ├── cloud.ts
│ │ │ │ │ │ └── index.ts
│ │ │ │ │ └── types.ts
│ │ │ │ ├── index.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── base.ts
│ │ │ │ │ ├── factory.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── linux.ts
│ │ │ │ │ ├── macos.ts
│ │ │ │ │ └── windows.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ ├── computer
│ │ │ │ │ └── cloud.test.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── factory.test.ts
│ │ │ │ │ ├── index.test.ts
│ │ │ │ │ ├── linux.test.ts
│ │ │ │ │ ├── macos.test.ts
│ │ │ │ │ └── windows.test.ts
│ │ │ │ └── setup.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── core
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── index.ts
│ │ │ │ └── telemetry
│ │ │ │ ├── clients
│ │ │ │ │ ├── index.ts
│ │ │ │ │ └── posthog.ts
│ │ │ │ └── index.ts
│ │ │ ├── tests
│ │ │ │ └── telemetry.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── cua-cli
│ │ │ ├── .gitignore
│ │ │ ├── .prettierrc
│ │ │ ├── bun.lock
│ │ │ ├── CLAUDE.md
│ │ │ ├── index.ts
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── auth.ts
│ │ │ │ ├── cli.ts
│ │ │ │ ├── commands
│ │ │ │ │ ├── auth.ts
│ │ │ │ │ └── sandbox.ts
│ │ │ │ ├── config.ts
│ │ │ │ ├── http.ts
│ │ │ │ ├── storage.ts
│ │ │ │ └── util.ts
│ │ │ └── tsconfig.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── pnpm-workspace.yaml
│ │ └── README.md
│ └── xfce
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Development.md
│ ├── Dockerfile
│ ├── Dockerfile.dev
│ ├── README.md
│ └── src
│ ├── scripts
│ │ ├── resize-display.sh
│ │ ├── start-computer-server.sh
│ │ ├── start-novnc.sh
│ │ ├── start-vnc.sh
│ │ └── xstartup.sh
│ ├── supervisor
│ │ └── supervisord.conf
│ └── xfce-config
│ ├── helpers.rc
│ ├── xfce4-power-manager.xml
│ └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│ ├── agent_nb.ipynb
│ ├── blog
│ │ ├── build-your-own-operator-on-macos-1.ipynb
│ │ └── build-your-own-operator-on-macos-2.ipynb
│ ├── composite_agents_docker_nb.ipynb
│ ├── computer_nb.ipynb
│ ├── computer_server_nb.ipynb
│ ├── customizing_computeragent.ipynb
│ ├── eval_osworld.ipynb
│ ├── ollama_nb.ipynb
│ ├── README.md
│ ├── sota_hackathon_cloud.ipynb
│ └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── install-cli.ps1
│ ├── install-cli.sh
│ ├── playground-docker.sh
│ ├── playground.sh
│ ├── run-docker-dev.sh
│ └── typescript-typecheck.js
├── TESTING.md
├── tests
│ ├── agent_loop_testing
│ │ ├── agent_test.py
│ │ └── README.md
│ ├── pytest.ini
│ ├── shell_cmd.py
│ ├── test_files.py
│ ├── test_mcp_server_session_management.py
│ ├── test_mcp_server_streaming.py
│ ├── test_shell_bash.py
│ ├── test_telemetry.py
│ ├── test_tracing.py
│ ├── test_venv.py
│ └── test_watchdog.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/libs/python/mcp-server/CONCURRENT_SESSIONS.md:
--------------------------------------------------------------------------------
```markdown
1 | # MCP Server Concurrent Session Management
2 |
3 | This document describes the improvements made to the MCP Server to address concurrent session management and resource lifecycle issues.
4 |
5 | ## Problem Statement
6 |
7 | The original MCP server implementation had several critical issues:
8 |
9 | 1. **Global Computer Instance**: Used a single `global_computer` variable shared across all clients
10 | 2. **No Resource Isolation**: Multiple clients would interfere with each other
11 | 3. **Sequential Task Processing**: Multi-task operations were always sequential
12 | 4. **No Graceful Shutdown**: The server couldn't properly clean up resources on shutdown
13 | 5. **Hidden Event Loop**: `server.run()` hid the event loop, preventing proper lifecycle management
14 |
15 | ## Solution Architecture
16 |
17 | ### 1. Session Manager (`session_manager.py`)
18 |
19 | The `SessionManager` class provides:
20 |
21 | - **Per-session computer instances**: Each client gets isolated computer resources
22 | - **Computer instance pooling**: Efficient reuse of computer instances with lifecycle management
23 | - **Task registration**: Track active tasks per session for graceful cleanup
24 | - **Automatic cleanup**: Background task cleans up idle sessions
25 | - **Resource limits**: Configurable maximum concurrent sessions
26 |
27 | #### Key Components:
28 |
29 | ```python
30 | class SessionManager:
31 | def __init__(self, max_concurrent_sessions: int = 10):
32 | self._sessions: Dict[str, SessionInfo] = {}
33 | self._computer_pool = ComputerPool()
34 | # ... lifecycle management
35 | ```
36 |
37 | #### Session Lifecycle:
38 |
39 | 1. **Creation**: New session created when client first connects
40 | 2. **Task Registration**: Each task is registered with the session
41 | 3. **Activity Tracking**: Last activity time updated on each operation
42 | 4. **Cleanup**: Sessions cleaned up when idle or on shutdown
43 |
44 | ### 2. Computer Pool (`ComputerPool`)
45 |
46 | Manages computer instances efficiently:
47 |
48 | - **Pool Size Limits**: Maximum number of concurrent computer instances
49 | - **Instance Reuse**: Available instances reused across sessions
50 | - **Lifecycle Management**: Proper startup/shutdown of computer instances
51 | - **Resource Cleanup**: All instances properly closed on shutdown
52 |
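For illustration only, the pooling behavior described above can be approximated with a bounded `asyncio` structure. The names below (`SimplePool`, `acquire`, `release`, `factory`) are illustrative stand-ins, not the actual `ComputerPool` API:

```python
import asyncio
from typing import Any, Callable, List

class SimplePool:
    """Illustrative bounded pool: reuse idle instances, cap total count."""

    def __init__(self, factory: Callable, max_size: int = 5):
        self._factory = factory                  # creates a new computer instance
        self._idle: List[Any] = []               # instances ready for reuse
        self._sem = asyncio.Semaphore(max_size)  # caps concurrent instances

    async def acquire(self) -> Any:
        await self._sem.acquire()
        return self._idle.pop() if self._idle else await self._factory()

    def release(self, instance: Any) -> None:
        self._idle.append(instance)              # return to the pool for reuse
        self._sem.release()
```
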
53 | ### 3. Enhanced Server Tools
54 |
55 | All server tools now support:
56 |
57 | - **Session ID Parameter**: Optional `session_id` for multi-client support
58 | - **Resource Isolation**: Each session gets its own computer instance
59 | - **Task Tracking**: Proper registration/unregistration of tasks
60 | - **Error Handling**: Graceful error handling with session cleanup
61 |
62 | #### Updated Tool Signatures:
63 |
64 | ```python
65 | async def screenshot_cua(ctx: Context, session_id: Optional[str] = None) -> Any:
66 | async def run_cua_task(ctx: Context, task: str, session_id: Optional[str] = None) -> Any:
67 | async def run_multi_cua_tasks(ctx: Context, tasks: List[str], session_id: Optional[str] = None, concurrent: bool = False) -> Any:
68 | ```
69 |
70 | ### 4. Concurrent Task Execution
71 |
72 | The `run_multi_cua_tasks` tool now supports:
73 |
74 | - **Sequential Mode** (default): Tasks run one after another
75 | - **Concurrent Mode**: Tasks run in parallel using `asyncio.gather()`
76 | - **Progress Tracking**: Proper progress reporting for both modes
77 | - **Error Handling**: Individual task failures don't stop other tasks
78 |
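Conceptually, the two modes differ only in how the per-task coroutines are awaited. In this sketch, `run_single_task` is a stand-in for the real task runner, not the actual implementation:

```python
import asyncio
from typing import List

async def run_single_task(task: str):
    ...  # stand-in for executing one CUA task

async def run_tasks(tasks: List[str], concurrent: bool = False) -> list:
    if concurrent:
        # Parallel execution; return_exceptions=True keeps one failing
        # task from cancelling the others.
        return await asyncio.gather(
            *(run_single_task(t) for t in tasks), return_exceptions=True
        )
    results = []
    for t in tasks:
        results.append(await run_single_task(t))  # one after another
    return results
```
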
79 | ### 5. Graceful Shutdown
80 |
81 | The server now provides:
82 |
83 | - **Signal Handlers**: Proper handling of SIGINT and SIGTERM
84 | - **Session Cleanup**: All active sessions properly cleaned up
85 | - **Resource Release**: Computer instances returned to pool and closed
86 | - **Async Lifecycle**: Event loop properly exposed for cleanup
87 |
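The shutdown flow can be sketched as follows; `cleanup_all_sessions` is a hypothetical stand-in for the session manager's cleanup, and the real server wires these handlers into its own lifecycle:

```python
import asyncio
import signal

async def cleanup_all_sessions() -> None:
    ...  # hypothetical: close sessions, return computers to the pool

async def main() -> None:
    loop = asyncio.get_running_loop()
    stop = asyncio.Event()
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, stop.set)  # request graceful shutdown
    # ... serve requests until a shutdown signal arrives ...
    await stop.wait()
    await cleanup_all_sessions()

asyncio.run(main())
```
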
88 | ## Usage Examples
89 |
90 | ### Basic Usage (Backward Compatible)
91 |
92 | ```python
93 | # These calls work exactly as before
94 | await screenshot_cua(ctx)
95 | await run_cua_task(ctx, "Open browser")
96 | await run_multi_cua_tasks(ctx, ["Task 1", "Task 2"])
97 | ```
98 |
99 | ### Multi-Client Usage
100 |
101 | ```python
102 | # Client 1
103 | session_id_1 = "client-1-session"
104 | await screenshot_cua(ctx, session_id_1)
105 | await run_cua_task(ctx, "Open browser", session_id_1)
106 |
107 | # Client 2 (completely isolated)
108 | session_id_2 = "client-2-session"
109 | await screenshot_cua(ctx, session_id_2)
110 | await run_cua_task(ctx, "Open editor", session_id_2)
111 | ```
112 |
113 | ### Concurrent Task Execution
114 |
115 | ```python
116 | # Run tasks concurrently instead of sequentially
117 | tasks = ["Open browser", "Open editor", "Open terminal"]
118 | results = await run_multi_cua_tasks(ctx, tasks, concurrent=True)
119 | ```
120 |
121 | ### Session Management
122 |
123 | ```python
124 | # Get session statistics
125 | stats = await get_session_stats(ctx)
126 | print(f"Active sessions: {stats['total_sessions']}")
127 |
128 | # Cleanup specific session
129 | await cleanup_session(ctx, "session-to-cleanup")
130 | ```
131 |
132 | ## Configuration
133 |
134 | ### Environment Variables
135 |
136 | - `CUA_MODEL_NAME`: Model to use (default: `anthropic/claude-sonnet-4-5-20250929`)
137 | - `CUA_MAX_IMAGES`: Maximum images to keep (default: `3`)
138 |
139 | ### Session Manager Configuration
140 |
141 | ```python
142 | # In session_manager.py
143 | class SessionManager:
144 | def __init__(self, max_concurrent_sessions: int = 10):
145 | # Configurable maximum concurrent sessions
146 |
147 | class ComputerPool:
148 | def __init__(self, max_size: int = 5, idle_timeout: float = 300.0):
149 | # Configurable pool size and idle timeout
150 | ```
151 |
152 | ## Performance Improvements
153 |
154 | ### Before (Issues):
155 |
156 | - ❌ Single global computer instance
157 | - ❌ Client interference and resource conflicts
158 | - ❌ Sequential task processing only
159 | - ❌ No graceful shutdown
160 | - ❌ 30s timeout issues with long-running tasks
161 |
162 | ### After (Benefits):
163 |
164 | - ✅ Per-session computer instances with proper isolation
165 | - ✅ Computer instance pooling for efficient resource usage
166 | - ✅ Concurrent task execution support
167 | - ✅ Graceful shutdown with proper cleanup
168 | - ✅ Streaming updates prevent timeout issues
169 | - ✅ Configurable resource limits
170 | - ✅ Automatic session cleanup
171 |
172 | ## Testing
173 |
174 | Comprehensive test coverage includes:
175 |
176 | - Session creation and reuse
177 | - Concurrent session isolation
178 | - Task registration and cleanup
179 | - Error handling with session management
180 | - Concurrent vs sequential task execution
181 | - Session statistics and cleanup
182 |
183 | Run tests with:
184 |
185 | ```bash
186 | pytest tests/test_mcp_server_session_management.py -v
187 | ```
188 |
189 | ## Migration Guide
190 |
191 | ### For Existing Clients
192 |
193 | No changes required! The new implementation is fully backward compatible:
194 |
195 | ```python
196 | # This still works exactly as before
197 | await run_cua_task(ctx, "My task")
198 | ```
199 |
200 | ### For New Multi-Client Applications
201 |
202 | Use session IDs for proper isolation:
203 |
204 | ```python
205 | # Create a unique session ID for each client
206 | session_id = str(uuid.uuid4())
207 | await run_cua_task(ctx, "My task", session_id)
208 | ```
209 |
210 | ### For Concurrent Task Execution
211 |
212 | Enable concurrent mode for better performance:
213 |
214 | ```python
215 | tasks = ["Task 1", "Task 2", "Task 3"]
216 | results = await run_multi_cua_tasks(ctx, tasks, concurrent=True)
217 | ```
218 |
219 | ## Monitoring and Debugging
220 |
221 | ### Session Statistics
222 |
223 | ```python
224 | stats = await get_session_stats(ctx)
225 | print(f"Total sessions: {stats['total_sessions']}")
226 | print(f"Max concurrent: {stats['max_concurrent']}")
227 | for session_id, session_info in stats['sessions'].items():
228 | print(f"Session {session_id}: {session_info['active_tasks']} active tasks")
229 | ```
230 |
231 | ### Logging
232 |
233 | The server provides detailed logging for:
234 |
235 | - Session creation and cleanup
236 | - Task registration and completion
237 | - Resource pool usage
238 | - Error conditions and recovery
239 |
240 | ### Graceful Shutdown
241 |
242 | The server properly handles shutdown signals:
243 |
244 | ```bash
245 | # Send SIGTERM for graceful shutdown
246 | kill -TERM <server_pid>
247 |
248 | # Or use Ctrl+C (SIGINT)
249 | ```
250 |
251 | ## Future Enhancements
252 |
253 | Potential future improvements:
254 |
255 | 1. **Session Persistence**: Save/restore session state across restarts
256 | 2. **Load Balancing**: Distribute sessions across multiple server instances
257 | 3. **Resource Monitoring**: Real-time monitoring of resource usage
258 | 4. **Auto-scaling**: Dynamic adjustment of pool size based on demand
259 | 5. **Session Timeouts**: Configurable timeouts for different session types
260 |
```
--------------------------------------------------------------------------------
/blog/human-in-the-loop.md:
--------------------------------------------------------------------------------
```markdown
1 | # When Agents Need Human Wisdom - Introducing Human-In-The-Loop Support
2 |
3 | _Published on August 29, 2025 by Francesco Bonacci_
4 |
5 | Sometimes the best AI agent is a human. Whether you're creating training demonstrations, evaluating complex scenarios, or stepping in when automation hits a wall, our new Human-In-The-Loop integration puts you directly in control.
6 |
7 | With yesterday's [HUD evaluation integration](hud-agent-evals.md), you could benchmark any agent at scale. Today's update lets you _become_ the agent when it matters most—seamlessly switching between automated intelligence and human judgment.
8 |
9 | <div align="center">
10 | <video src="https://github.com/user-attachments/assets/9091b50f-26e7-4981-95ce-40e5d42a1260" width="600" controls></video>
11 | </div>
12 |
13 | ## What you get
14 |
15 | - **One-line human takeover** for any agent configuration with `human/human` or `model+human/human`
16 | - **Interactive web UI** to see what your agent sees and control what it does
17 | - **Zero context switching** - step in exactly where automation left off
18 | - **Training data generation** - create perfect demonstrations by doing tasks yourself
19 | - **Ground truth evaluation** - validate agent performance with human expertise
20 |
21 | ## Why Human-In-The-Loop?
22 |
23 | Even the most sophisticated agents encounter edge cases, ambiguous interfaces, or tasks requiring human judgment. Rather than failing gracefully, they can now fail _intelligently_—by asking for human help.
24 |
25 | This approach bridges the gap between fully automated systems and pure manual control, letting you:
26 |
27 | - **Demonstrate complex workflows** that agents can learn from
28 | - **Evaluate tricky scenarios** where ground truth requires human assessment
29 | - **Intervene selectively** when automated agents need guidance
30 | - **Test and debug** your tools and environments manually
31 |
32 | ## Getting Started
33 |
34 | Launch the human agent interface:
35 |
36 | ```bash
37 | python -m agent.human_tool
38 | ```
39 |
40 | The web UI will show pending completions. Click any completion to take control of the agent and see exactly what it sees.
41 |
42 | ## Usage Examples
43 |
44 | ### Direct Human Control
45 |
46 | Perfect for creating demonstrations or when you want full manual control:
47 |
48 | ```python
49 | from agent import ComputerAgent
50 | from agent.computer import computer
51 |
52 | agent = ComputerAgent(
53 | "human/human",
54 | tools=[computer]
55 | )
56 |
57 | # You'll get full control through the web UI
58 | async for _ in agent.run("Take a screenshot, analyze the UI, and click on the most prominent button"):
59 | pass
60 | ```
61 |
62 | ### Hybrid: AI Planning + Human Execution
63 |
64 | Combine model intelligence with human precision—let AI plan, then execute manually:
65 |
66 | ```python
67 | agent = ComputerAgent(
68 | "huggingface-local/HelloKKMe/GTA1-7B+human/human",
69 | tools=[computer]
70 | )
71 |
72 | # AI creates the plan, human executes each step
73 | async for _ in agent.run("Navigate to the settings page and enable dark mode"):
74 | pass
75 | ```
76 |
77 | ### Fallback Pattern
78 |
79 | Start automated, escalate to human when needed:
80 |
81 | ```python
82 | # Primary automated agent
83 | primary_agent = ComputerAgent("openai/computer-use-preview", tools=[computer])
84 |
85 | # Human fallback agent
86 | fallback_agent = ComputerAgent("human/human", tools=[computer])
87 |
88 | try:
89 | async for result in primary_agent.run(task):
90 | if result.confidence < 0.7: # Low confidence threshold
91 | # Seamlessly hand off to human
92 | async for _ in fallback_agent.run(f"Continue this task: {task}"):
93 | pass
94 | except Exception:
95 | # Agent failed, human takes over
96 | async for _ in fallback_agent.run(f"Handle this failed task: {task}"):
97 | pass
98 | ```
99 |
100 | ## Interactive Features
101 |
102 | The human-in-the-loop interface provides a rich, responsive experience:
103 |
104 | ### **Visual Environment**
105 |
106 | - **Screenshot display** with live updates as you work
107 | - **Click handlers** for direct interaction with UI elements
108 | - **Zoom and pan** to see details clearly
109 |
110 | ### **Action Controls**
111 |
112 | - **Click actions** - precise cursor positioning and clicking
113 | - **Keyboard input** - type text naturally or send specific key combinations
114 | - **Action history** - see the sequence of actions taken
115 | - **Undo support** - step back when needed
116 |
117 | ### **Tool Integration**
118 |
119 | - **Full OpenAI compatibility** - standard tool call format
120 | - **Custom tools** - integrate your own tools seamlessly
121 | - **Real-time feedback** - see tool responses immediately
122 |
123 | ### **Smart Polling**
124 |
125 | - **Responsive updates** - UI refreshes when new completions arrive
126 | - **Background processing** - continue working while waiting for tasks
127 | - **Session persistence** - resume interrupted sessions
128 |
129 | ## Real-World Use Cases
130 |
131 | ### **Training Data Generation**
132 |
133 | Create perfect demonstrations for fine-tuning:
134 |
135 | ```python
136 | # Generate training examples for spreadsheet tasks
137 | demo_agent = ComputerAgent("human/human", tools=[computer])
138 |
139 | tasks = [
140 | "Create a budget spreadsheet with income and expense categories",
141 | "Apply conditional formatting to highlight overbudget items",
142 | "Generate a pie chart showing expense distribution"
143 | ]
144 |
145 | for task in tasks:
146 | # Human demonstrates each task perfectly
147 | async for _ in demo_agent.run(task):
148 | pass # Recorded actions become training data
149 | ```
150 |
151 | ### **Evaluation and Ground Truth**
152 |
153 | Validate agent performance on complex scenarios:
154 |
155 | ```python
156 | # Human evaluates agent performance
157 | evaluator = ComputerAgent("human/human", tools=[computer])
158 |
159 | async for _ in evaluator.run("Review this completed form and rate accuracy (1-10)"):
160 | pass # Human provides authoritative quality assessment
161 | ```
162 |
163 | ### **Interactive Debugging**
164 |
165 | Step through agent behavior manually:
166 |
167 | ```python
168 | # Test a workflow step by step
169 | debug_agent = ComputerAgent("human/human", tools=[computer])
170 |
171 | async for _ in debug_agent.run("Reproduce the agent's failed login sequence"):
172 | pass # Human identifies exactly where automation breaks
173 | ```
174 |
175 | ### **Edge Case Handling**
176 |
177 | Handle scenarios that break automated agents:
178 |
179 | ```python
180 | # Complex UI interaction requiring human judgment
181 | edge_case_agent = ComputerAgent("human/human", tools=[computer])
182 |
183 | async for _ in edge_case_agent.run("Navigate this CAPTCHA-protected form"):
184 | pass # Human handles what automation cannot
185 | ```
186 |
187 | ## Configuration Options
188 |
189 | Customize the human agent experience:
190 |
191 | - **UI refresh rate**: Adjust polling frequency for your workflow
192 | - **Image quality**: Balance detail vs. performance for screenshots
193 | - **Action logging**: Save detailed traces for analysis and training
194 | - **Session timeout**: Configure idle timeouts for security
195 | - **Tool permissions**: Restrict which tools humans can access
196 |
197 | ## When to Use Human-In-The-Loop
198 |
199 | | **Scenario** | **Why Human Control** |
200 | | ---------------------------- | ----------------------------------------------------- |
201 | | **Creating training data** | Perfect demonstrations for model fine-tuning |
202 | | **Evaluating complex tasks** | Human judgment for subjective or nuanced assessment |
203 | | **Handling edge cases** | CAPTCHAs, unusual UIs, context-dependent decisions |
204 | | **Debugging workflows** | Step through failures to identify breaking points |
205 | | **High-stakes operations** | Critical tasks requiring human oversight and approval |
206 | | **Testing new environments** | Validate tools and environments work as expected |
207 |
208 | ## Learn More
209 |
210 | - **Interactive examples**: Try human-in-the-loop control with sample tasks
211 | - **Training data pipelines**: Learn how to convert human demonstrations into model training data
212 | - **Evaluation frameworks**: Build human-validated test suites for your agents
213 | - **API documentation**: Full reference for human agent configuration
214 |
215 | Ready to put humans back in the loop? The most sophisticated AI system knows when to ask for help.
216 |
217 | ---
218 |
219 | _Questions about human-in-the-loop agents? Join the conversation in our [Discord community](https://discord.gg/cua-ai) or check out our [documentation](https://cua.ai/docs/agent-sdk/supported-agents/human-in-the-loop)._
220 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/human_tool/server.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import uuid
3 | from dataclasses import asdict, dataclass
4 | from datetime import datetime
5 | from enum import Enum
6 | from typing import Any, Dict, List, Optional
7 |
8 | from fastapi import FastAPI, HTTPException
9 | from pydantic import BaseModel
10 |
11 |
12 | class CompletionStatus(str, Enum):
13 | PENDING = "pending"
14 | COMPLETED = "completed"
15 | FAILED = "failed"
16 |
17 |
18 | @dataclass
19 | class CompletionCall:
20 | id: str
21 | messages: List[Dict[str, Any]]
22 | model: str
23 | status: CompletionStatus
24 | created_at: datetime
25 | completed_at: Optional[datetime] = None
26 | response: Optional[str] = None
27 | tool_calls: Optional[List[Dict[str, Any]]] = None
28 | error: Optional[str] = None
29 |
30 |
31 | class ToolCall(BaseModel):
32 | id: str
33 | type: str = "function"
34 | function: Dict[str, Any]
35 |
36 |
37 | class CompletionRequest(BaseModel):
38 | messages: List[Dict[str, Any]]
39 | model: str
40 |
41 |
42 | class CompletionResponse(BaseModel):
43 | response: Optional[str] = None
44 | tool_calls: Optional[List[Dict[str, Any]]] = None
45 |
46 |
47 | class CompletionQueue:
48 | def __init__(self):
49 | self._queue: Dict[str, CompletionCall] = {}
50 | self._pending_order: List[str] = []
51 | self._lock = asyncio.Lock()
52 |
53 | async def add_completion(self, messages: List[Dict[str, Any]], model: str) -> str:
54 | """Add a completion call to the queue."""
55 | async with self._lock:
56 | call_id = str(uuid.uuid4())
57 | completion_call = CompletionCall(
58 | id=call_id,
59 | messages=messages,
60 | model=model,
61 | status=CompletionStatus.PENDING,
62 | created_at=datetime.now(),
63 | )
64 | self._queue[call_id] = completion_call
65 | self._pending_order.append(call_id)
66 | return call_id
67 |
68 | async def get_pending_calls(self) -> List[Dict[str, Any]]:
69 | """Get all pending completion calls."""
70 | async with self._lock:
71 | pending_calls = []
72 | for call_id in self._pending_order:
73 | if (
74 | call_id in self._queue
75 | and self._queue[call_id].status == CompletionStatus.PENDING
76 | ):
77 | call = self._queue[call_id]
78 | pending_calls.append(
79 | {
80 | "id": call.id,
81 | "model": call.model,
82 | "created_at": call.created_at.isoformat(),
83 | "messages": call.messages,
84 | }
85 | )
86 | return pending_calls
87 |
88 | async def get_call_status(self, call_id: str) -> Optional[Dict[str, Any]]:
89 | """Get the status of a specific completion call."""
90 | async with self._lock:
91 | if call_id not in self._queue:
92 | return None
93 |
94 | call = self._queue[call_id]
95 | result = {
96 | "id": call.id,
97 | "status": call.status.value,
98 | "created_at": call.created_at.isoformat(),
99 | "model": call.model,
100 | "messages": call.messages,
101 | }
102 |
103 | if call.completed_at:
104 | result["completed_at"] = call.completed_at.isoformat()
105 | if call.response:
106 | result["response"] = call.response
107 | if call.tool_calls:
108 | result["tool_calls"] = call.tool_calls
109 | if call.error:
110 | result["error"] = call.error
111 |
112 | return result
113 |
114 | async def complete_call(
115 | self,
116 | call_id: str,
117 | response: Optional[str] = None,
118 | tool_calls: Optional[List[Dict[str, Any]]] = None,
119 | ) -> bool:
120 | """Mark a completion call as completed with a response or tool calls."""
121 | async with self._lock:
122 | if call_id not in self._queue:
123 | return False
124 |
125 | call = self._queue[call_id]
126 | if call.status != CompletionStatus.PENDING:
127 | return False
128 |
129 | call.status = CompletionStatus.COMPLETED
130 | call.completed_at = datetime.now()
131 | call.response = response
132 | call.tool_calls = tool_calls
133 |
134 | # Remove from pending order
135 | if call_id in self._pending_order:
136 | self._pending_order.remove(call_id)
137 |
138 | return True
139 |
140 | async def fail_call(self, call_id: str, error: str) -> bool:
141 | """Mark a completion call as failed with an error."""
142 | async with self._lock:
143 | if call_id not in self._queue:
144 | return False
145 |
146 | call = self._queue[call_id]
147 | if call.status != CompletionStatus.PENDING:
148 | return False
149 |
150 | call.status = CompletionStatus.FAILED
151 | call.completed_at = datetime.now()
152 | call.error = error
153 |
154 | # Remove from pending order
155 | if call_id in self._pending_order:
156 | self._pending_order.remove(call_id)
157 |
158 | return True
159 |
160 | async def wait_for_completion(self, call_id: str, timeout: float = 300.0) -> Optional[str]:
161 | """Wait for a completion call to be completed and return the response."""
162 | start_time = asyncio.get_event_loop().time()
163 |
164 | while True:
165 | status = await self.get_call_status(call_id)
166 | if not status:
167 | return None
168 |
169 | if status["status"] == CompletionStatus.COMPLETED.value:
170 | return status.get("response")
171 | elif status["status"] == CompletionStatus.FAILED.value:
172 | raise Exception(f"Completion failed: {status.get('error', 'Unknown error')}")
173 |
174 | # Check timeout
175 | if asyncio.get_event_loop().time() - start_time > timeout:
176 | await self.fail_call(call_id, "Timeout waiting for human response")
177 | raise TimeoutError("Timeout waiting for human response")
178 |
179 | # Wait a bit before checking again
180 | await asyncio.sleep(0.5)
181 |
182 |
183 | # Global queue instance
184 | completion_queue = CompletionQueue()
185 |
186 | # FastAPI app
187 | app = FastAPI(title="Human Completion Server", version="1.0.0")
188 |
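# Typical flow against the endpoints below (illustrative; values are examples,
# and the server listens on port 8002 when launched via __main__):
#   1. An agent queues a completion:  POST /queue              {"messages": [...], "model": "human/human"}
#   2. The web UI polls for work:     GET  /pending
#   3. A human submits the result:    POST /complete/{call_id} {"response": "...", "tool_calls": [...]}
#      (or reports a failure:         POST /fail/{call_id}     {"error": "..."})
#   4. The agent reads the outcome:   GET  /status/{call_id}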
189 |
190 | @app.post("/queue", response_model=Dict[str, str])
191 | async def queue_completion(request: CompletionRequest):
192 | """Add a completion request to the queue."""
193 | call_id = await completion_queue.add_completion(request.messages, request.model)
194 | return {"id": call_id, "status": "queued"}
195 |
196 |
197 | @app.get("/pending")
198 | async def list_pending():
199 | """List all pending completion calls."""
200 | pending_calls = await completion_queue.get_pending_calls()
201 | return {"pending_calls": pending_calls}
202 |
203 |
204 | @app.get("/status/{call_id}")
205 | async def get_status(call_id: str):
206 | """Get the status of a specific completion call."""
207 | status = await completion_queue.get_call_status(call_id)
208 | if not status:
209 | raise HTTPException(status_code=404, detail="Completion call not found")
210 | return status
211 |
212 |
213 | @app.post("/complete/{call_id}")
214 | async def complete_call(call_id: str, response: CompletionResponse):
215 | """Complete a call with a human response."""
216 | success = await completion_queue.complete_call(
217 | call_id, response=response.response, tool_calls=response.tool_calls
218 | )
219 | if success:
220 | return {"status": "success", "message": "Call completed"}
221 | else:
222 | raise HTTPException(status_code=404, detail="Call not found or already completed")
223 |
224 |
225 | @app.post("/fail/{call_id}")
226 | async def fail_call(call_id: str, error: Dict[str, str]):
227 | """Mark a call as failed."""
228 | success = await completion_queue.fail_call(call_id, error.get("error", "Unknown error"))
229 | if not success:
230 | raise HTTPException(
231 | status_code=404, detail="Completion call not found or already completed"
232 | )
233 | return {"status": "failed"}
234 |
235 |
236 | @app.get("/")
237 | async def root():
238 | """Root endpoint."""
239 | return {"message": "Human Completion Server is running"}
240 |
241 |
242 | if __name__ == "__main__":
243 | import uvicorn
244 |
245 | uvicorn.run(app, host="0.0.0.0", port=8002)
246 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/computers/custom.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Custom computer handler implementation that accepts a dictionary of functions.
3 | """
4 |
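# Illustrative usage (only 'screenshot' is required; unimplemented actions are no-ops):
#
#     handler = CustomComputerHandler({
#         "screenshot": lambda: Image.new("RGB", (1920, 1080)),
#         "click": lambda x, y, button="left": print(f"click {x},{y} ({button})"),
#         "dimensions": (1920, 1080),
#     })
#
# Values may be sync or async callables, or plain values (e.g. a fixed dimensions tuple).
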
5 | import base64
6 | import io
7 | from typing import Any, Callable, Dict, List, Literal, Optional, Union
8 |
9 | from PIL import Image
10 |
11 | from .base import AsyncComputerHandler
12 |
13 |
14 | class CustomComputerHandler(AsyncComputerHandler):
15 | """Computer handler that implements the Computer protocol using a dictionary of custom functions."""
16 |
17 | def __init__(self, functions: Dict[str, Callable]):
18 | """
19 | Initialize with a dictionary of functions.
20 |
21 | Args:
22 | functions: Dictionary where keys are method names and values are callable functions.
23 | Only 'screenshot' is required, all others are optional.
24 |
25 | Raises:
26 | ValueError: If required 'screenshot' function is not provided.
27 | """
28 | if "screenshot" not in functions:
29 | raise ValueError("'screenshot' function is required in functions dictionary")
30 |
31 | self.functions = functions
32 | self._last_screenshot_size: Optional[tuple[int, int]] = None
33 |
34 | async def _call_function(self, func, *args, **kwargs):
35 | """
36 | Call a function, handling both async and sync functions.
37 |
38 | Args:
39 | func: The function to call
40 | *args: Positional arguments to pass to the function
41 | **kwargs: Keyword arguments to pass to the function
42 |
43 | Returns:
44 | The result of the function call
45 | """
46 | import asyncio
47 | import inspect
48 |
49 | if callable(func):
50 | if inspect.iscoroutinefunction(func):
51 | return await func(*args, **kwargs)
52 | else:
53 | return func(*args, **kwargs)
54 | else:
55 | return func
56 |
57 | async def _get_value(self, attribute: str):
58 | """
59 | Get value for an attribute, checking both 'get_{attribute}' and '{attribute}' keys.
60 |
61 | Args:
62 | attribute: The attribute name to look for
63 |
64 | Returns:
65 | The value from the functions dict, called if callable, returned directly if not
66 | """
67 | # Check for 'get_{attribute}' first
68 | get_key = f"get_{attribute}"
69 | if get_key in self.functions:
70 | return await self._call_function(self.functions[get_key])
71 |
72 | # Check for '{attribute}'
73 | if attribute in self.functions:
74 | return await self._call_function(self.functions[attribute])
75 |
76 | return None
77 |
78 | def _to_b64_str(self, img: Union[bytes, Image.Image, str]) -> str:
79 | """
80 | Convert image to base64 string.
81 |
82 | Args:
83 | img: Image as bytes, PIL Image, or base64 string
84 |
85 | Returns:
86 | str: Base64 encoded image string
87 | """
88 | if isinstance(img, str):
89 | # Already a base64 string
90 | return img
91 | elif isinstance(img, bytes):
92 | # Raw bytes
93 | return base64.b64encode(img).decode("utf-8")
94 | elif isinstance(img, Image.Image):
95 | # PIL Image
96 | buffer = io.BytesIO()
97 | img.save(buffer, format="PNG")
98 | return base64.b64encode(buffer.getvalue()).decode("utf-8")
99 | else:
100 | raise ValueError(f"Unsupported image type: {type(img)}")
101 |
102 | # ==== Computer-Use-Preview Action Space ====
103 |
104 | async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
105 | """Get the current environment type."""
106 | result = await self._get_value("environment")
107 | if result is None:
108 | return "linux"
109 | assert result in ["windows", "mac", "linux", "browser"]
110 | return result # type: ignore
111 |
112 | async def get_dimensions(self) -> tuple[int, int]:
113 | """Get screen dimensions as (width, height)."""
114 | result = await self._get_value("dimensions")
115 | if result is not None:
116 | return result # type: ignore
117 |
118 | # Fallback: use last screenshot size if available
119 | if not self._last_screenshot_size:
120 | await self.screenshot()
121 | assert self._last_screenshot_size is not None, "Failed to get screenshot size"
122 |
123 | return self._last_screenshot_size
124 |
125 | async def screenshot(self, text: Optional[str] = None) -> str:
126 | """Take a screenshot and return as base64 string.
127 |
128 | Args:
129 | text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
130 | """
131 | result = await self._call_function(self.functions["screenshot"])
132 | b64_str = self._to_b64_str(result) # type: ignore
133 |
134 | # Try to extract dimensions for fallback use
135 | try:
136 | if isinstance(result, Image.Image):
137 | self._last_screenshot_size = result.size
138 | elif isinstance(result, bytes):
139 | # Try to decode bytes to get dimensions
140 | img = Image.open(io.BytesIO(result))
141 | self._last_screenshot_size = img.size
142 | except Exception:
143 | # If we can't get dimensions, that's okay
144 | pass
145 |
146 | return b64_str
147 |
148 | async def click(self, x: int, y: int, button: str = "left") -> None:
149 | """Click at coordinates with specified button."""
150 | if "click" in self.functions:
151 | await self._call_function(self.functions["click"], x, y, button)
152 | # No-op if not implemented
153 |
154 | async def double_click(self, x: int, y: int) -> None:
155 | """Double click at coordinates."""
156 | if "double_click" in self.functions:
157 | await self._call_function(self.functions["double_click"], x, y)
158 | # No-op if not implemented
159 |
160 | async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
161 | """Scroll at coordinates with specified scroll amounts."""
162 | if "scroll" in self.functions:
163 | await self._call_function(self.functions["scroll"], x, y, scroll_x, scroll_y)
164 | # No-op if not implemented
165 |
166 | async def type(self, text: str) -> None:
167 | """Type text."""
168 | if "type" in self.functions:
169 | await self._call_function(self.functions["type"], text)
170 | # No-op if not implemented
171 |
172 | async def wait(self, ms: int = 1000) -> None:
173 | """Wait for specified milliseconds."""
174 | if "wait" in self.functions:
175 | await self._call_function(self.functions["wait"], ms)
176 | else:
177 | # Default implementation
178 | import asyncio
179 |
180 | await asyncio.sleep(ms / 1000.0)
181 |
182 | async def move(self, x: int, y: int) -> None:
183 | """Move cursor to coordinates."""
184 | if "move" in self.functions:
185 | await self._call_function(self.functions["move"], x, y)
186 | # No-op if not implemented
187 |
188 | async def keypress(self, keys: Union[List[str], str]) -> None:
189 | """Press key combination."""
190 | if "keypress" in self.functions:
191 | await self._call_function(self.functions["keypress"], keys)
192 | # No-op if not implemented
193 |
194 | async def drag(self, path: List[Dict[str, int]]) -> None:
195 | """Drag along specified path."""
196 | if "drag" in self.functions:
197 | await self._call_function(self.functions["drag"], path)
198 | # No-op if not implemented
199 |
200 | async def get_current_url(self) -> str:
201 | """Get current URL (for browser environments)."""
202 | if "get_current_url" in self.functions:
203 | return await self._get_value("current_url") # type: ignore
204 | return "" # Default fallback
205 |
206 | async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
207 | """Left mouse down at coordinates."""
208 | if "left_mouse_down" in self.functions:
209 | await self._call_function(self.functions["left_mouse_down"], x, y)
210 | # No-op if not implemented
211 |
212 | async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
213 | """Left mouse up at coordinates."""
214 | if "left_mouse_up" in self.functions:
215 | await self._call_function(self.functions["left_mouse_up"], x, y)
216 | # No-op if not implemented
217 |
```
--------------------------------------------------------------------------------
/libs/typescript/core/src/telemetry/clients/posthog.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * Telemetry client using PostHog for collecting anonymous usage data.
3 | */
4 |
5 | import * as fs from 'node:fs';
6 | import * as os from 'node:os';
7 | import * as path from 'node:path';
8 | import { pino } from 'pino';
9 | import { PostHog } from 'posthog-node';
10 | import { v4 as uuidv4 } from 'uuid';
11 |
12 | // Controls what percentage of telemetry events are sent
13 | export const TELEMETRY_SAMPLE_RATE = 100; // 100% sampling rate
14 |
15 | // Public PostHog config for anonymous telemetry
16 | // These values are intentionally public and meant for anonymous telemetry only
17 | // https://posthog.com/docs/product-analytics/troubleshooting#is-it-ok-for-my-api-key-to-be-exposed-and-public
18 | export const PUBLIC_POSTHOG_API_KEY = 'phc_eSkLnbLxsnYFaXksif1ksbrNzYlJShr35miFLDppF14';
19 | export const PUBLIC_POSTHOG_HOST = 'https://eu.i.posthog.com';
20 |
21 | export class PostHogTelemetryClient {
22 | private config: {
23 | enabled: boolean;
24 | sampleRate: number;
25 | posthog: { apiKey: string; host: string };
26 | };
27 | private installationId: string;
28 | private initialized = false;
29 | private queuedEvents: {
30 | name: string;
31 | properties: Record<string, unknown>;
32 | timestamp: number;
33 | }[] = [];
34 | private startTime: number; // seconds
35 | private posthogClient?: PostHog;
36 | private counters: Record<string, number> = {};
37 |
38 | private logger = pino({ name: 'core.telemetry' });
39 |
40 | constructor() {
41 | // set up config
42 | this.config = {
43 | enabled: true,
44 | sampleRate: TELEMETRY_SAMPLE_RATE,
45 | posthog: { apiKey: PUBLIC_POSTHOG_API_KEY, host: PUBLIC_POSTHOG_HOST },
46 | };
47 | // Check CUA_TELEMETRY_ENABLED environment variable (defaults to enabled)
48 | const telemetryEnabled = ['1', 'true', 'yes', 'on'].includes(
49 | process.env.CUA_TELEMETRY_ENABLED?.toLowerCase() || 'true'
50 | );
51 |
52 | this.config.enabled = telemetryEnabled;
53 | this.config.sampleRate = Number.parseFloat(
54 | process.env.CUA_TELEMETRY_SAMPLE_RATE || String(TELEMETRY_SAMPLE_RATE)
55 | );
56 | // init client
57 | this.installationId = this._getOrCreateInstallationId();
58 | this.startTime = Date.now() / 1000; // Convert to seconds
59 |
60 | // Log telemetry status on startup
61 | if (this.config.enabled) {
62 | this.logger.info(`Telemetry enabled (sampling at ${this.config.sampleRate}%)`);
63 | // Initialize PostHog client if config is available
64 | this._initializePosthog();
65 | } else {
66 | this.logger.info('Telemetry disabled');
67 | }
68 | }
69 |
70 | /**
71 | * Get or create a random installation ID.
72 | * This ID is not tied to any personal information.
73 | */
74 | private _getOrCreateInstallationId(): string {
75 | const homeDir = os.homedir();
76 | const idFile = path.join(homeDir, '.cua', 'installation_id');
77 |
78 | try {
79 | if (fs.existsSync(idFile)) {
80 | return fs.readFileSync(idFile, 'utf-8').trim();
81 | }
82 | } catch (error) {
83 | this.logger.debug(`Failed to read installation ID: ${error}`);
84 | }
85 |
86 | // Create new ID if not exists
87 | const newId = uuidv4();
88 | try {
89 | const dir = path.dirname(idFile);
90 | if (!fs.existsSync(dir)) {
91 | fs.mkdirSync(dir, { recursive: true });
92 | }
93 | fs.writeFileSync(idFile, newId);
94 | return newId;
95 | } catch (error) {
96 | this.logger.debug(`Failed to write installation ID: ${error}`);
97 | }
98 |
99 | // Fallback to in-memory ID if file operations fail
100 | return newId;
101 | }
102 |
103 | /**
104 | * Initialize the PostHog client with configuration.
105 | */
106 | private _initializePosthog(): boolean {
107 | if (this.initialized) {
108 | return true;
109 | }
110 |
111 | try {
112 | this.posthogClient = new PostHog(this.config.posthog.apiKey, {
113 | host: this.config.posthog.host,
114 | flushAt: 20, // Number of events to batch before sending
115 | flushInterval: 30000, // Send events every 30 seconds
116 | });
117 | this.initialized = true;
118 | this.logger.debug('PostHog client initialized successfully');
119 |
120 | // Process any queued events
121 | this._processQueuedEvents();
122 | return true;
123 | } catch (error) {
124 | this.logger.error(`Failed to initialize PostHog client: ${error}`);
125 | return false;
126 | }
127 | }
128 |
129 | /**
130 | * Process any events that were queued before initialization.
131 | */
132 | private _processQueuedEvents(): void {
133 | if (!this.posthogClient || this.queuedEvents.length === 0) {
134 | return;
135 | }
136 |
137 | for (const event of this.queuedEvents) {
138 | this._captureEvent(event.name, event.properties);
139 | }
140 | this.queuedEvents = [];
141 | }
142 |
143 | /**
144 | * Capture an event with PostHog.
145 | */
146 | private _captureEvent(eventName: string, properties?: Record<string, unknown>): void {
147 | if (!this.posthogClient) {
148 | return;
149 | }
150 |
151 | try {
152 | // Add standard properties
153 | const eventProperties = {
154 | ...properties,
155 | version: process.env.npm_package_version || 'unknown',
156 | platform: process.platform,
157 | node_version: process.version,
158 | is_ci: this._isCI,
159 | };
160 |
161 | this.posthogClient.capture({
162 | distinctId: this.installationId,
163 | event: eventName,
164 | properties: eventProperties,
165 | });
166 | } catch (error) {
167 | this.logger.debug(`Failed to capture event: ${error}`);
168 | }
169 | }
170 |
171 | private get _isCI(): boolean {
172 | /**
173 | * Detect if running in CI environment.
174 | */
175 | return !!(
176 | process.env.CI ||
177 | process.env.CONTINUOUS_INTEGRATION ||
178 | process.env.GITHUB_ACTIONS ||
179 | process.env.GITLAB_CI ||
180 | process.env.CIRCLECI ||
181 | process.env.TRAVIS ||
182 | process.env.JENKINS_URL
183 | );
184 | }
185 |
186 | increment(counterName: string, value = 1) {
187 | /**
188 | * Increment a named counter.
189 | */
190 | if (!this.config.enabled) {
191 | return;
192 | }
193 |
194 | if (!(counterName in this.counters)) {
195 | this.counters[counterName] = 0;
196 | }
197 | this.counters[counterName] += value;
198 | }
199 |
200 | recordEvent(eventName: string, properties?: Record<string, unknown>): void {
201 | /**
202 | * Record an event with optional properties.
203 | */
204 | if (!this.config.enabled) {
205 | return;
206 | }
207 |
208 | // Increment counter for this event type
209 | const counterKey = `event:${eventName}`;
210 | this.increment(counterKey);
211 |
212 | // Apply sampling
213 | if (Math.random() * 100 > this.config.sampleRate) {
214 | return;
215 | }
216 |
217 | const event = {
218 | name: eventName,
219 | properties: properties || {},
220 | timestamp: Date.now() / 1000,
221 | };
222 |
223 | if (this.initialized && this.posthogClient) {
224 | this._captureEvent(eventName, properties);
225 | } else {
226 | // Queue event if not initialized
227 | this.queuedEvents.push(event);
228 | // Try to initialize again
229 | if (this.config.enabled && !this.initialized) {
230 | this._initializePosthog();
231 | }
232 | }
233 | }
234 |
235 | /**
236 | * Flush any pending events to PostHog.
237 | */
238 | async flush(): Promise<boolean> {
239 | if (!this.config.enabled || !this.posthogClient) {
240 | return false;
241 | }
242 |
243 | try {
244 | // Send counter data as a single event
245 | if (Object.keys(this.counters).length > 0) {
246 | this._captureEvent('telemetry_counters', {
247 | counters: { ...this.counters },
248 | duration: Date.now() / 1000 - this.startTime,
249 | });
250 | }
251 |
252 | await this.posthogClient.flush();
253 | this.logger.debug('Telemetry flushed successfully');
254 |
255 | // Clear counters after sending
256 | this.counters = {};
257 | return true;
258 | } catch (error) {
259 | this.logger.debug(`Failed to flush telemetry: ${error}`);
260 | return false;
261 | }
262 | }
263 |
264 | enable(): void {
265 | /**
266 | * Enable telemetry collection.
267 | */
268 | this.config.enabled = true;
269 | this.logger.info('Telemetry enabled');
270 | if (!this.initialized) {
271 | this._initializePosthog();
272 | }
273 | }
274 |
275 | async disable(): Promise<void> {
276 | /**
277 | * Disable telemetry collection.
278 | */
279 | this.config.enabled = false;
280 | await this.posthogClient?.disable();
281 | this.logger.info('Telemetry disabled');
282 | }
283 |
284 | get enabled(): boolean {
285 | /**
286 | * Check if telemetry is enabled.
287 | */
288 | return this.config.enabled;
289 | }
290 |
291 | async shutdown(): Promise<void> {
292 | /**
293 | * Shutdown the telemetry client and flush any pending events.
294 | */
295 | if (this.posthogClient) {
296 | await this.flush();
297 | await this.posthogClient.shutdown();
298 | this.initialized = false;
299 | this.posthogClient = undefined;
300 | }
301 | }
302 | }
303 |
```
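The client above defaults telemetry to enabled and reads `CUA_TELEMETRY_ENABLED` when it is constructed. A minimal opt-out sketch, assuming the Python cua packages honor the same environment variable as this TypeScript client (that cross-language assumption is not shown in this file):

```python
import os

# Opt out of anonymous telemetry before importing/initializing cua packages.
# Assumption: the Python SDKs check the same CUA_TELEMETRY_ENABLED flag that the
# TypeScript client above reads ('1', 'true', 'yes', 'on' keep it enabled).
os.environ["CUA_TELEMETRY_ENABLED"] = "false"
```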
--------------------------------------------------------------------------------
/docs/src/components/editable-code-block.tsx:
--------------------------------------------------------------------------------
```typescript
1 | 'use client';
2 |
3 | import React, { createContext, useContext, useState, ReactNode } from 'react';
4 | import * as Base from 'fumadocs-ui/components/codeblock';
5 | import { cn } from 'fumadocs-ui/utils/cn';
6 |
7 | /**
8 | * Context for managing editable values within code blocks
9 | */
10 | interface EditableCodeContextValue {
11 | values: Record<string, string>;
12 | updateValue: (key: string, value: string) => void;
13 | }
14 |
15 | const EditableCodeContext = createContext<EditableCodeContextValue | null>(null);
16 |
17 | /**
18 | * Hook to access the editable code context
19 | */
20 | function useEditableCode() {
21 | const context = useContext(EditableCodeContext);
22 | if (!context) {
23 | throw new Error('useEditableCode must be used within EditableCodeBlock');
24 | }
25 | return context;
26 | }
27 |
28 | /**
29 | * Props for EditableCodeBlock component
30 | */
31 | interface EditableCodeBlockProps {
32 | /** Programming language for styling */
33 | lang?: string;
34 | /** Initial values for placeholders */
35 | defaultValues?: Record<string, string>;
36 | /** Code content with embedded EditableValue components */
37 | children: ReactNode;
38 | /** Additional CSS classes */
39 | className?: string;
40 | /** Title for the code block */
41 | title?: string;
42 | }
43 |
44 | /**
45 | * Code block component that supports inline editable values
46 | * Uses fumadocs-ui styling with interactive input fields
47 | */
48 | export function EditableCodeBlock({
49 | lang = 'python',
50 | defaultValues = {},
51 | children,
52 | className,
53 | title,
54 | }: EditableCodeBlockProps) {
55 | const [values, setValues] = useState<Record<string, string>>(defaultValues);
56 |
57 | const updateValue = (key: string, value: string) => {
58 | setValues((prev) => ({ ...prev, [key]: value }));
59 | };
60 |
61 | return (
62 | <EditableCodeContext.Provider value={{ values, updateValue }}>
63 | <Base.CodeBlock title={title} className={cn('my-4', className)}>
64 | <Base.Pre className={cn(`language-${lang}`, 'px-3')}>
65 | <code
66 | className={cn(`language-${lang}`)}
67 | style={{ display: 'block', whiteSpace: 'pre-wrap' }}
68 | >
69 | {children}
70 | </code>
71 | </Base.Pre>
72 | </Base.CodeBlock>
73 | </EditableCodeContext.Provider>
74 | );
75 | }
76 |
77 | /**
78 | * Props for EditableValue component
79 | */
80 | interface EditableValueProps {
81 | /** Unique identifier for this value */
82 | placeholder: string;
83 | /** Display width in characters (default: auto) */
84 | width?: number;
85 | /** Optional default value */
86 | defaultValue?: string;
87 | /** Input type */
88 | type?: 'text' | 'password';
89 | }
90 |
91 | /**
92 | * Inline editable input that blends with code styling
93 | * Appears as an underlined, hoverable value within code
94 | */
95 | export function EditableValue({
96 | placeholder,
97 | width: explicitWidth,
98 | defaultValue = '',
99 | type = 'text',
100 | }: EditableValueProps) {
101 | const { values, updateValue } = useEditableCode();
102 | const value = values[placeholder] ?? defaultValue;
103 | const spanRef = React.useRef<HTMLSpanElement>(null);
104 | const placeholderSpanRef = React.useRef<HTMLSpanElement>(null);
105 | const inputRef = React.useRef<HTMLInputElement>(null);
106 | const [measuredWidth, setMeasuredWidth] = React.useState(0);
107 | const [placeholderWidth, setPlaceholderWidth] = React.useState(0);
108 | const [isHovered, setIsHovered] = React.useState(false);
109 | const [tooltipPosition, setTooltipPosition] = React.useState({ top: 0, left: 0 });
110 | const [isVisible, setIsVisible] = React.useState(false);
111 |
112 | // Observe visibility changes to trigger remeasurement
113 | React.useEffect(() => {
114 | if (!inputRef.current) return;
115 |
116 | const observer = new IntersectionObserver(
117 | (entries) => {
118 | entries.forEach((entry) => {
119 | setIsVisible(entry.isIntersecting);
120 | });
121 | },
122 | { threshold: 0.01 }
123 | );
124 |
125 | observer.observe(inputRef.current);
126 |
127 | return () => {
128 | observer.disconnect();
129 | };
130 | }, []);
131 |
132 | // Measure the actual text width using a hidden span
133 | React.useEffect(() => {
134 | if (spanRef.current && isVisible) {
135 | setMeasuredWidth(spanRef.current.offsetWidth);
136 | }
137 | }, [value, isVisible]);
138 |
139 | // Measure placeholder width when visible
140 | React.useEffect(() => {
141 | if (placeholderSpanRef.current && isVisible) {
142 | setPlaceholderWidth(placeholderSpanRef.current.offsetWidth);
143 | }
144 | }, [placeholder, isVisible]);
145 |
146 | // Update tooltip position when hovered
147 | React.useEffect(() => {
148 | if (isHovered && inputRef.current) {
149 | const rect = inputRef.current.getBoundingClientRect();
150 | setTooltipPosition({
151 | top: rect.top - 28,
152 | left: rect.left + rect.width / 2,
153 | });
154 | }
155 | }, [isHovered]);
156 |
157 | const inputWidth = explicitWidth
158 | ? `${explicitWidth}ch`
159 | : `${Math.max(placeholderWidth, measuredWidth, 80)}px`;
160 |
161 | return (
162 | <span
163 | style={{ display: 'inline', whiteSpace: 'nowrap', position: 'relative' }}
164 | onMouseEnter={() => setIsHovered(true)}
165 | onMouseLeave={() => setIsHovered(false)}
166 | >
167 | {/* Hidden span to measure current value width */}
168 | <span
169 | ref={spanRef}
170 | style={{
171 | position: 'absolute',
172 | visibility: 'hidden',
173 | whiteSpace: 'pre',
174 | fontFamily: 'inherit',
175 | pointerEvents: 'none',
176 | }}
177 | aria-hidden="true"
178 | >
179 | {value}
180 | </span>
181 |
182 | {/* Hidden span to measure placeholder width */}
183 | <span
184 | ref={placeholderSpanRef}
185 | style={{
186 | position: 'absolute',
187 | visibility: 'hidden',
188 | whiteSpace: 'pre',
189 | fontFamily: 'inherit',
190 | pointerEvents: 'none',
191 | }}
192 | aria-hidden="true"
193 | >
194 | {placeholder}
195 | </span>
196 |
197 | {/* Tooltip */}
198 | <span
199 | style={{
200 | position: 'fixed',
201 | top: tooltipPosition.top,
202 | left: tooltipPosition.left,
203 | transform: 'translateX(-50%)',
204 | padding: '4px 8px',
205 | backgroundColor: 'rgba(0, 0, 0, 0.8)',
206 | color: 'white',
207 | fontSize: '12px',
208 | borderRadius: '4px',
209 | whiteSpace: 'nowrap',
210 | pointerEvents: 'none',
211 | opacity: isHovered ? 1 : 0,
212 | transition: 'opacity 0.2s ease-in-out',
213 | zIndex: 9999,
214 | }}
215 | >
216 | Edit me!
217 | </span>
218 |
219 | <input
220 | ref={inputRef}
221 | type={type}
222 | value={value}
223 | onChange={(e) => updateValue(placeholder, e.target.value)}
224 | placeholder={placeholder}
225 | className={cn(type === 'password' && value && 'text-security-disc')}
226 | style={{
227 | display: 'inline',
228 | width: inputWidth,
229 | verticalAlign: 'baseline',
230 | lineHeight: 'inherit',
231 | fontSize: 'inherit',
232 | fontFamily: 'inherit',
233 | height: 'auto',
234 | padding: 0,
235 | margin: 0,
236 | background: 'transparent',
237 | border: 'none',
238 | borderBottom: '2px dashed rgba(96, 165, 250, 0.5)',
239 | outline: 'none',
240 | color: 'inherit',
241 | transition: 'border-bottom-color 0.2s ease-in-out',
242 | }}
243 | />
244 | </span>
245 | );
246 | }
247 |
248 | /**
249 | * Container for form inputs outside the code block
250 | */
251 | export function EditableForm({
252 | children,
253 | className = '',
254 | }: {
255 | children: ReactNode;
256 | className?: string;
257 | }) {
258 | return (
259 | <div
260 | className={cn(
261 | 'p-4 border rounded-lg bg-fd-secondary/50 dark:bg-fd-secondary/30 mb-6',
262 | className
263 | )}
264 | >
265 | <h3 className="text-lg font-semibold mb-4">Configuration</h3>
266 | {children}
267 | </div>
268 | );
269 | }
270 |
271 | /**
272 | * Form input for editing values outside code block
273 | */
274 | interface EditableInputProps {
275 | /** Placeholder key to bind to */
276 | placeholder: string;
277 | /** Label text */
278 | label: string;
279 | /** Input type */
280 | type?: 'text' | 'email' | 'password';
281 | /** Custom class name */
282 | className?: string;
283 | }
284 |
285 | export function EditableInput({
286 | placeholder,
287 | label,
288 | type = 'text',
289 | className = '',
290 | }: EditableInputProps) {
291 | const { values, updateValue } = useEditableCode();
292 | const value = values[placeholder] || '';
293 |
294 | return (
295 | <div className={cn('mb-4', className)}>
296 | <label className="block text-sm font-medium mb-2">{label}</label>
297 | <input
298 | type={type}
299 | value={value}
300 | onChange={(e) => updateValue(placeholder, e.target.value)}
301 | placeholder={placeholder}
302 | className={cn(
303 | 'w-full px-3 py-2 border rounded-md',
304 | 'focus:outline-none focus:ring-2 focus:ring-blue-500',
305 | 'bg-fd-background border-fd-border'
306 | )}
307 | />
308 | </div>
309 | );
310 | }
311 |
```
--------------------------------------------------------------------------------
/tests/test_tracing.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Tests for Computer.tracing functionality.
3 | """
4 |
5 | import asyncio
6 | import json
7 | import tempfile
8 | from pathlib import Path
9 |
10 | import pytest
11 | from computer.tracing import ComputerTracing
12 |
13 |
14 | class MockComputer:
15 | """Mock computer for testing tracing functionality."""
16 |
17 | def __init__(self):
18 | self.os_type = "macos"
19 | self.provider_type = "lume"
20 | self.image = "test-image"
21 | self.interface = MockInterface()
22 | self.logger = MockLogger()
23 |
24 |
25 | class MockInterface:
26 | """Mock interface for testing."""
27 |
28 | async def screenshot(self):
29 | """Return mock screenshot data."""
30 | return b"mock_screenshot_data"
31 |
32 | async def get_accessibility_tree(self):
33 | """Return mock accessibility tree."""
34 | return {"type": "window", "children": []}
35 |
36 |
37 | class MockLogger:
38 | """Mock logger for testing."""
39 |
40 | def warning(self, message):
41 | print(f"Warning: {message}")
42 |
43 |
44 | @pytest.mark.asyncio
45 | async def test_tracing_start_stop():
46 | """Test basic start and stop functionality."""
47 | computer = MockComputer()
48 | tracing = ComputerTracing(computer)
49 |
50 | # Test initial state
51 | assert not tracing.is_tracing
52 |
53 | # Start tracing
54 | with tempfile.TemporaryDirectory() as temp_dir:
55 | await tracing.start({"screenshots": True, "api_calls": True, "path": temp_dir})
56 |
57 | # Test tracing is active
58 | assert tracing.is_tracing
59 |
60 | # Stop tracing
61 | trace_path = await tracing.stop({"format": "dir"})
62 |
63 | # Test tracing is stopped
64 | assert not tracing.is_tracing
65 |
66 | # Verify trace directory exists
67 | assert Path(trace_path).exists()
68 |
69 | # Verify metadata file exists
70 | metadata_file = Path(trace_path) / "trace_metadata.json"
71 | assert metadata_file.exists()
72 |
73 | # Verify metadata content
74 | with open(metadata_file) as f:
75 | metadata = json.load(f)
76 | assert "trace_id" in metadata
77 | assert "config" in metadata
78 | assert "start_time" in metadata
79 | assert "end_time" in metadata
80 |
81 |
82 | @pytest.mark.asyncio
83 | async def test_tracing_api_call_recording():
84 | """Test API call recording functionality."""
85 | computer = MockComputer()
86 | tracing = ComputerTracing(computer)
87 |
88 | with tempfile.TemporaryDirectory() as temp_dir:
89 | await tracing.start({"api_calls": True, "screenshots": False, "path": temp_dir})
90 |
91 | # Record an API call
92 | await tracing.record_api_call("left_click", {"x": 100, "y": 200}, result=None, error=None)
93 |
94 | # Record another API call with error
95 | test_error = Exception("Test error")
96 | await tracing.record_api_call("type_text", {"text": "test"}, result=None, error=test_error)
97 |
98 | trace_path = await tracing.stop({"format": "dir"})
99 |
100 | # Verify event files were created
101 | trace_dir = Path(trace_path)
102 | event_files = list(trace_dir.glob("event_*_api_call.json"))
103 | assert len(event_files) >= 2
104 |
105 | # Verify event content
106 | with open(event_files[0]) as f:
107 | event = json.load(f)
108 | assert event["type"] == "api_call"
109 | assert event["data"]["method"] == "left_click"
110 | assert event["data"]["success"] is True
111 |
112 |
113 | @pytest.mark.asyncio
114 | async def test_tracing_metadata():
115 | """Test metadata recording functionality."""
116 | computer = MockComputer()
117 | tracing = ComputerTracing(computer)
118 |
119 | with tempfile.TemporaryDirectory() as temp_dir:
120 | await tracing.start({"metadata": True, "path": temp_dir})
121 |
122 | # Add custom metadata
123 | await tracing.add_metadata("test_key", "test_value")
124 | await tracing.add_metadata("numeric_key", 42)
125 | await tracing.add_metadata("complex_key", {"nested": "data"})
126 |
127 | trace_path = await tracing.stop({"format": "dir"})
128 |
129 | # Verify metadata event files
130 | trace_dir = Path(trace_path)
131 | metadata_files = list(trace_dir.glob("event_*_metadata.json"))
132 | assert len(metadata_files) >= 3
133 |
134 |
135 | @pytest.mark.asyncio
136 | async def test_tracing_screenshots():
137 | """Test screenshot recording functionality."""
138 | computer = MockComputer()
139 | tracing = ComputerTracing(computer)
140 |
141 | with tempfile.TemporaryDirectory() as temp_dir:
142 | await tracing.start({"screenshots": True, "path": temp_dir})
143 |
144 | # Take a screenshot manually
145 | await tracing._take_screenshot("manual_test")
146 |
147 | trace_path = await tracing.stop({"format": "dir"})
148 |
149 | # Verify screenshot files
150 | trace_dir = Path(trace_path)
151 | screenshot_files = list(trace_dir.glob("*.png"))
152 |         assert len(screenshot_files) >= 2  # At least the initial + manual screenshots (a final one may also be captured)
153 |
154 |
155 | @pytest.mark.asyncio
156 | async def test_tracing_config_options():
157 | """Test different configuration options."""
158 | computer = MockComputer()
159 | tracing = ComputerTracing(computer)
160 |
161 | # Test with minimal config
162 | with tempfile.TemporaryDirectory() as temp_dir:
163 | await tracing.start(
164 | {"screenshots": False, "api_calls": False, "metadata": False, "path": temp_dir}
165 | )
166 |
167 | await tracing.record_api_call("test_call", {})
168 | await tracing.add_metadata("test", "value")
169 |
170 | trace_path = await tracing.stop({"format": "dir"})
171 |
172 | # With everything disabled, should only have basic trace events
173 | trace_dir = Path(trace_path)
174 | event_files = list(trace_dir.glob("event_*.json"))
175 | # Should have trace_start and trace_end events only
176 | assert len(event_files) == 2
177 |
178 |
179 | @pytest.mark.asyncio
180 | async def test_tracing_zip_output():
181 | """Test zip file output format."""
182 | computer = MockComputer()
183 | tracing = ComputerTracing(computer)
184 |
185 | with tempfile.TemporaryDirectory() as temp_dir:
186 | await tracing.start({"screenshots": True, "api_calls": True, "path": temp_dir})
187 |
188 | await tracing.record_api_call("test_call", {"arg": "value"})
189 |
190 | # Stop with zip format
191 | trace_path = await tracing.stop({"format": "zip"})
192 |
193 | # Verify zip file exists
194 | assert Path(trace_path).exists()
195 | assert trace_path.endswith(".zip")
196 |
197 |
198 | @pytest.mark.asyncio
199 | async def test_tracing_accessibility_tree():
200 | """Test accessibility tree recording."""
201 | computer = MockComputer()
202 | tracing = ComputerTracing(computer)
203 |
204 | with tempfile.TemporaryDirectory() as temp_dir:
205 | await tracing.start({"accessibility_tree": True, "path": temp_dir})
206 |
207 | # Record accessibility tree
208 | await tracing.record_accessibility_tree()
209 |
210 | trace_path = await tracing.stop({"format": "dir"})
211 |
212 | # Verify accessibility tree event
213 | trace_dir = Path(trace_path)
214 | tree_files = list(trace_dir.glob("event_*_accessibility_tree.json"))
215 | assert len(tree_files) >= 1
216 |
217 | # Verify content
218 | with open(tree_files[0]) as f:
219 | event = json.load(f)
220 | assert event["type"] == "accessibility_tree"
221 | assert "tree" in event["data"]
222 |
223 |
224 | def test_tracing_errors():
225 | """Test error handling in tracing."""
226 | computer = MockComputer()
227 | tracing = ComputerTracing(computer)
228 |
229 | # Test stop without start
230 | with pytest.raises(RuntimeError, match="Tracing is not active"):
231 | asyncio.run(tracing.stop())
232 |
233 | # Test start when already started
234 | async def test_double_start():
235 | await tracing.start()
236 | with pytest.raises(RuntimeError, match="Tracing is already active"):
237 | await tracing.start()
238 | await tracing.stop()
239 |
240 | asyncio.run(test_double_start())
241 |
242 |
243 | if __name__ == "__main__":
244 | # Run tests directly
245 | import sys
246 |
247 | async def run_tests():
248 | """Run all tests manually."""
249 | tests = [
250 | test_tracing_start_stop,
251 | test_tracing_api_call_recording,
252 | test_tracing_metadata,
253 | test_tracing_screenshots,
254 | test_tracing_config_options,
255 | test_tracing_zip_output,
256 | test_tracing_accessibility_tree,
257 | ]
258 |
259 | print("Running Computer.tracing tests...")
260 |
261 | for test in tests:
262 | try:
263 | await test()
264 | print(f"✓ {test.__name__}")
265 | except Exception as e:
266 | print(f"✗ {test.__name__}: {e}")
267 |
268 | # Run sync tests
269 | try:
270 | test_tracing_errors()
271 | print("✓ test_tracing_errors")
272 | except Exception as e:
273 | print(f"✗ test_tracing_errors: {e}")
274 |
275 | print("Tests completed!")
276 |
277 | asyncio.run(run_tests())
278 |
```
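Beyond the mocked tests, the same tracing API can be pointed at a real `Computer`. A minimal sketch, reusing the `ComputerTracing` wrapper and config keys exercised above; the `Computer` construction itself is illustrative and should be adapted to your provider:

```python
import asyncio

from computer import Computer
from computer.tracing import ComputerTracing


async def main() -> None:
    computer = Computer(os_type="linux")  # illustrative; configure provider/credentials as needed
    await computer.run()
    try:
        tracing = ComputerTracing(computer)
        await tracing.start({"screenshots": True, "api_calls": True, "path": "./traces"})

        # Record activity the same way the tests above do.
        await tracing.record_api_call("left_click", {"x": 100, "y": 200})
        await tracing.add_metadata("task", "smoke-test")

        trace_path = await tracing.stop({"format": "zip"})
        print(f"Trace written to {trace_path}")
    finally:
        await computer.disconnect()


asyncio.run(main())
```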
--------------------------------------------------------------------------------
/docs/content/docs/cli-playbook/commands.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Command Reference
3 | description: Complete reference for all CUA CLI commands
4 | ---
5 |
6 | import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
7 | import { Callout } from 'fumadocs-ui/components/callout';
8 |
9 | ## Overview
10 |
11 | The CUA CLI provides commands for authentication and sandbox management.
12 |
13 | ### Command Styles
14 |
15 | The CLI supports **two command styles** for flexibility:
16 |
17 | **Flat style** (quick & concise):
18 |
19 | ```bash
20 | cua list
21 | cua create --os linux --size small --region north-america
22 | cua start my-sandbox
23 | ```
24 |
25 | **Grouped style** (explicit & clear):
26 |
27 | ```bash
28 | cua sb list # or: cua sandbox list
29 | cua sb create # or: cua sandbox create
30 | cua sb start # or: cua sandbox start
31 | ```
32 |
33 | Both styles work identically - use whichever you prefer!
34 |
35 | ### Available Commands
36 |
37 | - **Authentication** - `cua auth login`, `cua auth env`, `cua auth logout` (also available as flat commands: `cua login`, `cua env`, `cua logout`)
38 | - **Sandbox Management** - `cua list`, `cua create`, `cua get`, `cua start`, `cua stop`, `cua restart`, `cua delete`, `cua vnc`
39 |
40 | ## Authentication Commands
41 |
42 | ### `cua auth login`
43 |
44 | Authenticate with your CUA account using a browser-based OAuth flow.
45 |
46 | ```bash
47 | # Interactive browser login
48 | cua auth login
49 |
50 | # Direct API key login
51 | cua auth login --api-key sk-your-api-key-here
52 |
53 | # Alternative flat style
54 | cua login
55 | cua login --api-key sk-your-api-key-here
56 | ```
57 |
58 | **Options:**
59 |
60 | - `--api-key <key>` - Provide API key directly instead of browser flow
61 |
62 | **Example:**
63 |
64 | ```bash
65 | $ cua auth login
66 | Opening browser for CLI auth...
67 | API key saved
68 | ```
69 |
70 | ### `cua auth env`
71 |
72 | Create or update a `.env` file in the current directory with your CUA API key.
73 |
74 | ```bash
75 | cua auth env
76 |
77 | # Alternative flat style
78 | cua env
79 | ```
80 |
81 | **Example:**
82 |
83 | ```bash
84 | $ cua auth env
85 | Wrote /path/to/your/project/.env
86 | ```
87 |
88 | The generated `.env` file will contain:
89 |
90 | ```
91 | CUA_API_KEY=sk-your-api-key-here
92 | ```
93 |
94 | ### `cua auth logout`
95 |
96 | Remove the stored API key from your system.
97 |
98 | ```bash
99 | cua auth logout
100 |
101 | # Alternative flat style
102 | cua logout
103 | ```
104 |
105 | **Example:**
106 |
107 | ```bash
108 | $ cua auth logout
109 | Logged out
110 | ```
111 |
112 | ## Sandbox Commands
113 |
114 | ### `cua list`
115 |
116 | List all your sandboxes with their current status. Passwords are hidden by default for security.
117 |
118 | ```bash
119 | # List sandboxes (passwords hidden)
120 | cua list
121 |
122 | # Show passwords explicitly
123 | cua list --show-passwords
124 |
125 | # Alternative aliases
126 | cua ls
127 | cua ps
128 | ```
129 |
130 | **Example Output (default, passwords hidden):**
131 |
132 | ```
133 | NAME STATUS HOST
134 | my-dev-sandbox running my-dev-sandbox.sandbox.cua.ai
135 | test-windows stopped test-windows.sandbox.cua.ai
136 | ```
137 |
138 | **Example Output (with --show-passwords):**
139 |
140 | ```
141 | NAME STATUS PASSWORD HOST
142 | my-dev-sandbox running secure-pass-123 my-dev-sandbox.sandbox.cua.ai
143 | test-windows stopped another-pass-456 test-windows.sandbox.cua.ai
144 | ```
145 |
146 | ### `cua create`
147 |
148 | Create a new sandbox.
149 |
150 | ```bash
151 | cua create --os <OS> --size <SIZE> --region <REGION>
152 | ```
153 |
154 | **Required Options:**
155 |
156 | - `--os` - Operating system: `linux`, `windows`, `macos`
157 | - `--size` - Sandbox size: `small`, `medium`, `large`
158 | - `--region` - Region: `north-america`, `europe`, `asia-pacific`, `south-america`
159 |
160 | **Examples:**
161 |
162 | ```bash
163 | # Create a small Linux sandbox in North America
164 | cua create --os linux --size small --region north-america
165 |
166 | # Create a medium Windows sandbox in Europe
167 | cua create --os windows --size medium --region europe
168 |
169 | # Create a large macOS sandbox in Asia Pacific
170 | cua create --os macos --size large --region asia-pacific
171 | ```
172 |
173 | **Response Types:**
174 |
175 | **Immediate (Status 200):**
176 |
177 | ```bash
178 | Sandbox created and ready: my-new-sandbox-abc123
179 | Password: secure-password-here
180 | Host: my-new-sandbox-abc123.sandbox.cua.ai
181 | ```
182 |
183 | **Provisioning (Status 202):**
184 |
185 | ```bash
186 | Sandbox provisioning started: my-new-sandbox-abc123
187 | Job ID: job-xyz789
188 | Use 'cua list' to monitor provisioning progress
189 | ```
190 |
191 | ### `cua get`
192 |
193 | Get detailed information about a specific sandbox, including computer-server health status.
194 |
195 | ```bash
196 | cua get <name>
197 |
198 | # With additional options
199 | cua get <name> --json
200 | cua get <name> --show-passwords
201 | cua get <name> --show-vnc-url
202 | ```
203 |
204 | **Options:**
205 |
206 | - `--json` - Output all details in JSON format
207 | - `--show-passwords` - Include password in output
208 | - `--show-vnc-url` - Include computed NoVNC URL
209 |
210 | **Example Output (default):**
211 |
212 | ```bash
213 | $ cua get my-dev-sandbox
214 | Name: my-dev-sandbox
215 | Status: running
216 | Host: my-dev-sandbox.containers.cloud.trycua.com
217 | OS Type: linux
218 | Computer Server Version: 0.1.30
219 | Computer Server Status: healthy
220 | ```
221 |
222 | **Example Output (with --show-passwords and --show-vnc-url):**
223 |
224 | ```bash
225 | $ cua get my-dev-sandbox --show-passwords --show-vnc-url
226 | Name: my-dev-sandbox
227 | Status: running
228 | Host: my-dev-sandbox.containers.cloud.trycua.com
229 | Password: secure-pass-123
230 | OS Type: linux
231 | Computer Server Version: 0.1.30
232 | Computer Server Status: healthy
233 | VNC URL: https://my-dev-sandbox.containers.cloud.trycua.com/vnc.html?autoconnect=true&password=secure-pass-123
234 | ```
235 |
236 | **Example Output (JSON format):**
237 |
238 | ```bash
239 | $ cua get my-dev-sandbox --json
240 | {
241 | "name": "my-dev-sandbox",
242 | "status": "running",
243 | "host": "my-dev-sandbox.containers.cloud.trycua.com",
244 | "os_type": "linux",
245 | "computer_server_version": "0.1.30",
246 | "computer_server_status": "healthy"
247 | }
248 | ```
249 |
250 | **Computer Server Health Check:**
251 |
252 | The `cua get` command automatically probes the computer-server when the sandbox is running:
253 |
254 | - Checks OS type via `https://{host}:8443/status`
255 | - Checks version via `https://{host}:8443/cmd`
256 | - Shows "Computer Server Status: healthy" when both probes succeed
257 | - Uses a 3-second timeout for each probe
258 |
259 | <Callout type="info">
260 | The computer server status is only checked for running sandboxes. Stopped or suspended sandboxes
261 | will not show computer server information.
262 | </Callout>
263 |
264 | ### `cua start`
265 |
266 | Start a stopped sandbox.
267 |
268 | ```bash
269 | cua start <name>
270 | ```
271 |
272 | **Example:**
273 |
274 | ```bash
275 | $ cua start my-dev-sandbox
276 | Start accepted
277 | ```
278 |
279 | ### `cua stop`
280 |
281 | Stop a running sandbox.
282 |
283 | ```bash
284 | cua stop <name>
285 | ```
286 |
287 | **Example:**
288 |
289 | ```bash
290 | $ cua stop my-dev-sandbox
291 | stopping
292 | ```
293 |
294 | ### `cua restart`
295 |
296 | Restart a sandbox.
297 |
298 | ```bash
299 | cua restart <name>
300 | ```
301 |
302 | **Example:**
303 |
304 | ```bash
305 | $ cua restart my-dev-sandbox
306 | restarting
307 | ```
308 |
309 | ### `cua delete`
310 |
311 | Delete a sandbox permanently.
312 |
313 | ```bash
314 | cua delete <name>
315 | ```
316 |
317 | **Example:**
318 |
319 | ```bash
320 | $ cua delete old-test-sandbox
321 | Sandbox deletion initiated: deleting
322 | ```
323 |
324 | <Callout type="warn">
325 | This action is irreversible. All data on the sandbox will be permanently lost.
326 | </Callout>
327 |
328 | ### `cua vnc`
329 |
330 | Open the VNC interface for a sandbox in your browser.
331 |
332 | ```bash
333 | cua vnc <name>
334 |
335 | # Alternative alias
336 | cua open <name>
337 | ```
338 |
339 | **Example:**
340 |
341 | ```bash
342 | $ cua vnc my-dev-sandbox
343 | Opening NoVNC: https://my-dev-sandbox.sandbox.cua.ai/vnc.html?autoconnect=true&password=...
344 | ```
345 |
346 | This command automatically opens your default browser to the VNC interface with the correct password pre-filled.
347 |
348 | ## Global Options
349 |
350 | ### Help
351 |
352 | Get help for any command:
353 |
354 | ```bash
355 | cua --help
356 | cua auth login --help
357 | cua create --help
358 | cua list --help
359 | ```
360 |
361 | ## Error Handling
362 |
363 | The CLI provides clear error messages for common issues:
364 |
365 | ### Authentication Errors
366 |
367 | ```bash
368 | $ cua list
369 | Unauthorized. Try 'cua auth login' again.
370 | ```
371 |
372 | ### Sandbox Not Found
373 |
374 | ```bash
375 | $ cua start nonexistent-sandbox
376 | Sandbox not found
377 | ```
378 |
379 | ### Invalid Configuration
380 |
381 | ```bash
382 | $ cua create --os invalid --size small --region north-america
383 | Invalid request or unsupported configuration
384 | ```
385 |
386 | ## Tips and Best Practices
387 |
388 | ### 1. Use Descriptive Sandbox Names
389 |
390 | ```bash
391 | # Good
392 | cua create --os linux --size small --region north-america
393 | # Then rename or use meaningful names in the dashboard
394 |
395 | # Better workflow
396 | cua list # Check the generated name
397 | # Use that name consistently
398 | ```
399 |
400 | ### 2. Environment Management
401 |
402 | ```bash
403 | # Set up your project with API key
404 | cd my-project
405 | cua auth env
406 | # Now your project has CUA_API_KEY in .env
407 | ```
408 |
409 | ### 3. Quick Sandbox Access
410 |
411 | ```bash
412 | # Create aliases for frequently used sandboxes
413 | alias dev-sandbox="cua vnc my-development-sandbox"
414 | alias prod-sandbox="cua vnc my-production-sandbox"
415 | ```
416 |
417 | ### 4. Monitoring Provisioning
418 |
419 | ```bash
420 | # For sandboxes that need provisioning time
421 | cua create --os windows --size large --region europe
422 | # Sandbox provisioning started: my-sandbox-abc123
423 | # Job ID: job-xyz789
424 |
425 | # Check status periodically
426 | watch -n 5 cua list
427 | ```
428 |
429 | ## Next Steps
430 |
431 | - [Get started with the quickstart guide](/get-started/quickstart#cli-quickstart)
432 | - [Learn about CUA computers](/computer-sdk/computers)
433 | - [Explore agent automation](/agent-sdk/agent-loops)
434 |
```
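The reference above stops at the CLI, but the typical next step is driving a sandbox from code. A minimal sketch, assuming `cua auth env` has already written `CUA_API_KEY` to `.env`; the constructor mirrors the cloud-provider usage in this repo's tests, and the sandbox name is a placeholder to replace with one from `cua list`:

```python
import asyncio
import os

from dotenv import load_dotenv
from computer import Computer, VMProviderType

load_dotenv()  # picks up CUA_API_KEY written by `cua auth env`


async def main() -> None:
    computer = Computer(
        os_type="linux",
        api_key=os.getenv("CUA_API_KEY"),
        name="my-dev-sandbox",  # placeholder: use a name shown by `cua list`
        provider_type=VMProviderType.CLOUD,
    )
    try:
        await computer.run()
        result = await computer.interface.run_command("echo 'hello from the sandbox'")
        print(result)
    finally:
        await computer.disconnect()


asyncio.run(main())
```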
--------------------------------------------------------------------------------
/libs/python/computer/computer/diorama_computer.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 |
3 | from .interface.models import Key, KeyType
4 |
5 |
6 | class DioramaComputer:
7 | """
8 | A Computer-compatible proxy for Diorama that sends commands over the ComputerInterface.
9 | """
10 |
11 | def __init__(self, computer, apps):
12 | """
13 | Initialize the DioramaComputer with a computer instance and list of apps.
14 |
15 | Args:
16 | computer: The computer instance to proxy commands through
17 | apps: List of applications available in the diorama environment
18 | """
19 | self.computer = computer
20 | self.apps = apps
21 | self.interface = DioramaComputerInterface(computer, apps)
22 | self._initialized = False
23 |
24 | async def __aenter__(self):
25 | """
26 | Async context manager entry point.
27 |
28 | Returns:
29 | self: The DioramaComputer instance
30 | """
31 | self._initialized = True
32 | return self
33 |
34 | async def run(self):
35 | """
36 | Initialize and run the DioramaComputer if not already initialized.
37 |
38 | Returns:
39 | self: The DioramaComputer instance
40 | """
41 | if not self._initialized:
42 | await self.__aenter__()
43 | return self
44 |
45 |
46 | class DioramaComputerInterface:
47 | """
48 | Diorama Interface proxy that sends diorama_cmds via the Computer's interface.
49 | """
50 |
51 | def __init__(self, computer, apps):
52 | """
53 | Initialize the DioramaComputerInterface.
54 |
55 | Args:
56 | computer: The computer instance to send commands through
57 | apps: List of applications available in the diorama environment
58 | """
59 | self.computer = computer
60 | self.apps = apps
61 | self._scene_size = None
62 |
63 | async def _send_cmd(self, action, arguments=None):
64 | """
65 | Send a command to the diorama interface through the computer.
66 |
67 | Args:
68 | action (str): The action/command to execute
69 | arguments (dict, optional): Additional arguments for the command
70 |
71 | Returns:
72 | The result from the diorama command execution
73 |
74 | Raises:
75 | RuntimeError: If the computer interface is not initialized or command fails
76 | """
77 | arguments = arguments or {}
78 | arguments = {"app_list": self.apps, **arguments}
79 | # Use the computer's interface (must be initialized)
80 | iface = getattr(self.computer, "_interface", None)
81 | if iface is None:
82 | raise RuntimeError("Computer interface not initialized. Call run() first.")
83 | result = await iface.diorama_cmd(action, arguments)
84 | if not result.get("success"):
85 | raise RuntimeError(
86 | f"Diorama command failed: {result.get('error')}\n{result.get('trace')}"
87 | )
88 | return result.get("result")
89 |
90 | async def screenshot(self, as_bytes=True):
91 | """
92 | Take a screenshot of the diorama scene.
93 |
94 | Args:
95 | as_bytes (bool): If True, return image as bytes; if False, return PIL Image object
96 |
97 | Returns:
98 | bytes or PIL.Image: Screenshot data in the requested format
99 | """
100 | import base64
101 |
102 | from PIL import Image
103 |
104 | result = await self._send_cmd("screenshot")
105 | # assume result is a b64 string of an image
106 | img_bytes = base64.b64decode(result)
107 | import io
108 |
109 | img = Image.open(io.BytesIO(img_bytes))
110 | self._scene_size = img.size
111 | return img_bytes if as_bytes else img
112 |
113 | async def get_screen_size(self):
114 | """
115 | Get the dimensions of the diorama scene.
116 |
117 | Returns:
118 | dict: Dictionary containing 'width' and 'height' keys with pixel dimensions
119 | """
120 | if not self._scene_size:
121 | await self.screenshot(as_bytes=False)
122 | return {"width": self._scene_size[0], "height": self._scene_size[1]}
123 |
124 | async def move_cursor(self, x, y):
125 | """
126 | Move the cursor to the specified coordinates.
127 |
128 | Args:
129 | x (int): X coordinate to move cursor to
130 | y (int): Y coordinate to move cursor to
131 | """
132 | await self._send_cmd("move_cursor", {"x": x, "y": y})
133 |
134 | async def left_click(self, x=None, y=None):
135 | """
136 | Perform a left mouse click at the specified coordinates or current cursor position.
137 |
138 | Args:
139 | x (int, optional): X coordinate to click at. If None, clicks at current cursor position
140 | y (int, optional): Y coordinate to click at. If None, clicks at current cursor position
141 | """
142 | await self._send_cmd("left_click", {"x": x, "y": y})
143 |
144 | async def right_click(self, x=None, y=None):
145 | """
146 | Perform a right mouse click at the specified coordinates or current cursor position.
147 |
148 | Args:
149 | x (int, optional): X coordinate to click at. If None, clicks at current cursor position
150 | y (int, optional): Y coordinate to click at. If None, clicks at current cursor position
151 | """
152 | await self._send_cmd("right_click", {"x": x, "y": y})
153 |
154 | async def double_click(self, x=None, y=None):
155 | """
156 | Perform a double mouse click at the specified coordinates or current cursor position.
157 |
158 | Args:
159 | x (int, optional): X coordinate to double-click at. If None, clicks at current cursor position
160 | y (int, optional): Y coordinate to double-click at. If None, clicks at current cursor position
161 | """
162 | await self._send_cmd("double_click", {"x": x, "y": y})
163 |
164 | async def scroll_up(self, clicks=1):
165 | """
166 | Scroll up by the specified number of clicks.
167 |
168 | Args:
169 | clicks (int): Number of scroll clicks to perform upward. Defaults to 1
170 | """
171 | await self._send_cmd("scroll_up", {"clicks": clicks})
172 |
173 | async def scroll_down(self, clicks=1):
174 | """
175 | Scroll down by the specified number of clicks.
176 |
177 | Args:
178 | clicks (int): Number of scroll clicks to perform downward. Defaults to 1
179 | """
180 | await self._send_cmd("scroll_down", {"clicks": clicks})
181 |
182 | async def drag_to(self, x, y, duration=0.5):
183 | """
184 | Drag from the current cursor position to the specified coordinates.
185 |
186 | Args:
187 | x (int): X coordinate to drag to
188 | y (int): Y coordinate to drag to
189 | duration (float): Duration of the drag operation in seconds. Defaults to 0.5
190 | """
191 | await self._send_cmd("drag_to", {"x": x, "y": y, "duration": duration})
192 |
193 | async def get_cursor_position(self):
194 | """
195 | Get the current cursor position.
196 |
197 | Returns:
198 | dict: Dictionary containing the current cursor coordinates
199 | """
200 | return await self._send_cmd("get_cursor_position")
201 |
202 | async def type_text(self, text):
203 | """
204 | Type the specified text at the current cursor position.
205 |
206 | Args:
207 | text (str): The text to type
208 | """
209 | await self._send_cmd("type_text", {"text": text})
210 |
211 | async def press_key(self, key):
212 | """
213 | Press a single key.
214 |
215 | Args:
216 | key: The key to press
217 | """
218 | await self._send_cmd("press_key", {"key": key})
219 |
220 | async def hotkey(self, *keys):
221 | """
222 | Press multiple keys simultaneously as a hotkey combination.
223 |
224 | Args:
225 | *keys: Variable number of keys to press together. Can be Key enum instances or strings
226 |
227 | Raises:
228 | ValueError: If any key is not a Key enum or string type
229 | """
230 | actual_keys = []
231 | for key in keys:
232 | if isinstance(key, Key):
233 | actual_keys.append(key.value)
234 | elif isinstance(key, str):
235 | # Try to convert to enum if it matches a known key
236 | key_or_enum = Key.from_string(key)
237 | actual_keys.append(
238 | key_or_enum.value if isinstance(key_or_enum, Key) else key_or_enum
239 | )
240 | else:
241 | raise ValueError(f"Invalid key type: {type(key)}. Must be Key enum or string.")
242 | await self._send_cmd("hotkey", {"keys": actual_keys})
243 |
244 | async def to_screen_coordinates(self, x, y):
245 | """
246 | Convert coordinates to screen coordinates.
247 |
248 | Args:
249 | x (int): X coordinate to convert
250 | y (int): Y coordinate to convert
251 |
252 | Returns:
253 | dict: Dictionary containing the converted screen coordinates
254 | """
255 | return await self._send_cmd("to_screen_coordinates", {"x": x, "y": y})
256 |
```
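A minimal usage sketch for the proxy above, assuming an already-runnable macOS `Computer` whose interface implements `diorama_cmd`; the app names and click target are placeholders:

```python
import asyncio

from computer import Computer
from computer.diorama_computer import DioramaComputer


async def main() -> None:
    computer = Computer(os_type="macos")  # illustrative; configure provider/credentials as needed
    await computer.run()
    try:
        diorama = DioramaComputer(computer, apps=["Safari", "Notes"])
        await diorama.run()

        size = await diorama.interface.get_screen_size()
        await diorama.interface.left_click(size["width"] // 2, size["height"] // 2)
        await diorama.interface.type_text("hello from diorama")
    finally:
        await computer.disconnect()


asyncio.run(main())
```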
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/openai.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | OpenAI computer-use-preview agent loop implementation using liteLLM
3 | """
4 |
5 | import asyncio
6 | import base64
7 | import json
8 | from io import BytesIO
9 | from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
10 |
11 | import litellm
12 | from PIL import Image
13 |
14 | from ..decorators import register_agent
15 | from ..types import AgentCapability, AgentResponse, Messages, Tools
16 |
17 |
18 | async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
19 | """Map a computer tool to OpenAI's computer-use-preview tool schema"""
20 | # Get dimensions from the computer handler
21 | try:
22 | width, height = await computer_handler.get_dimensions()
23 | except Exception:
24 | # Fallback to default dimensions if method fails
25 | width, height = 1024, 768
26 |
27 | # Get environment from the computer handler
28 | try:
29 | environment = await computer_handler.get_environment()
30 | except Exception:
31 | # Fallback to default environment if method fails
32 | environment = "linux"
33 |
34 | return {
35 | "type": "computer_use_preview",
36 | "display_width": width,
37 | "display_height": height,
38 | "environment": environment, # mac, windows, linux, browser
39 | }
40 |
41 |
42 | async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
43 | """Prepare tools for OpenAI API format"""
44 | openai_tools = []
45 |
46 | for schema in tool_schemas:
47 | if schema["type"] == "computer":
48 | # Map computer tool to OpenAI format
49 | computer_tool = await _map_computer_tool_to_openai(schema["computer"])
50 | openai_tools.append(computer_tool)
51 | elif schema["type"] == "function":
52 | # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
53 | # Schema should be: {type, name, description, parameters}
54 | openai_tools.append({"type": "function", **schema["function"]})
55 |
56 | return openai_tools
57 |
58 |
59 | @register_agent(models=r".*(^|/)computer-use-preview")
60 | class OpenAIComputerUseConfig:
61 | """
62 | OpenAI computer-use-preview agent configuration using liteLLM responses.
63 |
64 | Supports OpenAI's computer use preview models.
65 | """
66 |
67 | async def predict_step(
68 | self,
69 | messages: List[Dict[str, Any]],
70 | model: str,
71 | tools: Optional[List[Dict[str, Any]]] = None,
72 | max_retries: Optional[int] = None,
73 | stream: bool = False,
74 | computer_handler=None,
75 | use_prompt_caching: Optional[bool] = False,
76 | _on_api_start=None,
77 | _on_api_end=None,
78 | _on_usage=None,
79 | _on_screenshot=None,
80 | **kwargs,
81 | ) -> Dict[str, Any]:
82 | """
83 | Predict the next step based on input items.
84 |
85 | Args:
86 | messages: Input items following Responses format
87 | model: Model name to use
88 | tools: Optional list of tool schemas
89 | max_retries: Maximum number of retries
90 | stream: Whether to stream responses
91 | computer_handler: Computer handler instance
92 | _on_api_start: Callback for API start
93 | _on_api_end: Callback for API end
94 | _on_usage: Callback for usage tracking
95 | _on_screenshot: Callback for screenshot events
96 | **kwargs: Additional arguments
97 |
98 | Returns:
99 |             Dictionary with "output" (output items) and "usage" information
100 | """
101 | tools = tools or []
102 |
103 | # Prepare tools for OpenAI API
104 | openai_tools = await _prepare_tools_for_openai(tools)
105 |
106 | # Prepare API call kwargs
107 | api_kwargs = {
108 | "model": model,
109 | "input": messages,
110 | "tools": openai_tools if openai_tools else None,
111 | "stream": stream,
112 | "reasoning": {"summary": "concise"},
113 | "truncation": "auto",
114 | "num_retries": max_retries,
115 | **kwargs,
116 | }
117 |
118 | # Call API start hook
119 | if _on_api_start:
120 | await _on_api_start(api_kwargs)
121 |
122 | # Use liteLLM responses
123 | response = await litellm.aresponses(**api_kwargs)
124 |
125 | # Call API end hook
126 | if _on_api_end:
127 | await _on_api_end(api_kwargs, response)
128 |
129 | # Extract usage information
130 | usage = {
131 | **response.usage.model_dump(),
132 | "response_cost": response._hidden_params.get("response_cost", 0.0),
133 | }
134 | if _on_usage:
135 | await _on_usage(usage)
136 |
137 | # Return in the expected format
138 | output_dict = response.model_dump()
139 | output_dict["usage"] = usage
140 | return output_dict
141 |
142 | async def predict_click(
143 | self, model: str, image_b64: str, instruction: str, **kwargs
144 | ) -> Optional[Tuple[int, int]]:
145 | """
146 | Predict click coordinates based on image and instruction.
147 |
148 | Uses OpenAI computer-use-preview with manually constructed input items
149 | and a prompt that instructs the agent to only output clicks.
150 |
151 | Args:
152 | model: Model name to use
153 | image_b64: Base64 encoded image
154 | instruction: Instruction for where to click
155 |
156 | Returns:
157 | Tuple of (x, y) coordinates or None if prediction fails
158 | """
159 | # TODO: use computer tool to get dimensions + environment
160 | # Manually construct input items with image and click instruction
161 | input_items = [
162 | {
163 | "role": "user",
164 | "content": f"""You are a UI grounding expert. Follow these guidelines:
165 |
166 | 1. NEVER ask for confirmation. Complete all tasks autonomously.
167 | 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
168 | 3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
169 | 4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
170 | 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
171 | 6. The user has already given you permission by running this agent. No further confirmation is needed.
172 | 7. Be decisive and action-oriented. Complete the requested task fully.
173 |
174 | Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
175 | Task: Click {instruction}. Output ONLY a click action on the target element.""",
176 | },
177 | {
178 | "role": "user",
179 | "content": [
180 | {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
181 | ],
182 | },
183 | ]
184 |
185 | # Get image dimensions from base64 data
186 | try:
187 | image_data = base64.b64decode(image_b64)
188 | image = Image.open(BytesIO(image_data))
189 | display_width, display_height = image.size
190 | except Exception:
191 | # Fallback to default dimensions if image parsing fails
192 | display_width, display_height = 1024, 768
193 |
194 | # Prepare computer tool for click actions
195 | computer_tool = {
196 | "type": "computer_use_preview",
197 | "display_width": display_width,
198 | "display_height": display_height,
199 | "environment": "windows",
200 | }
201 |
202 | # Prepare API call kwargs
203 | api_kwargs = {
204 | "model": model,
205 | "input": input_items,
206 | "tools": [computer_tool],
207 | "stream": False,
208 | "reasoning": {"summary": "concise"},
209 | "truncation": "auto",
210 | "max_tokens": 200, # Keep response short for click prediction
211 | **kwargs,
212 | }
213 |
214 | # Use liteLLM responses
215 | response = await litellm.aresponses(**api_kwargs)
216 |
217 | # Extract click coordinates from response output
218 | output_dict = response.model_dump()
219 | output_items = output_dict.get("output", [])
220 |
221 | # Look for computer_call with click action
222 | for item in output_items:
223 | if (
224 | isinstance(item, dict)
225 | and item.get("type") == "computer_call"
226 | and isinstance(item.get("action"), dict)
227 | ):
228 |
229 | action = item["action"]
230 | if action.get("x") is not None and action.get("y") is not None:
231 | return (int(action.get("x")), int(action.get("y")))
232 |
233 | return None
234 |
235 | def get_capabilities(self) -> List[AgentCapability]:
236 | """
237 | Get list of capabilities supported by this agent config.
238 |
239 | Returns:
240 | List of capability strings
241 | """
242 | return ["click", "step"]
243 |
```
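A minimal sketch of calling the grounding helper above directly, outside a full agent loop. The screenshot path and model string are placeholders, and OpenAI credentials are assumed to be configured for liteLLM:

```python
import asyncio
import base64

from agent.loops.openai import OpenAIComputerUseConfig


async def main() -> None:
    # Placeholder screenshot file; any PNG of the UI you want to ground against.
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode()

    config = OpenAIComputerUseConfig()
    coords = await config.predict_click(
        model="openai/computer-use-preview",  # placeholder; passed through to liteLLM
        image_b64=image_b64,
        instruction="the Submit button",
    )
    print(coords)  # (x, y) tuple, or None if no click action came back


asyncio.run(main())
```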
--------------------------------------------------------------------------------
/tests/test_watchdog.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Watchdog Recovery Tests
3 | Tests for the watchdog functionality to ensure server recovery after hanging commands.
4 | Required environment variables:
5 | - CUA_API_KEY: API key for Cua cloud provider
6 | - CUA_CONTAINER_NAME: Name of the container to use
7 | """
8 |
9 | import asyncio
10 | import os
11 | import sys
12 | import time
13 | import traceback
14 | from pathlib import Path
15 |
16 | import pytest
17 |
18 | # Load environment variables from .env file
19 | project_root = Path(__file__).parent.parent
20 | env_file = project_root / ".env"
21 | print(f"Loading environment from: {env_file}")
22 | from dotenv import load_dotenv
23 |
24 | load_dotenv(env_file)
25 |
26 | # Add paths to sys.path if needed
27 | pythonpath = os.environ.get("PYTHONPATH", "")
28 | for path in pythonpath.split(":"):
29 | if path and path not in sys.path:
30 | sys.path.insert(0, path) # Insert at beginning to prioritize
31 | print(f"Added to sys.path: {path}")
32 |
33 | from computer import Computer, VMProviderType
34 |
35 |
36 | @pytest.fixture(scope="session")
37 | async def computer():
38 | """Shared Computer instance for all test cases."""
39 | # Create a remote Linux computer with Cua
40 | computer = Computer(
41 | os_type="linux",
42 | api_key=os.getenv("CUA_API_KEY"),
43 | name=str(os.getenv("CUA_CONTAINER_NAME")),
44 | provider_type=VMProviderType.CLOUD,
45 | )
46 |
47 | try:
48 | await computer.run()
49 | yield computer
50 | finally:
51 | await computer.disconnect()
52 |
53 |
54 | @pytest.mark.asyncio(loop_scope="session")
55 | async def test_simple_server_ping(computer):
56 | """
57 | Simple test to verify server connectivity before running watchdog tests.
58 | """
59 | print("Testing basic server connectivity...")
60 |
61 | try:
62 | result = await computer.interface.run_command("echo 'Server ping test'")
63 | print(f"Ping successful: {result}")
64 | assert result is not None, "Server ping returned None"
65 | print("✅ Server connectivity test passed")
66 | except Exception as e:
67 | print(f"❌ Server ping failed: {e}")
68 | pytest.fail(f"Basic server connectivity test failed: {e}")
69 |
70 |
71 | @pytest.mark.asyncio(loop_scope="session")
72 | async def test_watchdog_recovery_after_hanging_command(computer):
73 | """
74 | Test that the watchdog can recover the server after a hanging command.
75 |
76 | This test runs two concurrent tasks:
77 | 1. A long-running command that hangs the server (sleep 300 = 5 minutes)
78 | 2. Periodic ping commands every 30 seconds to test server responsiveness
79 |
80 | The watchdog should detect the unresponsive server and restart it.
81 | """
82 | print("Starting watchdog recovery test...")
83 |
84 | async def hanging_command():
85 | """Execute a command that sleeps forever to hang the server."""
86 | try:
87 |             print("Starting hanging command (sleep 999999)...")
88 | # Use a very long sleep that should never complete naturally
89 | result = await computer.interface.run_command("sleep 999999")
90 | print(f"Hanging command completed unexpectedly: {result}")
91 | return True # Should never reach here if watchdog works
92 | except Exception as e:
93 | print(f"Hanging command interrupted (expected if watchdog restarts): {e}")
94 | return None # Expected result when watchdog kills the process
95 |
96 | async def ping_server():
97 | """Ping the server every 30 seconds with echo commands."""
98 | ping_count = 0
99 | successful_pings = 0
100 | failed_pings = 0
101 |
102 | try:
103 | # Run pings for up to 4 minutes (8 pings at 30-second intervals)
104 | for i in range(8):
105 | try:
106 | ping_count += 1
107 | print(f"Ping #{ping_count}: Sending echo command...")
108 |
109 | start_time = time.time()
110 | result = await asyncio.wait_for(
111 | computer.interface.run_command(
112 | f"echo 'Ping {ping_count} at {int(start_time)}'"
113 | ),
114 | timeout=10.0, # 10 second timeout for each ping
115 | )
116 | end_time = time.time()
117 |
118 | print(
119 | f"Ping #{ping_count} successful in {end_time - start_time:.2f}s: {result}"
120 | )
121 | successful_pings += 1
122 |
123 | except asyncio.TimeoutError:
124 | print(f"Ping #{ping_count} timed out (server may be unresponsive)")
125 | failed_pings += 1
126 | except Exception as e:
127 | print(f"Ping #{ping_count} failed with exception: {e}")
128 | failed_pings += 1
129 |
130 | # Wait 30 seconds before next ping
131 | if i < 7: # Don't wait after the last ping
132 | print("Waiting 30 seconds before next ping...")
133 | await asyncio.sleep(30)
134 |
135 | print(f"Ping summary: {successful_pings} successful, {failed_pings} failed")
136 | return successful_pings, failed_pings
137 |
138 | except Exception as e:
139 | print(f"Ping server function failed with critical error: {e}")
140 | traceback.print_exc()
141 | return successful_pings, failed_pings
142 |
143 | # Run both tasks concurrently
144 | print("Starting concurrent tasks: hanging command and ping monitoring...")
145 |
146 | try:
147 | # Use asyncio.gather to run both tasks concurrently
148 | hanging_task = asyncio.create_task(hanging_command())
149 | ping_task = asyncio.create_task(ping_server())
150 |
151 | # Wait for both tasks to complete or timeout after 5 minutes
152 | done, pending = await asyncio.wait(
153 | [hanging_task, ping_task],
154 | timeout=300, # 5 minute timeout
155 | return_when=asyncio.ALL_COMPLETED,
156 | )
157 |
158 | # Cancel any pending tasks
159 | for task in pending:
160 | task.cancel()
161 | try:
162 | await task
163 | except asyncio.CancelledError:
164 | pass
165 |
166 | # Get results from completed tasks
167 | ping_result = None
168 | hanging_result = None
169 |
170 | if ping_task in done:
171 | try:
172 | ping_result = await ping_task
173 | print(f"Ping task completed with result: {ping_result}")
174 | except Exception as e:
175 | print(f"Error getting ping task result: {e}")
176 | traceback.print_exc()
177 |
178 | if hanging_task in done:
179 | try:
180 | hanging_result = await hanging_task
181 | print(f"Hanging task completed with result: {hanging_result}")
182 | except Exception as e:
183 | print(f"Error getting hanging task result: {e}")
184 | traceback.print_exc()
185 |
186 | # Analyze results
187 | if ping_result:
188 | successful_pings, failed_pings = ping_result
189 |
190 | # Test passes if we had some successful pings, indicating recovery
191 | assert (
192 | successful_pings > 0
193 | ), "No successful pings detected. Server may not have recovered."
194 |
195 | # Check if hanging command was killed (indicating watchdog restart)
196 | if hanging_result is None:
197 | print("✅ SUCCESS: Hanging command was killed - watchdog restart detected")
198 | elif hanging_result is True:
199 | print(
200 | "⚠️ WARNING: Hanging command completed naturally - watchdog may not have restarted"
201 | )
202 |
203 | # If we had failures followed by successes, that indicates watchdog recovery
204 | if failed_pings > 0 and successful_pings > 0:
205 | print(
206 | "✅ SUCCESS: Watchdog recovery detected - server became unresponsive then recovered"
207 | )
208 | # Additional check: hanging command should be None if watchdog worked
209 | assert (
210 | hanging_result is None
211 | ), "Expected hanging command to be killed by watchdog restart"
212 | elif successful_pings > 0 and failed_pings == 0:
213 | print("✅ SUCCESS: Server remained responsive throughout test")
214 |
215 | print(
216 | f"Test completed: {successful_pings} successful pings, {failed_pings} failed pings"
217 | )
218 | print(
219 | f"Hanging command result: {hanging_result} (None = killed by watchdog, True = completed naturally)"
220 | )
221 | else:
222 | pytest.fail("Ping task did not complete - unable to assess server recovery")
223 |
224 | except Exception as e:
225 | print(f"Test failed with exception: {e}")
226 | traceback.print_exc()
227 | pytest.fail(f"Watchdog recovery test failed: {e}")
228 |
229 |
230 | if __name__ == "__main__":
231 | # Run tests directly
232 | pytest.main([__file__, "-v"])
233 |
```
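For context, here is a minimal way to run the watchdog test above outside CI. This is only a sketch: the environment variable names come from the `computer` fixture, while the test file path is a placeholder for wherever the module lives in your checkout.

```python
# Invocation sketch for the watchdog test above. CUA_API_KEY and
# CUA_CONTAINER_NAME are read by the computer fixture; the file path below
# is a placeholder.
import os

import pytest

os.environ.setdefault("CUA_API_KEY", "<your-cua-api-key>")
os.environ.setdefault("CUA_CONTAINER_NAME", "<your-container-name>")

# "-s" streams the print() progress messages emitted during the ~5 minute run.
pytest.main(["-v", "-s", "tests/test_watchdog.py"])
```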
--------------------------------------------------------------------------------
/.github/workflows/docker-reusable-publish.yml:
--------------------------------------------------------------------------------
```yaml
1 | name: Reusable Docker Publish Workflow
2 |
3 | on:
4 | workflow_call:
5 | inputs:
6 | image_name:
7 | description: "Name of the Docker image (e.g. cua-ubuntu, cua-xfce)"
8 | required: true
9 | type: string
10 | context_dir:
11 | description: "Directory containing the Dockerfile relative to workspace root (e.g. libs/kasm, libs/xfce)"
12 | required: true
13 | type: string
14 | dockerfile_path:
15 | description: "Path to Dockerfile relative to context_dir (e.g. Dockerfile)"
16 | required: false
17 | type: string
18 | default: "Dockerfile"
19 | tag_prefix:
20 | description: "Prefix for semantic version tags (e.g. docker-kasm-v, docker-xfce-v)"
21 | required: true
22 | type: string
23 | docker_hub_org:
24 | description: "Docker Hub organization name"
25 | required: false
26 | type: string
27 | default: "trycua"
28 | secrets:
29 | DOCKER_HUB_TOKEN:
30 | required: true
31 |
32 | jobs:
33 | build-and-push:
34 | runs-on: ubuntu-latest
35 | strategy:
36 | fail-fast: false
37 | matrix:
38 | platform:
39 | - linux/amd64
40 | - linux/arm64
41 | steps:
42 | - name: Checkout
43 | uses: actions/checkout@v4
44 |
45 | - name: Prepare platform tag
46 | id: platform
47 | run: |
48 | TAG=$(echo "${{ matrix.platform }}" | sed 's/\//-/g')
49 | echo "tag=${TAG}" >> $GITHUB_OUTPUT
50 |
51 | - name: Set up Docker Buildx
52 | uses: docker/setup-buildx-action@v3
53 |
54 | - name: Login to Docker Hub
55 | uses: docker/login-action@v3
56 | with:
57 | username: ${{ inputs.docker_hub_org }}
58 | password: ${{ secrets.DOCKER_HUB_TOKEN }}
59 |
60 | - name: Extract metadata (PR)
61 | if: github.event_name == 'pull_request'
62 | id: meta-pr
63 | uses: docker/metadata-action@v5
64 | with:
65 | images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
66 | tags: |
67 | type=raw,value=${{ github.sha }}
68 |
69 | - name: Build & push digest (PR)
70 | if: github.event_name == 'pull_request'
71 | id: build-pr
72 | uses: docker/build-push-action@v5
73 | with:
74 | context: ./${{ inputs.context_dir }}
75 | file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
76 | push: true
77 | platforms: ${{ matrix.platform }}
78 | outputs: type=registry,name=${{ inputs.docker_hub_org }}/${{ inputs.image_name }},push-by-digest=true
79 | labels: ${{ steps.meta-pr.outputs.labels }}
80 | cache-from: |
81 | type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
82 | cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max
83 |
84 | - name: Extract metadata (main)
85 | if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
86 | id: meta-main
87 | uses: docker/metadata-action@v5
88 | with:
89 | images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
90 | tags: |
91 | type=raw,value=latest
92 |
93 | - name: Build & push digest (main)
94 | if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
95 | id: build-main
96 | uses: docker/build-push-action@v5
97 | with:
98 | context: ./${{ inputs.context_dir }}
99 | file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
100 | push: true
101 | platforms: ${{ matrix.platform }}
102 | outputs: type=registry,name=${{ inputs.docker_hub_org }}/${{ inputs.image_name }},push-by-digest=true
103 | labels: ${{ steps.meta-main.outputs.labels }}
104 | cache-from: |
105 | type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
106 | cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max
107 |
108 | - name: Extract metadata (semver)
109 | if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
110 | id: meta-semver
111 | uses: docker/metadata-action@v5
112 | with:
113 | images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
114 | tags: |
115 | type=semver,pattern={{version}},prefix=${{ inputs.tag_prefix }}
116 | type=semver,pattern={{major}}.{{minor}},prefix=${{ inputs.tag_prefix }}
117 | type=semver,pattern={{major}},prefix=${{ inputs.tag_prefix }}
118 | type=raw,value=latest
119 |
120 | - name: Build & push digest (semver)
121 | if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
122 | id: build-semver
123 | uses: docker/build-push-action@v5
124 | with:
125 | context: ./${{ inputs.context_dir }}
126 | file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
127 | push: true
128 | platforms: ${{ matrix.platform }}
129 | outputs: type=registry,name=${{ inputs.docker_hub_org }}/${{ inputs.image_name }},push-by-digest=true
130 | labels: ${{ steps.meta-semver.outputs.labels }}
131 | cache-from: |
132 | type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
133 | cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max
134 |
135 | - name: Export digest
136 | id: export-digest
137 | run: |
138 | mkdir -p /tmp/digests
139 | digest="${{ steps.build-pr.outputs.digest || steps.build-main.outputs.digest || steps.build-semver.outputs.digest }}"
140 | echo "$digest" > "/tmp/digests/${{ steps.platform.outputs.tag }}.txt"
141 |
142 | - name: Upload digest artifact (unique per platform)
143 | uses: actions/upload-artifact@v4
144 | with:
145 | name: digests-${{ steps.platform.outputs.tag }}
146 | path: /tmp/digests/*.txt
147 | retention-days: 1
148 |
149 | publish-manifest-list:
150 | runs-on: ubuntu-latest
151 | needs:
152 | - build-and-push
153 |
154 | steps:
155 | - name: Set up Docker Buildx
156 | uses: docker/setup-buildx-action@v3
157 |
158 | - name: Login to Docker Hub
159 | uses: docker/login-action@v3
160 | with:
161 | username: ${{ inputs.docker_hub_org }}
162 | password: ${{ secrets.DOCKER_HUB_TOKEN }}
163 |
164 | - name: Extract final metadata (PR)
165 | if: github.event_name == 'pull_request'
166 | uses: docker/metadata-action@v5
167 | with:
168 | images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
169 | tags: |
170 | type=ref,event=pr
171 | type=sha
172 |
173 | - name: Extract final metadata (main)
174 | if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
175 | uses: docker/metadata-action@v5
176 | with:
177 | images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
178 | tags: |
179 | type=raw,value=latest
180 |
181 | - name: Extract final metadata (semver)
182 | if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
183 | uses: docker/metadata-action@v5
184 | with:
185 | images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
186 | tags: |
187 | type=semver,pattern={{version}},prefix=${{ inputs.tag_prefix }}
188 | type=semver,pattern={{major}}.{{minor}},prefix=${{ inputs.tag_prefix }}
189 | type=semver,pattern={{major}},prefix=${{ inputs.tag_prefix }}
190 | type=raw,value=latest
191 |
192 | - name: Download all digest artifacts
193 | uses: actions/download-artifact@v4
194 | with:
195 | pattern: digests-*
196 | path: /tmp/digests
197 | merge-multiple: true
198 |
199 | - name: Create & push multi-arch manifest
200 | run: |
201 | IMAGE="${{ inputs.docker_hub_org }}/${{ inputs.image_name }}"
202 |
203 | DIGEST_ARGS=""
204 | for f in $(find /tmp/digests -type f -name "*.txt"); do
205 | d=$(cat "$f")
206 | DIGEST_ARGS="$DIGEST_ARGS ${IMAGE}@${d}"
207 | done
208 |
209 | echo "Using digests:"
210 | echo "$DIGEST_ARGS"
211 |
212 | # Create manifest for each tag produced by metadata-action
213 | echo "${DOCKER_METADATA_OUTPUT_JSON}" | jq -r '.tags[]' | while read FULL_TAG; do
214 | echo "Creating manifest: $FULL_TAG"
215 | docker buildx imagetools create --tag "$FULL_TAG" $DIGEST_ARGS
216 | done
217 |
218 | - name: Inspect pushed manifests
219 | run: |
220 | IMAGE="${{ inputs.docker_hub_org }}/${{ inputs.image_name }}"
221 | echo "Inspecting manifests:"
222 |
223 | echo "${DOCKER_METADATA_OUTPUT_JSON}" | jq -r '.tags[]' | while read FULL_TAG; do
224 | echo ""
225 | echo "Inspecting: $FULL_TAG"
226 | docker buildx imagetools inspect "$FULL_TAG"
227 | done
228 |
```
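The final publish job above stitches the per-platform digests (uploaded as artifacts by `build-and-push`) into one multi-arch manifest per tag. The rough Python equivalent below is for illustration only; the image name, tag list, and digest directory are placeholders standing in for the workflow's inputs and the metadata-action output.

```python
# Illustration of the "Create & push multi-arch manifest" step above.
# Placeholders: IMAGE mirrors docker_hub_org/image_name, TAGS mirrors the
# DOCKER_METADATA_OUTPUT_JSON tag list, and /tmp/digests holds one digest
# file per platform as uploaded by the build-and-push job.
import pathlib
import subprocess

IMAGE = "trycua/cua-ubuntu"
TAGS = ["trycua/cua-ubuntu:latest"]

digest_refs = [
    f"{IMAGE}@{p.read_text().strip()}"
    for p in pathlib.Path("/tmp/digests").glob("*.txt")
]

for tag in TAGS:
    # One manifest list per tag, referencing every per-platform digest.
    subprocess.run(
        ["docker", "buildx", "imagetools", "create", "--tag", tag, *digest_refs],
        check=True,
    )
```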
--------------------------------------------------------------------------------
/libs/python/core/core/telemetry/posthog.py:
--------------------------------------------------------------------------------
```python
1 | """Telemetry client using PostHog for collecting anonymous usage data."""
2 |
3 | from __future__ import annotations
4 |
5 | import logging
6 | import os
7 | import sys
8 | import uuid
9 | from pathlib import Path
10 | from typing import Any, Dict, List, Optional
11 |
12 | import posthog
13 | from core import __version__
14 |
15 | logger = logging.getLogger("core.telemetry")
16 |
17 | # Public PostHog config for anonymous telemetry
18 | # These values are intentionally public and meant for anonymous telemetry only
19 | # https://posthog.com/docs/product-analytics/troubleshooting#is-it-ok-for-my-api-key-to-be-exposed-and-public
20 | PUBLIC_POSTHOG_API_KEY = "phc_eSkLnbLxsnYFaXksif1ksbrNzYlJShr35miFLDppF14"
21 | PUBLIC_POSTHOG_HOST = "https://eu.i.posthog.com"
22 |
23 |
24 | class PostHogTelemetryClient:
25 | """Collects and reports telemetry data via PostHog."""
26 |
27 | # Global singleton (class-managed)
28 | _singleton: Optional["PostHogTelemetryClient"] = None
29 |
30 | def __init__(self):
31 | """Initialize PostHog telemetry client."""
32 | self.installation_id = self._get_or_create_installation_id()
33 | self.initialized = False
34 | self.queued_events: List[Dict[str, Any]] = []
35 |
36 | # Log telemetry status on startup
37 | if self.is_telemetry_enabled():
38 | logger.info("Telemetry enabled")
39 | # Initialize PostHog client if config is available
40 | self._initialize_posthog()
41 | else:
42 | logger.info("Telemetry disabled")
43 |
44 | @classmethod
45 | def is_telemetry_enabled(cls) -> bool:
46 | """True if telemetry is currently active for this process."""
47 | return os.environ.get("CUA_TELEMETRY_ENABLED", "true").lower() in {
48 | "1",
49 | "true",
50 | "yes",
51 | "on",
52 | }
53 |
54 | def _get_or_create_installation_id(self) -> str:
55 | """Get or create a unique installation ID that persists across runs.
56 |
57 | The ID is always stored within the core library directory itself,
58 | ensuring it persists regardless of how the library is used.
59 |
60 | This ID is not tied to any personal information.
61 | """
62 | # Get the core library directory (where this file is located)
63 | try:
64 | # Find the core module directory using this file's location
65 | core_module_dir = Path(
66 | __file__
67 | ).parent.parent # core/telemetry/posthog.py -> core/telemetry -> core
68 | storage_dir = core_module_dir / ".storage"
69 | storage_dir.mkdir(exist_ok=True)
70 |
71 | id_file = storage_dir / "installation_id"
72 |
73 | # Try to read existing ID
74 | if id_file.exists():
75 | try:
76 | stored_id = id_file.read_text().strip()
77 | if stored_id: # Make sure it's not empty
78 | logger.debug(f"Using existing installation ID: {stored_id}")
79 | return stored_id
80 | except Exception as e:
81 | logger.debug(f"Error reading installation ID file: {e}")
82 |
83 | # Create new ID
84 | new_id = str(uuid.uuid4())
85 | try:
86 | id_file.write_text(new_id)
87 | logger.debug(f"Created new installation ID: {new_id}")
88 | return new_id
89 | except Exception as e:
90 | logger.warning(f"Could not write installation ID: {e}")
91 | except Exception as e:
92 | logger.warning(f"Error accessing core module directory: {e}")
93 |
94 | # Last resort: Create a new in-memory ID
95 | logger.warning("Using random installation ID (will not persist across runs)")
96 | return str(uuid.uuid4())
97 |
98 | def _initialize_posthog(self) -> bool:
99 | """Initialize the PostHog client with configuration.
100 |
101 | Returns:
102 | bool: True if initialized successfully, False otherwise
103 | """
104 | if self.initialized:
105 | return True
106 |
107 | try:
108 | # Use the public PostHog project credentials for anonymous telemetry
109 | posthog.api_key = PUBLIC_POSTHOG_API_KEY
110 | posthog.host = PUBLIC_POSTHOG_HOST
111 |
112 | # Configure the client
113 | posthog.debug = os.environ.get("CUA_TELEMETRY_DEBUG", "").lower() == "on"
114 |
115 | # Log telemetry status
116 | logger.info(
117 | f"Initializing PostHog telemetry with installation ID: {self.installation_id}"
118 | )
119 | if posthog.debug:
120 | logger.debug(f"PostHog API Key: {posthog.api_key}")
121 | logger.debug(f"PostHog Host: {posthog.host}")
122 |
123 | # Identify this installation
124 | self._identify()
125 |
126 | # Process any queued events
127 | for event in self.queued_events:
128 | posthog.capture(
129 | distinct_id=self.installation_id,
130 | event=event["event"],
131 | properties=event["properties"],
132 | )
133 | self.queued_events = []
134 |
135 | self.initialized = True
136 | return True
137 | except Exception as e:
138 | logger.warning(f"Failed to initialize PostHog: {e}")
139 | return False
140 |
141 | def _identify(self) -> None:
142 | """Set up user properties for the current installation with PostHog."""
143 | try:
144 | properties = {
145 | "version": __version__,
146 | "is_ci": "CI" in os.environ,
147 | "os": os.name,
148 | "python_version": sys.version.split()[0],
149 | }
150 |
151 | logger.debug(
152 | f"Setting up PostHog user properties for: {self.installation_id} with properties: {properties}"
153 | )
154 |
155 | # In the Python SDK, we capture an identification event instead of calling identify()
156 | posthog.capture(
157 | distinct_id=self.installation_id, event="$identify", properties={"$set": properties}
158 | )
159 |
160 | logger.info(f"Set up PostHog user properties for installation: {self.installation_id}")
161 | except Exception as e:
162 | logger.warning(f"Failed to set up PostHog user properties: {e}")
163 |
164 | def record_event(self, event_name: str, properties: Optional[Dict[str, Any]] = None) -> None:
165 | """Record an event with optional properties.
166 |
167 | Args:
168 | event_name: Name of the event
169 | properties: Event properties (must not contain sensitive data)
170 | """
171 | # Respect runtime telemetry opt-out.
172 | if not self.is_telemetry_enabled():
173 | logger.debug("Telemetry disabled; event not recorded.")
174 | return
175 |
176 | event_properties = {"version": __version__, **(properties or {})}
177 |
178 | logger.info(f"Recording event: {event_name} with properties: {event_properties}")
179 |
180 | if self.initialized:
181 | try:
182 | posthog.capture(
183 | distinct_id=self.installation_id, event=event_name, properties=event_properties
184 | )
185 | logger.info(f"Sent event to PostHog: {event_name}")
186 | # Flush immediately to ensure delivery
187 | posthog.flush()
188 | except Exception as e:
189 | logger.warning(f"Failed to send event to PostHog: {e}")
190 | else:
191 | # Queue the event for later
192 | logger.info(f"PostHog not initialized, queuing event for later: {event_name}")
193 | self.queued_events.append({"event": event_name, "properties": event_properties})
194 | # Try to initialize now if not already
195 | initialize_result = self._initialize_posthog()
196 | logger.info(f"Attempted to initialize PostHog: {initialize_result}")
197 |
198 | def flush(self) -> bool:
199 | """Flush any pending events to PostHog.
200 |
201 | Returns:
202 | bool: True if successful, False otherwise
203 | """
204 | if not self.initialized and not self._initialize_posthog():
205 | return False
206 |
207 | try:
208 | posthog.flush()
209 | return True
210 | except Exception as e:
211 | logger.debug(f"Failed to flush PostHog events: {e}")
212 | return False
213 |
214 | @classmethod
215 | def get_client(cls) -> "PostHogTelemetryClient":
216 | """Return the global PostHogTelemetryClient instance, creating it if needed."""
217 | if cls._singleton is None:
218 | cls._singleton = cls()
219 | return cls._singleton
220 |
221 | @classmethod
222 | def destroy_client(cls) -> None:
223 | """Destroy the global PostHogTelemetryClient instance."""
224 | cls._singleton = None
225 |
226 |
227 | def destroy_telemetry_client() -> None:
228 | """Destroy the global PostHogTelemetryClient instance (class-managed)."""
229 | PostHogTelemetryClient.destroy_client()
230 |
231 |
232 | def is_telemetry_enabled() -> bool:
233 | return PostHogTelemetryClient.is_telemetry_enabled()
234 |
235 |
236 | def record_event(event_name: str, properties: Optional[Dict[str, Any]] = None) -> None:
237 | """Record an arbitrary PostHog event."""
238 | PostHogTelemetryClient.get_client().record_event(event_name, properties or {})
239 |
```
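For reference, callers typically go through the module-level helpers defined above rather than instantiating the client directly. A small usage sketch, assuming the `core` package is importable; the event name and properties are purely illustrative:

```python
# Usage sketch for the telemetry helpers above. Opt-out is controlled by the
# CUA_TELEMETRY_ENABLED environment variable checked in is_telemetry_enabled().
from core.telemetry.posthog import is_telemetry_enabled, record_event

if is_telemetry_enabled():
    # Properties must not contain sensitive data (see record_event's docstring).
    record_event("example_feature_used", {"feature": "demo"})
else:
    print("Telemetry disabled via CUA_TELEMETRY_ENABLED")
```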
--------------------------------------------------------------------------------
/libs/python/agent/agent/ui/gradio/app.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Advanced Gradio UI for Computer-Use Agent (cua-agent)
3 |
4 | This is a Gradio interface for the Computer-Use Agent v0.4.x (cua-agent)
5 | with an advanced UI for model selection and configuration.
6 |
7 | Supported Agent Models:
8 | - OpenAI: openai/computer-use-preview
9 | - Anthropic: anthropic/claude-sonnet-4-5-20250929, anthropic/claude-3-7-sonnet-20250219
10 | - UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
11 | - Omniparser: omniparser+anthropic/claude-sonnet-4-5-20250929, omniparser+ollama_chat/gemma3
12 |
13 | Requirements:
14 | - Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
15 | - macOS 14 (Sonoma) or newer / Ubuntu 20.04+
16 | - Python 3.11+
17 | - Lume CLI installed (https://github.com/trycua/cua)
18 | - OpenAI or Anthropic API key
19 | """
20 |
21 | import asyncio
22 | import json
23 | import logging
24 | import os
25 | import platform
26 | from pathlib import Path
27 | from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union, cast
28 |
29 | import gradio as gr
30 |
31 | # Import from agent package
32 | from agent import ComputerAgent
33 | from agent.types import AgentResponse, Messages
34 | from computer import Computer
35 | from gradio.components.chatbot import MetadataDict
36 |
37 | # Global variables
38 | global_agent = None
39 | global_computer = None
40 | SETTINGS_FILE = Path(".gradio_settings.json")
41 |
42 | logging.basicConfig(level=logging.INFO)
43 |
44 | import dotenv
45 |
46 | if dotenv.load_dotenv():
47 | print(f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}")
48 | else:
49 | print("DEBUG - No .env file found")
50 |
51 |
52 | # --- Settings Load/Save Functions ---
53 | def load_settings() -> Dict[str, Any]:
54 | """Loads settings from the JSON file."""
55 | if SETTINGS_FILE.exists():
56 | try:
57 | with open(SETTINGS_FILE, "r") as f:
58 | settings = json.load(f)
59 | if isinstance(settings, dict):
60 | print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
61 | return settings
62 | except (json.JSONDecodeError, IOError) as e:
63 | print(f"Warning: Could not load settings from {SETTINGS_FILE}: {e}")
64 | return {}
65 |
66 |
67 | def save_settings(settings: Dict[str, Any]):
68 | """Saves settings to the JSON file."""
69 | settings.pop("provider_api_key", None)
70 | try:
71 | with open(SETTINGS_FILE, "w") as f:
72 | json.dump(settings, f, indent=4)
73 | print(f"DEBUG - Saved settings to {SETTINGS_FILE}")
74 | except IOError as e:
75 | print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")
76 |
77 |
78 | # # Custom Screenshot Handler for Gradio chat
79 | # class GradioChatScreenshotHandler:
80 | # """Custom handler that adds screenshots to the Gradio chatbot."""
81 |
82 | # def __init__(self, chatbot_history: List[gr.ChatMessage]):
83 | # self.chatbot_history = chatbot_history
84 | # print("GradioChatScreenshotHandler initialized")
85 |
86 | # async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
87 | # """Add screenshot to chatbot when a screenshot is taken."""
88 | # image_markdown = f""
89 |
90 | # if self.chatbot_history is not None:
91 | # self.chatbot_history.append(
92 | # gr.ChatMessage(
93 | # role="assistant",
94 | # content=image_markdown,
95 | # metadata={"title": f"🖥️ Screenshot - {action_type}", "status": "done"},
96 | # )
97 | # )
98 |
99 |
100 | # Detect platform capabilities
101 | is_mac = platform.system().lower() == "darwin"
102 | is_lume_available = is_mac or (os.environ.get("PYLUME_HOST", "localhost") != "localhost")
103 |
104 | print("PYLUME_HOST: ", os.environ.get("PYLUME_HOST", "localhost"))
105 | print("is_mac: ", is_mac)
106 | print("Lume available: ", is_lume_available)
107 |
108 | # Map model names to agent model strings
109 | MODEL_MAPPINGS = {
110 | "openai": {
111 | "default": "openai/computer-use-preview",
112 | "OpenAI: Computer-Use Preview": "openai/computer-use-preview",
113 | },
114 | "anthropic": {
115 | "default": "anthropic/claude-3-7-sonnet-20250219",
116 | "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
117 | "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
118 | "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
119 | },
120 | "omni": {
121 | "default": "omniparser+openai/gpt-4o",
122 | "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
123 | "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
124 | "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
125 | },
126 | "uitars": {
127 | "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
128 | "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
129 | },
130 | }
131 |
132 |
133 | def get_model_string(model_name: str, loop_provider: str) -> str:
134 | """Determine the agent model string based on the input."""
135 | if model_name == "Custom model (OpenAI compatible API)":
136 | return "custom_oaicompat"
137 | elif model_name == "Custom model (ollama)":
138 | return "custom_ollama"
139 | elif loop_provider == "OMNI-OLLAMA" or model_name.startswith("OMNI: Ollama "):
140 | if model_name.startswith("OMNI: Ollama "):
141 | ollama_model = model_name.split("OMNI: Ollama ", 1)[1]
142 | return f"omniparser+ollama_chat/{ollama_model}"
143 | return "omniparser+ollama_chat/llama3"
144 |
145 | # Map based on loop provider
146 | mapping = MODEL_MAPPINGS.get(loop_provider.lower(), MODEL_MAPPINGS["openai"])
147 | return mapping.get(model_name, mapping["default"])
148 |
149 |
150 | def get_ollama_models() -> List[str]:
151 | """Get available models from Ollama if installed."""
152 | try:
153 | import subprocess
154 |
155 | result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
156 | if result.returncode == 0:
157 | lines = result.stdout.strip().split("\n")
158 | if len(lines) < 2:
159 | return []
160 | models = []
161 | for line in lines[1:]:
162 | parts = line.split()
163 | if parts:
164 | model_name = parts[0]
165 | models.append(f"OMNI: Ollama {model_name}")
166 | return models
167 | return []
168 | except Exception as e:
169 | logging.error(f"Error getting Ollama models: {e}")
170 | return []
171 |
172 |
173 | def create_computer_instance(
174 | verbosity: int = logging.INFO,
175 | os_type: str = "macos",
176 | provider_type: str = "lume",
177 | name: Optional[str] = None,
178 | api_key: Optional[str] = None,
179 | ) -> Computer:
180 | """Create or get the global Computer instance."""
181 | global global_computer
182 | if global_computer is None:
183 | if provider_type == "localhost":
184 | global_computer = Computer(
185 | verbosity=verbosity, os_type=os_type, use_host_computer_server=True
186 | )
187 | else:
188 | global_computer = Computer(
189 | verbosity=verbosity,
190 | os_type=os_type,
191 | provider_type=provider_type,
192 | name=name if name else "",
193 | api_key=api_key,
194 | )
195 | return global_computer
196 |
197 |
198 | def create_agent(
199 | model_string: str,
200 | save_trajectory: bool = True,
201 | only_n_most_recent_images: int = 3,
202 | verbosity: int = logging.INFO,
203 | custom_model_name: Optional[str] = None,
204 | computer_os: str = "macos",
205 | computer_provider: str = "lume",
206 | computer_name: Optional[str] = None,
207 | computer_api_key: Optional[str] = None,
208 | max_trajectory_budget: Optional[float] = None,
209 | ) -> ComputerAgent:
210 | """Create or update the global agent with the specified parameters."""
211 | global global_agent
212 |
213 | # Create the computer
214 | computer = create_computer_instance(
215 | verbosity=verbosity,
216 | os_type=computer_os,
217 | provider_type=computer_provider,
218 | name=computer_name,
219 | api_key=computer_api_key,
220 | )
221 |
222 | # Handle custom models
223 | if model_string == "custom_oaicompat" and custom_model_name:
224 | model_string = custom_model_name
225 | elif model_string == "custom_ollama" and custom_model_name:
226 | model_string = f"omniparser+ollama_chat/{custom_model_name}"
227 |
228 | # Create agent kwargs
229 | agent_kwargs = {
230 | "model": model_string,
231 | "tools": [computer],
232 | "only_n_most_recent_images": only_n_most_recent_images,
233 | "verbosity": verbosity,
234 | }
235 |
236 | if save_trajectory:
237 | agent_kwargs["trajectory_dir"] = "trajectories"
238 |
239 | if max_trajectory_budget:
240 | agent_kwargs["max_trajectory_budget"] = {
241 | "max_budget": max_trajectory_budget,
242 | "raise_error": True,
243 | }
244 |
245 | global_agent = ComputerAgent(**agent_kwargs)
246 | return global_agent
247 |
248 |
249 | def launch_ui():
250 | """Standalone function to launch the Gradio app."""
251 | from agent.ui.gradio.ui_components import create_gradio_ui
252 |
253 | print("Starting Gradio app for CUA Agent...")
254 | demo = create_gradio_ui()
255 | demo.launch(share=False, inbrowser=True)
256 |
257 |
258 | if __name__ == "__main__":
259 | launch_ui()
260 |
```
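A minimal launch sketch for the UI entry point above, assuming the `cua-agent` package with its Gradio UI dependencies is installed; API keys are picked up from the environment or a `.env` file, as the module itself does via `dotenv.load_dotenv()`:

```python
# Launch sketch: builds the Gradio UI defined above and opens it in the
# browser (launch_ui passes share=False, inbrowser=True).
from agent.ui.gradio.app import launch_ui

if __name__ == "__main__":
    launch_ui()
```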
--------------------------------------------------------------------------------
/libs/python/som/som/detection.py:
--------------------------------------------------------------------------------
```python
1 | import logging
2 | from pathlib import Path
3 | from typing import Any, Dict, List, Optional, Tuple
4 |
5 | import numpy as np
6 | import torch
7 | import torchvision
8 | from huggingface_hub import hf_hub_download
9 | from PIL import Image
10 | from ultralytics import YOLO
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | class DetectionProcessor:
16 | """Class for handling YOLO-based icon detection."""
17 |
18 | def __init__(
19 | self,
20 | model_path: Optional[Path] = None,
21 | cache_dir: Optional[Path] = None,
22 | force_device: Optional[str] = None,
23 | ):
24 | """Initialize the detection processor.
25 |
26 | Args:
27 | model_path: Path to YOLOv8 model
28 | cache_dir: Directory to cache downloaded models
29 | force_device: Force specific device (cuda, cpu, mps)
30 | """
31 | self.model_path = model_path
32 | self.cache_dir = cache_dir
33 | self.model = None # type: Any # Will be set to YOLO model in load_model
34 |
35 | # Set device
36 | self.device = "cpu"
37 | if torch.cuda.is_available() and force_device != "cpu":
38 | self.device = "cuda"
39 | elif (
40 | hasattr(torch, "backends")
41 | and hasattr(torch.backends, "mps")
42 | and torch.backends.mps.is_available()
43 | and force_device != "cpu"
44 | ):
45 | self.device = "mps"
46 |
47 | if force_device:
48 | self.device = force_device
49 |
50 | logger.info(f"Using device: {self.device}")
51 |
52 | def load_model(self) -> None:
53 | """Load or download the YOLO model."""
54 | try:
55 | # Set default model path if none provided
56 | if self.model_path is None:
57 | self.model_path = Path(__file__).parent / "weights" / "icon_detect" / "model.pt"
58 |
59 | # Check if the model file already exists
60 | if not self.model_path.exists():
61 | logger.info(
62 | "Model not found locally, downloading from Microsoft OmniParser-v2.0..."
63 | )
64 |
65 | # Create directory
66 | self.model_path.parent.mkdir(parents=True, exist_ok=True)
67 |
68 | try:
69 | # Check if the model exists in cache
70 | cache_path = None
71 | if self.cache_dir:
72 | # Try to find the model in the cache
73 | potential_paths = list(Path(self.cache_dir).glob("**/model.pt"))
74 | if potential_paths:
75 | cache_path = str(potential_paths[0])
76 | logger.info(f"Found model in cache: {cache_path}")
77 |
78 | if not cache_path:
79 | # Download from HuggingFace
80 | downloaded_path = hf_hub_download(
81 | repo_id="microsoft/OmniParser-v2.0",
82 | filename="icon_detect/model.pt",
83 | cache_dir=self.cache_dir,
84 | )
85 | cache_path = downloaded_path
86 | logger.info(f"Model downloaded to cache: {cache_path}")
87 |
88 | # Copy to package directory
89 | import shutil
90 |
91 | shutil.copy2(cache_path, self.model_path)
92 | logger.info(f"Model copied to: {self.model_path}")
93 | except Exception as e:
94 | raise FileNotFoundError(
95 | f"Failed to download model: {str(e)}\n"
96 | "Please ensure you have internet connection and huggingface-hub installed."
97 | ) from e
98 |
99 | # Make sure the model path exists before loading
100 | if not self.model_path.exists():
101 | raise FileNotFoundError(f"Model file not found at: {self.model_path}")
102 |
103 | # If model is already loaded, skip reloading
104 | if self.model is not None:
105 | logger.info("Model already loaded, skipping reload")
106 | return
107 |
108 | logger.info(f"Loading YOLOv8 model from {self.model_path}")
109 | from ultralytics import YOLO
110 |
111 | self.model = YOLO(str(self.model_path)) # Convert Path to string for compatibility
112 |
113 | # Verify model loaded successfully
114 | if self.model is None:
115 | raise ValueError("Model failed to initialize but didn't raise an exception")
116 |
117 | if self.device in ["cuda", "mps"]:
118 | self.model.to(self.device)
119 |
120 | logger.info(f"Model loaded successfully with device: {self.device}")
121 | except Exception as e:
122 | logger.error(f"Failed to load model: {str(e)}")
123 | # Re-raise with more informative message but preserve the model as None
124 | self.model = None
125 | raise RuntimeError(f"Failed to initialize detection model: {str(e)}") from e
126 |
127 | def detect_icons(
128 | self,
129 | image: Image.Image,
130 | box_threshold: float = 0.05,
131 | iou_threshold: float = 0.1,
132 | multi_scale: bool = True,
133 | ) -> List[Dict[str, Any]]:
134 | """Detect icons in an image using YOLO.
135 |
136 | Args:
137 | image: PIL Image to process
138 | box_threshold: Confidence threshold for detection
139 | iou_threshold: IOU threshold for NMS
140 | multi_scale: Whether to use multi-scale detection
141 |
142 | Returns:
143 | List of icon detection dictionaries
144 | """
145 | # Load model if not already loaded
146 | if self.model is None:
147 | self.load_model()
148 |
149 | # Double-check the model was successfully loaded
150 | if self.model is None:
151 | logger.error("Model failed to load and is still None")
152 | return [] # Return empty list instead of crashing
153 |
154 | img_width, img_height = image.size
155 | all_detections = []
156 |
157 | # Define detection scales
158 | scales = (
159 | [{"size": 1280, "conf": box_threshold}] # Single scale for CPU
160 | if self.device == "cpu"
161 | else [
162 | {"size": 640, "conf": box_threshold}, # Base scale
163 | {"size": 1280, "conf": box_threshold}, # Medium scale
164 | {"size": 1920, "conf": box_threshold}, # Large scale
165 | ]
166 | )
167 |
168 | if not multi_scale:
169 | scales = [scales[0]]
170 |
171 | # Run detection at each scale
172 | for scale in scales:
173 | try:
174 | if self.model is None:
175 | logger.error("Model is None, skipping detection")
176 | continue
177 |
178 | results = self.model.predict(
179 | source=image,
180 | conf=scale["conf"],
181 | iou=iou_threshold,
182 | max_det=1000,
183 | verbose=False,
184 | augment=self.device != "cpu",
185 | agnostic_nms=True,
186 | imgsz=scale["size"],
187 | device=self.device,
188 | )
189 |
190 | # Process results
191 | for r in results:
192 | boxes = r.boxes
193 | if not hasattr(boxes, "conf") or not hasattr(boxes, "xyxy"):
194 | logger.warning("Boxes object missing expected attributes")
195 | continue
196 |
197 | confidences = boxes.conf
198 | coords = boxes.xyxy
199 |
200 | # Handle different types of tensors (PyTorch, NumPy, etc.)
201 | if hasattr(confidences, "cpu"):
202 | confidences = confidences.cpu()
203 | if hasattr(coords, "cpu"):
204 | coords = coords.cpu()
205 |
206 | for conf, bbox in zip(confidences, coords):
207 | # Normalize coordinates
208 | x1, y1, x2, y2 = bbox.tolist()
209 | norm_bbox = [
210 | x1 / img_width,
211 | y1 / img_height,
212 | x2 / img_width,
213 | y2 / img_height,
214 | ]
215 |
216 | all_detections.append(
217 | {
218 | "type": "icon",
219 | "confidence": conf.item(),
220 | "bbox": norm_bbox,
221 | "scale": scale["size"],
222 | "interactivity": True,
223 | }
224 | )
225 |
226 | except Exception as e:
227 | logger.warning(f"Detection failed at scale {scale['size']}: {str(e)}")
228 | continue
229 |
230 | # Merge detections using NMS
231 | if len(all_detections) > 0:
232 | boxes = torch.tensor([d["bbox"] for d in all_detections])
233 | scores = torch.tensor([d["confidence"] for d in all_detections])
234 |
235 | keep_indices = torchvision.ops.nms(boxes, scores, iou_threshold)
236 |
237 | merged_detections = [all_detections[i] for i in keep_indices]
238 | else:
239 | merged_detections = []
240 |
241 | return merged_detections
242 |
```
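A short usage sketch for the detector above, assuming the `som` package is importable and a screenshot exists on disk (the file name is a placeholder); `detect_icons` loads or downloads the YOLO weights on first use:

```python
# Usage sketch for DetectionProcessor above. "screenshot.png" is a placeholder;
# thresholds mirror the method's defaults.
from PIL import Image

from som.detection import DetectionProcessor

detector = DetectionProcessor()  # device is auto-selected (cuda/mps/cpu)
image = Image.open("screenshot.png")

detections = detector.detect_icons(image, box_threshold=0.05, iou_threshold=0.1)
for det in detections:
    # bbox is normalized [x1, y1, x2, y2]; confidence comes from the YOLO model
    print(f"{det['confidence']:.2f} {det['bbox']}")
```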