This is page 16 of 28. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── scripts
│ │ ├── get_pyproject_version.py
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── README.md
│ │ └── test_get_pyproject_version.py
│ └── workflows
│ ├── bump-version.yml
│ ├── ci-lume.yml
│ ├── docker-publish-cua-linux.yml
│ ├── docker-publish-cua-windows.yml
│ ├── docker-publish-kasm.yml
│ ├── docker-publish-xfce.yml
│ ├── docker-reusable-publish.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── npm-publish-cli.yml
│ ├── npm-publish-computer.yml
│ ├── npm-publish-core.yml
│ ├── publish-lume.yml
│ ├── pypi-publish-agent.yml
│ ├── pypi-publish-computer-server.yml
│ ├── pypi-publish-computer.yml
│ ├── pypi-publish-core.yml
│ ├── pypi-publish-mcp-server.yml
│ ├── pypi-publish-som.yml
│ ├── pypi-reusable-publish.yml
│ ├── python-tests.yml
│ ├── test-cua-models.yml
│ └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│ ├── docs.code-workspace
│ ├── extensions.json
│ ├── launch.json
│ ├── libs-ts.code-workspace
│ ├── lume.code-workspace
│ ├── lumier.code-workspace
│ ├── py.code-workspace
│ └── settings.json
├── blog
│ ├── app-use.md
│ ├── assets
│ │ ├── composite-agents.png
│ │ ├── docker-ubuntu-support.png
│ │ ├── hack-booth.png
│ │ ├── hack-closing-ceremony.jpg
│ │ ├── hack-cua-ollama-hud.jpeg
│ │ ├── hack-leaderboard.png
│ │ ├── hack-the-north.png
│ │ ├── hack-winners.jpeg
│ │ ├── hack-workshop.jpeg
│ │ ├── hud-agent-evals.png
│ │ └── trajectory-viewer.jpeg
│ ├── bringing-computer-use-to-the-web.md
│ ├── build-your-own-operator-on-macos-1.md
│ ├── build-your-own-operator-on-macos-2.md
│ ├── cloud-windows-ga-macos-preview.md
│ ├── composite-agents.md
│ ├── computer-use-agents-for-growth-hacking.md
│ ├── cua-hackathon.md
│ ├── cua-playground-preview.md
│ ├── cua-vlm-router.md
│ ├── hack-the-north.md
│ ├── hud-agent-evals.md
│ ├── human-in-the-loop.md
│ ├── introducing-cua-cli.md
│ ├── introducing-cua-cloud-containers.md
│ ├── lume-to-containerization.md
│ ├── neurips-2025-cua-papers.md
│ ├── sandboxed-python-execution.md
│ ├── training-computer-use-models-trajectories-1.md
│ ├── trajectory-viewer.md
│ ├── ubuntu-docker-support.md
│ └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│ ├── .env.example
│ ├── .gitignore
│ ├── content
│ │ └── docs
│ │ ├── agent-sdk
│ │ │ ├── agent-loops.mdx
│ │ │ ├── benchmarks
│ │ │ │ ├── index.mdx
│ │ │ │ ├── interactive.mdx
│ │ │ │ ├── introduction.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── osworld-verified.mdx
│ │ │ │ ├── screenspot-pro.mdx
│ │ │ │ └── screenspot-v2.mdx
│ │ │ ├── callbacks
│ │ │ │ ├── agent-lifecycle.mdx
│ │ │ │ ├── cost-saving.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── pii-anonymization.mdx
│ │ │ │ └── trajectories.mdx
│ │ │ ├── chat-history.mdx
│ │ │ ├── custom-tools.mdx
│ │ │ ├── customizing-computeragent.mdx
│ │ │ ├── integrations
│ │ │ │ ├── hud.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── observability.mdx
│ │ │ ├── mcp-server
│ │ │ │ ├── client-integrations.mdx
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── llm-integrations.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── tools.mdx
│ │ │ │ └── usage.mdx
│ │ │ ├── message-format.mdx
│ │ │ ├── meta.json
│ │ │ ├── migration-guide.mdx
│ │ │ ├── prompt-caching.mdx
│ │ │ ├── supported-agents
│ │ │ │ ├── composed-agents.mdx
│ │ │ │ ├── computer-use-agents.mdx
│ │ │ │ ├── grounding-models.mdx
│ │ │ │ ├── human-in-the-loop.mdx
│ │ │ │ └── meta.json
│ │ │ ├── supported-model-providers
│ │ │ │ ├── cua-vlm-router.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ └── local-models.mdx
│ │ │ ├── telemetry.mdx
│ │ │ └── usage-tracking.mdx
│ │ ├── cli-playbook
│ │ │ ├── commands.mdx
│ │ │ ├── index.mdx
│ │ │ └── meta.json
│ │ ├── computer-sdk
│ │ │ ├── cloud-vm-management.mdx
│ │ │ ├── commands.mdx
│ │ │ ├── computer-server
│ │ │ │ ├── Commands.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── REST-API.mdx
│ │ │ │ └── WebSocket-API.mdx
│ │ │ ├── computer-ui.mdx
│ │ │ ├── computers.mdx
│ │ │ ├── custom-computer-handlers.mdx
│ │ │ ├── meta.json
│ │ │ ├── sandboxed-python.mdx
│ │ │ └── tracing-api.mdx
│ │ ├── example-usecases
│ │ │ ├── form-filling.mdx
│ │ │ ├── gemini-complex-ui-navigation.mdx
│ │ │ ├── meta.json
│ │ │ ├── post-event-contact-export.mdx
│ │ │ └── windows-app-behind-vpn.mdx
│ │ ├── get-started
│ │ │ ├── meta.json
│ │ │ └── quickstart.mdx
│ │ ├── index.mdx
│ │ ├── macos-vm-cli-playbook
│ │ │ ├── lume
│ │ │ │ ├── cli-reference.mdx
│ │ │ │ ├── faq.md
│ │ │ │ ├── http-api.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── prebuilt-images.mdx
│ │ │ ├── lumier
│ │ │ │ ├── building-lumier.mdx
│ │ │ │ ├── docker-compose.mdx
│ │ │ │ ├── docker.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ └── meta.json
│ │ │ └── meta.json
│ │ └── meta.json
│ ├── next.config.mjs
│ ├── package-lock.json
│ ├── package.json
│ ├── pnpm-lock.yaml
│ ├── postcss.config.mjs
│ ├── public
│ │ └── img
│ │ ├── agent_gradio_ui.png
│ │ ├── agent.png
│ │ ├── bg-dark.jpg
│ │ ├── bg-light.jpg
│ │ ├── cli.png
│ │ ├── computer.png
│ │ ├── grounding-with-gemini3.gif
│ │ ├── hero.png
│ │ ├── laminar_trace_example.png
│ │ ├── som_box_threshold.png
│ │ └── som_iou_threshold.png
│ ├── README.md
│ ├── source.config.ts
│ ├── src
│ │ ├── app
│ │ │ ├── (home)
│ │ │ │ ├── [[...slug]]
│ │ │ │ │ └── page.tsx
│ │ │ │ └── layout.tsx
│ │ │ ├── api
│ │ │ │ ├── posthog
│ │ │ │ │ └── [...path]
│ │ │ │ │ └── route.ts
│ │ │ │ └── search
│ │ │ │ └── route.ts
│ │ │ ├── favicon.ico
│ │ │ ├── global.css
│ │ │ ├── layout.config.tsx
│ │ │ ├── layout.tsx
│ │ │ ├── llms.mdx
│ │ │ │ └── [[...slug]]
│ │ │ │ └── route.ts
│ │ │ ├── llms.txt
│ │ │ │ └── route.ts
│ │ │ ├── robots.ts
│ │ │ └── sitemap.ts
│ │ ├── assets
│ │ │ ├── discord-black.svg
│ │ │ ├── discord-white.svg
│ │ │ ├── logo-black.svg
│ │ │ └── logo-white.svg
│ │ ├── components
│ │ │ ├── analytics-tracker.tsx
│ │ │ ├── cookie-consent.tsx
│ │ │ ├── doc-actions-menu.tsx
│ │ │ ├── editable-code-block.tsx
│ │ │ ├── footer.tsx
│ │ │ ├── hero.tsx
│ │ │ ├── iou.tsx
│ │ │ ├── mermaid.tsx
│ │ │ └── page-feedback.tsx
│ │ ├── lib
│ │ │ ├── llms.ts
│ │ │ └── source.ts
│ │ ├── mdx-components.tsx
│ │ └── providers
│ │ └── posthog-provider.tsx
│ └── tsconfig.json
├── examples
│ ├── agent_examples.py
│ ├── agent_ui_examples.py
│ ├── browser_tool_example.py
│ ├── cloud_api_examples.py
│ ├── computer_examples_windows.py
│ ├── computer_examples.py
│ ├── computer_ui_examples.py
│ ├── computer-example-ts
│ │ ├── .env.example
│ │ ├── .gitignore
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── README.md
│ │ ├── src
│ │ │ ├── helpers.ts
│ │ │ └── index.ts
│ │ └── tsconfig.json
│ ├── docker_examples.py
│ ├── evals
│ │ ├── hud_eval_examples.py
│ │ └── wikipedia_most_linked.txt
│ ├── pylume_examples.py
│ ├── sandboxed_functions_examples.py
│ ├── som_examples.py
│ ├── tracing_examples.py
│ ├── utils.py
│ └── winsandbox_example.py
├── img
│ ├── agent_gradio_ui.png
│ ├── agent.png
│ ├── cli.png
│ ├── computer.png
│ ├── logo_black.png
│ └── logo_white.png
├── libs
│ ├── kasm
│ │ ├── Dockerfile
│ │ ├── LICENSE
│ │ ├── README.md
│ │ └── src
│ │ └── ubuntu
│ │ └── install
│ │ └── firefox
│ │ ├── custom_startup.sh
│ │ ├── firefox.desktop
│ │ └── install_firefox.sh
│ ├── lume
│ │ ├── .cursorignore
│ │ ├── CONTRIBUTING.md
│ │ ├── Development.md
│ │ ├── img
│ │ │ └── cli.png
│ │ ├── Package.resolved
│ │ ├── Package.swift
│ │ ├── README.md
│ │ ├── resources
│ │ │ └── lume.entitlements
│ │ ├── scripts
│ │ │ ├── build
│ │ │ │ ├── build-debug.sh
│ │ │ │ ├── build-release-notarized.sh
│ │ │ │ └── build-release.sh
│ │ │ └── install.sh
│ │ ├── src
│ │ │ ├── Commands
│ │ │ │ ├── Clone.swift
│ │ │ │ ├── Config.swift
│ │ │ │ ├── Create.swift
│ │ │ │ ├── Delete.swift
│ │ │ │ ├── Get.swift
│ │ │ │ ├── Images.swift
│ │ │ │ ├── IPSW.swift
│ │ │ │ ├── List.swift
│ │ │ │ ├── Logs.swift
│ │ │ │ ├── Options
│ │ │ │ │ └── FormatOption.swift
│ │ │ │ ├── Prune.swift
│ │ │ │ ├── Pull.swift
│ │ │ │ ├── Push.swift
│ │ │ │ ├── Run.swift
│ │ │ │ ├── Serve.swift
│ │ │ │ ├── Set.swift
│ │ │ │ └── Stop.swift
│ │ │ ├── ContainerRegistry
│ │ │ │ ├── ImageContainerRegistry.swift
│ │ │ │ ├── ImageList.swift
│ │ │ │ └── ImagesPrinter.swift
│ │ │ ├── Errors
│ │ │ │ └── Errors.swift
│ │ │ ├── FileSystem
│ │ │ │ ├── Home.swift
│ │ │ │ ├── Settings.swift
│ │ │ │ ├── VMConfig.swift
│ │ │ │ ├── VMDirectory.swift
│ │ │ │ └── VMLocation.swift
│ │ │ ├── LumeController.swift
│ │ │ ├── Main.swift
│ │ │ ├── Server
│ │ │ │ ├── Handlers.swift
│ │ │ │ ├── HTTP.swift
│ │ │ │ ├── Requests.swift
│ │ │ │ ├── Responses.swift
│ │ │ │ └── Server.swift
│ │ │ ├── Utils
│ │ │ │ ├── CommandRegistry.swift
│ │ │ │ ├── CommandUtils.swift
│ │ │ │ ├── Logger.swift
│ │ │ │ ├── NetworkUtils.swift
│ │ │ │ ├── Path.swift
│ │ │ │ ├── ProcessRunner.swift
│ │ │ │ ├── ProgressLogger.swift
│ │ │ │ ├── String.swift
│ │ │ │ └── Utils.swift
│ │ │ ├── Virtualization
│ │ │ │ ├── DarwinImageLoader.swift
│ │ │ │ ├── DHCPLeaseParser.swift
│ │ │ │ ├── ImageLoaderFactory.swift
│ │ │ │ └── VMVirtualizationService.swift
│ │ │ ├── VM
│ │ │ │ ├── DarwinVM.swift
│ │ │ │ ├── LinuxVM.swift
│ │ │ │ ├── VM.swift
│ │ │ │ ├── VMDetails.swift
│ │ │ │ ├── VMDetailsPrinter.swift
│ │ │ │ ├── VMDisplayResolution.swift
│ │ │ │ └── VMFactory.swift
│ │ │ └── VNC
│ │ │ ├── PassphraseGenerator.swift
│ │ │ └── VNCService.swift
│ │ └── tests
│ │ ├── Mocks
│ │ │ ├── MockVM.swift
│ │ │ ├── MockVMVirtualizationService.swift
│ │ │ └── MockVNCService.swift
│ │ ├── VM
│ │ │ └── VMDetailsPrinterTests.swift
│ │ ├── VMTests.swift
│ │ ├── VMVirtualizationServiceTests.swift
│ │ └── VNCServiceTests.swift
│ ├── lumier
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── bin
│ │ │ └── entry.sh
│ │ ├── config
│ │ │ └── constants.sh
│ │ ├── hooks
│ │ │ └── on-logon.sh
│ │ └── lib
│ │ ├── utils.sh
│ │ └── vm.sh
│ ├── python
│ │ ├── agent
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── agent
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── adapters
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cua_adapter.py
│ │ │ │ │ ├── huggingfacelocal_adapter.py
│ │ │ │ │ ├── human_adapter.py
│ │ │ │ │ ├── mlxvlm_adapter.py
│ │ │ │ │ └── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ └── qwen2_5_vl.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── callbacks
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── budget_manager.py
│ │ │ │ │ ├── image_retention.py
│ │ │ │ │ ├── logging.py
│ │ │ │ │ ├── operator_validator.py
│ │ │ │ │ ├── pii_anonymization.py
│ │ │ │ │ ├── prompt_instructions.py
│ │ │ │ │ ├── telemetry.py
│ │ │ │ │ └── trajectory_saver.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── computers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cua.py
│ │ │ │ │ └── custom.py
│ │ │ │ ├── decorators.py
│ │ │ │ ├── human_tool
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── server.py
│ │ │ │ │ └── ui.py
│ │ │ │ ├── integrations
│ │ │ │ │ └── hud
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── agent.py
│ │ │ │ │ └── proxy.py
│ │ │ │ ├── loops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── anthropic.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── composed_grounded.py
│ │ │ │ │ ├── gelato.py
│ │ │ │ │ ├── gemini.py
│ │ │ │ │ ├── generic_vlm.py
│ │ │ │ │ ├── glm45v.py
│ │ │ │ │ ├── gta1.py
│ │ │ │ │ ├── holo.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── model_types.csv
│ │ │ │ │ ├── moondream3.py
│ │ │ │ │ ├── omniparser.py
│ │ │ │ │ ├── openai.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ ├── uiins.py
│ │ │ │ │ ├── uitars.py
│ │ │ │ │ └── uitars2.py
│ │ │ │ ├── proxy
│ │ │ │ │ ├── examples.py
│ │ │ │ │ └── handlers.py
│ │ │ │ ├── responses.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── browser_tool.py
│ │ │ │ ├── types.py
│ │ │ │ └── ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ └── gradio
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ └── ui_components.py
│ │ │ ├── benchmarks
│ │ │ │ ├── .gitignore
│ │ │ │ ├── contrib.md
│ │ │ │ ├── interactive.py
│ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── gta1.py
│ │ │ │ ├── README.md
│ │ │ │ ├── ss-pro.py
│ │ │ │ ├── ss-v2.py
│ │ │ │ └── utils.py
│ │ │ ├── example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer_agent.py
│ │ ├── bench-ui
│ │ │ ├── bench_ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ └── child.py
│ │ │ ├── examples
│ │ │ │ ├── folder_example.py
│ │ │ │ ├── gui
│ │ │ │ │ ├── index.html
│ │ │ │ │ ├── logo.svg
│ │ │ │ │ └── styles.css
│ │ │ │ ├── output_overlay.png
│ │ │ │ └── simple_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ └── test_port_detection.py
│ │ ├── computer
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── computer.py
│ │ │ │ ├── diorama_computer.py
│ │ │ │ ├── helpers.py
│ │ │ │ ├── interface
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── models.py
│ │ │ │ ├── providers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cloud
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── docker
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── lume
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── lume_api.py
│ │ │ │ │ ├── lumier
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── types.py
│ │ │ │ │ └── winsandbox
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── provider.py
│ │ │ │ │ └── setup_script.ps1
│ │ │ │ ├── tracing_wrapper.py
│ │ │ │ ├── tracing.py
│ │ │ │ ├── ui
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ └── gradio
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── app.py
│ │ │ │ └── utils.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ ├── test_computer.py
│ │ │ └── test_helpers.py
│ │ ├── computer-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── browser.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── diorama
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── diorama_computer.py
│ │ │ │ │ ├── diorama.py
│ │ │ │ │ ├── draw.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── safezone.py
│ │ │ │ ├── handlers
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── main.py
│ │ │ │ ├── server.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wallpaper.py
│ │ │ │ └── watchdog.py
│ │ │ ├── examples
│ │ │ │ ├── __init__.py
│ │ │ │ └── usage_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ ├── run_server.py
│ │ │ ├── test_connection.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_server.py
│ │ ├── core
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── core
│ │ │ │ ├── __init__.py
│ │ │ │ └── telemetry
│ │ │ │ ├── __init__.py
│ │ │ │ └── posthog.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_telemetry.py
│ │ ├── mcp-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── build-extension.py
│ │ │ ├── CONCURRENT_SESSIONS.md
│ │ │ ├── desktop-extension
│ │ │ │ ├── cua-extension.mcpb
│ │ │ │ ├── desktop_extension.png
│ │ │ │ ├── manifest.json
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_server.sh
│ │ │ │ └── setup.py
│ │ │ ├── mcp_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── server.py
│ │ │ │ └── session_manager.py
│ │ │ ├── pdm.lock
│ │ │ ├── pyproject.toml
│ │ │ ├── QUICK_TEST_COMMANDS.sh
│ │ │ ├── quick_test_local_option.py
│ │ │ ├── README.md
│ │ │ ├── scripts
│ │ │ │ ├── install_mcp_server.sh
│ │ │ │ └── start_mcp_server.sh
│ │ │ ├── test_mcp_server_local_option.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_mcp_server.py
│ │ ├── pylume
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_pylume.py
│ │ └── som
│ │ ├── .bumpversion.cfg
│ │ ├── LICENSE
│ │ ├── poetry.toml
│ │ ├── pyproject.toml
│ │ ├── README.md
│ │ ├── som
│ │ │ ├── __init__.py
│ │ │ ├── detect.py
│ │ │ ├── detection.py
│ │ │ ├── models.py
│ │ │ ├── ocr.py
│ │ │ ├── util
│ │ │ │ └── utils.py
│ │ │ └── visualization.py
│ │ └── tests
│ │ ├── conftest.py
│ │ └── test_omniparser.py
│ ├── qemu-docker
│ │ ├── linux
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── src
│ │ │ ├── entry.sh
│ │ │ └── vm
│ │ │ ├── image
│ │ │ │ └── README.md
│ │ │ └── setup
│ │ │ ├── install.sh
│ │ │ ├── setup-cua-server.sh
│ │ │ └── setup.sh
│ │ ├── README.md
│ │ └── windows
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── entry.sh
│ │ └── vm
│ │ ├── image
│ │ │ └── README.md
│ │ └── setup
│ │ ├── install.bat
│ │ ├── on-logon.ps1
│ │ ├── setup-cua-server.ps1
│ │ ├── setup-utils.psm1
│ │ └── setup.ps1
│ ├── typescript
│ │ ├── .gitignore
│ │ ├── .nvmrc
│ │ ├── agent
│ │ │ ├── examples
│ │ │ │ ├── playground-example.html
│ │ │ │ └── README.md
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── client.ts
│ │ │ │ ├── index.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ └── client.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── computer
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── computer
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── providers
│ │ │ │ │ │ ├── base.ts
│ │ │ │ │ │ ├── cloud.ts
│ │ │ │ │ │ └── index.ts
│ │ │ │ │ └── types.ts
│ │ │ │ ├── index.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── base.ts
│ │ │ │ │ ├── factory.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── linux.ts
│ │ │ │ │ ├── macos.ts
│ │ │ │ │ └── windows.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ ├── computer
│ │ │ │ │ └── cloud.test.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── factory.test.ts
│ │ │ │ │ ├── index.test.ts
│ │ │ │ │ ├── linux.test.ts
│ │ │ │ │ ├── macos.test.ts
│ │ │ │ │ └── windows.test.ts
│ │ │ │ └── setup.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── core
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── index.ts
│ │ │ │ └── telemetry
│ │ │ │ ├── clients
│ │ │ │ │ ├── index.ts
│ │ │ │ │ └── posthog.ts
│ │ │ │ └── index.ts
│ │ │ ├── tests
│ │ │ │ └── telemetry.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── cua-cli
│ │ │ ├── .gitignore
│ │ │ ├── .prettierrc
│ │ │ ├── bun.lock
│ │ │ ├── CLAUDE.md
│ │ │ ├── index.ts
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── auth.ts
│ │ │ │ ├── cli.ts
│ │ │ │ ├── commands
│ │ │ │ │ ├── auth.ts
│ │ │ │ │ └── sandbox.ts
│ │ │ │ ├── config.ts
│ │ │ │ ├── http.ts
│ │ │ │ ├── storage.ts
│ │ │ │ └── util.ts
│ │ │ └── tsconfig.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── pnpm-workspace.yaml
│ │ └── README.md
│ └── xfce
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Development.md
│ ├── Dockerfile
│ ├── Dockerfile.dev
│ ├── README.md
│ └── src
│ ├── scripts
│ │ ├── resize-display.sh
│ │ ├── start-computer-server.sh
│ │ ├── start-novnc.sh
│ │ ├── start-vnc.sh
│ │ └── xstartup.sh
│ ├── supervisor
│ │ └── supervisord.conf
│ └── xfce-config
│ ├── helpers.rc
│ ├── xfce4-power-manager.xml
│ └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│ ├── agent_nb.ipynb
│ ├── blog
│ │ ├── build-your-own-operator-on-macos-1.ipynb
│ │ └── build-your-own-operator-on-macos-2.ipynb
│ ├── composite_agents_docker_nb.ipynb
│ ├── computer_nb.ipynb
│ ├── computer_server_nb.ipynb
│ ├── customizing_computeragent.ipynb
│ ├── eval_osworld.ipynb
│ ├── ollama_nb.ipynb
│ ├── README.md
│ ├── sota_hackathon_cloud.ipynb
│ └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── install-cli.ps1
│ ├── install-cli.sh
│ ├── playground-docker.sh
│ ├── playground.sh
│ ├── run-docker-dev.sh
│ └── typescript-typecheck.js
├── TESTING.md
├── tests
│ ├── agent_loop_testing
│ │ ├── agent_test.py
│ │ └── README.md
│ ├── pytest.ini
│ ├── shell_cmd.py
│ ├── test_files.py
│ ├── test_mcp_server_session_management.py
│ ├── test_mcp_server_streaming.py
│ ├── test_shell_bash.py
│ ├── test_telemetry.py
│ ├── test_tracing.py
│ ├── test_venv.py
│ └── test_watchdog.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/docs/content/docs/example-usecases/post-event-contact-export.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Post-Event Contact Export
3 | description: Run overnight contact extraction from LinkedIn, X, or other social platforms after networking events
4 | ---
5 |
6 | import { Step, Steps } from 'fumadocs-ui/components/steps';
7 | import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
8 |
9 | ## Overview
10 |
11 | After networking events, you need to export new connections from LinkedIn, X, or other platforms into your CRM. This automation handles it for you.
12 |
13 | **The workflow**: Kick off the script after an event and let it run overnight. Wake up to a clean CSV ready for your CRM or email tool.
14 |
15 | This example focuses on LinkedIn but works across platforms. It uses [Cua Computer](/computer-sdk/computers) to interact with web interfaces and [Agent Loops](/agent-sdk/agent-loops) to iterate through connections with conversation history.
16 |
17 | ### Why Cua is Perfect for This
18 |
19 | **Cua's VMs save your session data**, bypassing bot detection entirely:
20 |
21 | - **Log in once manually** through the VM browser
22 | - **Session persists** - you appear as a regular user, not a bot
23 | - **No captchas** - the platform treats automation like normal browsing
24 | - **No login code** - script doesn't handle authentication
25 | - **Run overnight** - kick off and forget
26 |
27 | Traditional web scraping triggers anti-bot measures immediately. Cua's approach works across all platforms.
28 |
29 | ### What You Get
30 |
31 | The script generates two files with your extracted connections:
32 |
33 | **CSV Export** (`linkedin_connections_20250116_143022.csv`):
34 |
35 | ```csv
36 | first,last,role,company,met_at,linkedin
37 | John,Smith,Software Engineer,Acme Corp,Google Devfest Toronto,https://www.linkedin.com/in/johnsmith
38 | Sarah,Johnson,Product Manager,Tech Inc,Google Devfest Toronto,https://www.linkedin.com/in/sarahjohnson
39 | ```
40 |
41 | **Messaging Links** (`linkedin_messaging_links_20250116_143022.txt`):
42 |
43 | ```
44 | LinkedIn Messaging Compose Links
45 | ================================================================================
46 |
47 | 1. https://www.linkedin.com/messaging/compose/?recipient=johnsmith
48 | 2. https://www.linkedin.com/messaging/compose/?recipient=sarahjohnson
49 | ```
50 |
51 | ---
52 |
53 | <Steps>
54 |
55 | <Step>
56 |
57 | ### Set Up Your Environment
58 |
59 | First, install the required dependencies:
60 |
61 | Create a `requirements.txt` file:
62 |
63 | ```text
64 | cua-agent
65 | cua-computer
66 | python-dotenv>=1.0.0
67 | ```
68 |
69 | Install the dependencies:
70 |
71 | ```bash
72 | pip install -r requirements.txt
73 | ```
74 |
75 | Create a `.env` file with your API keys:
76 |
77 | ```text
78 | ANTHROPIC_API_KEY=your-anthropic-api-key # optional, BYOK. By default, this cookbook uses the CUA VLM Router
79 | CUA_API_KEY=sk_cua-api01...
80 | CUA_CONTAINER_NAME=m-linux-...
81 | ```
82 |
 83 | Finally, set up your VM. Refer to the [quickstart guide](https://cua.ai/docs/get-started/quickstart) for how to set up the computer environment.
84 | </Step>
85 |
86 | <Step>
87 |
88 | ### Log Into LinkedIn Manually
89 |
90 | **Important**: Before running the script, manually log into LinkedIn through your VM:
91 |
92 | 1. Access your VM through the Cua dashboard
93 | 2. Open a browser and navigate to LinkedIn
94 | 3. Log in with your credentials (handle any captchas manually)
95 | 4. Close the browser but leave the VM running
96 | 5. Your session is now saved and ready for automation!
97 |
98 | This one-time manual login bypasses all bot detection.
99 |
100 | </Step>
101 |
102 | <Step>
103 |
104 | ### Configure and Create Your Script
105 |
106 | Create a Python file (e.g., `contact_export.py`). You can customize:
107 |
108 | ```python
109 | # Where you met these connections (automatically added to CSV)
110 | MET_AT_REASON = "Google Devfest Toronto"
111 |
112 | # Number of contacts to extract (in the main loop)
113 | for contact_num in range(1, 21): # Change 21 to extract more/fewer contacts
114 | ```
115 |
116 | Select your environment:
117 |
118 | <Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox']}>
119 | <Tab value="Cloud Sandbox">
120 |
121 | ```python
122 | import asyncio
123 | import csv
124 | import logging
125 | import os
126 | import signal
127 | import traceback
128 | from datetime import datetime
129 |
130 | from agent import ComputerAgent
131 | from computer import Computer, VMProviderType
132 | from dotenv import load_dotenv
133 |
134 | logging.basicConfig(level=logging.INFO)
135 | logger = logging.getLogger(__name__)
136 |
137 | # Configuration: Define where you met these connections
138 | MET_AT_REASON = "Google Devfest Toronto"
139 |
140 | def handle_sigint(sig, frame):
141 | print("\n\nExecution interrupted by user. Exiting gracefully...")
142 |     raise SystemExit(0)
143 |
144 | def extract_public_id_from_linkedin_url(linkedin_url):
145 | """Extract public ID from LinkedIn profile URL."""
146 | if not linkedin_url:
147 | return None
148 |
149 | url = linkedin_url.split('?')[0].rstrip('/')
150 |
151 | if '/in/' in url:
152 | public_id = url.split('/in/')[-1]
153 | return public_id
154 |
155 | return None
156 |
157 | def extract_contact_from_response(result_output):
158 | """
159 | Extract contact information from agent's response.
160 | Expects format:
161 | FIRST: value
162 | LAST: value
163 | ROLE: value
164 | COMPANY: value
165 | LINKEDIN: value
166 | """
167 | contact = {
168 | 'first': '',
169 | 'last': '',
170 | 'role': '',
171 | 'company': '',
172 | 'met_at': MET_AT_REASON,
173 | 'linkedin': ''
174 | }
175 |
176 | for item in result_output:
177 | if item.get("type") == "message":
178 | content = item.get("content", [])
179 | for content_part in content:
180 | text = content_part.get("text", "")
181 | if text:
182 | for line in text.split('\n'):
183 | line = line.strip()
184 | line_upper = line.upper()
185 |
186 | if line_upper.startswith("FIRST:"):
187 | value = line[6:].strip()
188 | if value and value.upper() != "N/A":
189 | contact['first'] = value
190 | elif line_upper.startswith("LAST:"):
191 | value = line[5:].strip()
192 | if value and value.upper() != "N/A":
193 | contact['last'] = value
194 | elif line_upper.startswith("ROLE:"):
195 | value = line[5:].strip()
196 | if value and value.upper() != "N/A":
197 | contact['role'] = value
198 | elif line_upper.startswith("COMPANY:"):
199 | value = line[8:].strip()
200 | if value and value.upper() != "N/A":
201 | contact['company'] = value
202 | elif line_upper.startswith("LINKEDIN:"):
203 | value = line[9:].strip()
204 | if value and value.upper() != "N/A":
205 | contact['linkedin'] = value
206 |
207 | return contact
208 |
209 | async def scrape_linkedin_connections():
210 | """Scrape LinkedIn connections and export to CSV."""
211 |
212 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
213 | csv_filename = f"linkedin_connections_{timestamp}.csv"
214 | csv_path = os.path.join(os.getcwd(), csv_filename)
215 |
216 | # Initialize CSV file
217 | with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
218 | writer = csv.DictWriter(csvfile, fieldnames=['first', 'last', 'role', 'company', 'met_at', 'linkedin'])
219 | writer.writeheader()
220 |
221 | print(f"\n🚀 Starting LinkedIn connections scraper")
222 | print(f"📁 Output file: {csv_path}")
223 | print(f"📍 Met at: {MET_AT_REASON}")
224 | print("=" * 80)
225 |
226 | try:
227 | async with Computer(
228 | os_type="linux",
229 | provider_type=VMProviderType.CLOUD,
230 | name=os.environ["CUA_CONTAINER_NAME"], # Your sandbox name
231 | api_key=os.environ["CUA_API_KEY"],
232 | verbosity=logging.INFO,
233 | ) as computer:
234 |
235 | agent = ComputerAgent(
236 | model="cua/anthropic/claude-sonnet-4.5",
237 | tools=[computer],
238 | only_n_most_recent_images=3,
239 | verbosity=logging.INFO,
240 | trajectory_dir="trajectories",
241 | use_prompt_caching=True,
242 | max_trajectory_budget=10.0,
243 | )
244 |
245 | history = []
246 |
247 | # Task 1: Navigate to LinkedIn connections page
248 | navigation_task = (
249 | "STEP 1 - NAVIGATE TO LINKEDIN CONNECTIONS PAGE:\n"
250 | "1. Open a web browser (Chrome or Firefox)\n"
251 | "2. Navigate to https://www.linkedin.com/mynetwork/invite-connect/connections/\n"
252 | "3. Wait for the page to fully load\n"
253 | "4. Confirm you can see the list of connections\n"
254 | "5. Ready to start extracting contacts"
255 | )
256 |
257 | print(f"\n[Task 1/21] Navigating to LinkedIn...")
258 | history.append({"role": "user", "content": navigation_task})
259 |
260 | async for result in agent.run(history, stream=False):
261 | history += result.get("output", [])
262 |
263 | print(f"✅ Navigation completed\n")
264 |
265 | # Extract 20 contacts
266 | contacts_extracted = 0
267 | linkedin_urls = []
268 | previous_contact_name = None
269 |
270 | for contact_num in range(1, 21):
271 | # Build extraction task
272 | if contact_num == 1:
273 | extraction_task = (
274 | f"STEP {contact_num + 1} - EXTRACT CONTACT {contact_num} OF 20:\n"
275 | f"1. Click on the first connection's profile\n"
276 | f"2. Extract: FIRST, LAST, ROLE, COMPANY, LINKEDIN URL\n"
277 | f"3. Return in exact format:\n"
278 | f"FIRST: [value]\n"
279 | f"LAST: [value]\n"
280 | f"ROLE: [value]\n"
281 | f"COMPANY: [value]\n"
282 | f"LINKEDIN: [value]\n"
283 | f"4. Navigate back to connections list"
284 | )
285 | else:
286 | extraction_task = (
287 | f"STEP {contact_num + 1} - EXTRACT CONTACT {contact_num} OF 20:\n"
288 | f"1. Find '{previous_contact_name}' in the list\n"
289 | f"2. Click on the contact BELOW them\n"
290 | f"3. Extract: FIRST, LAST, ROLE, COMPANY, LINKEDIN URL\n"
291 | f"4. Return in exact format:\n"
292 | f"FIRST: [value]\n"
293 | f"LAST: [value]\n"
294 | f"ROLE: [value]\n"
295 | f"COMPANY: [value]\n"
296 | f"LINKEDIN: [value]\n"
297 | f"5. Navigate back"
298 | )
299 |
300 | print(f"[Task {contact_num + 1}/21] Extracting contact {contact_num}/20...")
301 | history.append({"role": "user", "content": extraction_task})
302 |
303 | all_output = []
304 | async for result in agent.run(history, stream=False):
305 | output = result.get("output", [])
306 | history += output
307 | all_output.extend(output)
308 |
309 | contact_data = extract_contact_from_response(all_output)
310 |
311 | has_name = bool(contact_data['first'] and contact_data['last'])
312 | has_linkedin = bool(contact_data['linkedin'] and 'linkedin.com' in contact_data['linkedin'])
313 |
314 | if has_name or has_linkedin:
315 | with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
316 | writer = csv.DictWriter(csvfile, fieldnames=['first', 'last', 'role', 'company', 'met_at', 'linkedin'])
317 | writer.writerow(contact_data)
318 | contacts_extracted += 1
319 |
320 | if contact_data['linkedin']:
321 | linkedin_urls.append(contact_data['linkedin'])
322 |
323 | if has_name:
324 | previous_contact_name = f"{contact_data['first']} {contact_data['last']}".strip()
325 |
326 | name_str = f"{contact_data['first']} {contact_data['last']}" if has_name else "[No name]"
327 | print(f"✅ Contact {contact_num}/20 saved: {name_str}")
328 | else:
329 | print(f"⚠️ Could not extract valid data for contact {contact_num}")
330 |
331 | if contact_num % 5 == 0:
332 | print(f"\n📈 Progress: {contacts_extracted}/{contact_num} contacts extracted\n")
333 |
334 | # Create messaging links file
335 | messaging_filename = f"linkedin_messaging_links_{timestamp}.txt"
336 | messaging_path = os.path.join(os.getcwd(), messaging_filename)
337 |
338 | with open(messaging_path, 'w', encoding='utf-8') as txtfile:
339 | txtfile.write("LinkedIn Messaging Compose Links\n")
340 | txtfile.write("=" * 80 + "\n\n")
341 |
342 | for i, linkedin_url in enumerate(linkedin_urls, 1):
343 | public_id = extract_public_id_from_linkedin_url(linkedin_url)
344 | if public_id:
345 | messaging_url = f"https://www.linkedin.com/messaging/compose/?recipient={public_id}"
346 | txtfile.write(f"{i}. {messaging_url}\n")
347 |
348 | print("\n" + "="*80)
349 | print("🎉 All tasks completed!")
350 | print(f"📁 CSV file saved to: {csv_path}")
351 | print(f"📊 Total contacts extracted: {contacts_extracted}/20")
352 | print(f"💬 Messaging links saved to: {messaging_path}")
353 | print("="*80)
354 |
355 | except Exception as e:
356 | print(f"\n❌ Error: {e}")
357 | traceback.print_exc()
358 | raise
359 |
360 | def main():
361 | try:
362 | load_dotenv()
363 |
364 |         # ANTHROPIC_API_KEY is optional (BYOK); by default this cookbook
365 |         # routes the cua/ model prefix through the Cua VLM Router
366 |
367 | if "CUA_API_KEY" not in os.environ:
368 | raise RuntimeError("Please set CUA_API_KEY in .env")
369 |
370 | if "CUA_CONTAINER_NAME" not in os.environ:
371 | raise RuntimeError("Please set CUA_CONTAINER_NAME in .env")
372 |
373 | signal.signal(signal.SIGINT, handle_sigint)
374 |
375 | asyncio.run(scrape_linkedin_connections())
376 |
377 | except Exception as e:
378 | print(f"\n❌ Error: {e}")
379 | traceback.print_exc()
380 |
381 | if __name__ == "__main__":
382 | main()
383 | ```
384 |
385 | </Tab>
386 | <Tab value="Linux on Docker">
387 |
388 | ```python
389 | # Same code as Cloud Sandbox, but change Computer initialization to:
390 | async with Computer(
391 | os_type="linux",
392 | provider_type=VMProviderType.DOCKER,
393 | image="trycua/cua-xfce:latest",
394 | verbosity=logging.INFO,
395 | ) as computer:
396 | ```
397 |
398 | And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.
399 |
400 | </Tab>
401 | <Tab value="macOS Sandbox">
402 |
403 | ```python
404 | # Same code as Cloud Sandbox, but change Computer initialization to:
405 | async with Computer(
406 | os_type="macos",
407 | provider_type=VMProviderType.LUME,
408 | name="macos-sequoia-cua:latest",
409 | verbosity=logging.INFO,
410 | ) as computer:
411 | ```
412 |
413 | And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.
414 |
415 | </Tab>
416 | <Tab value="Windows Sandbox">
417 |
418 | ```python
419 | # Same code as Cloud Sandbox, but change Computer initialization to:
420 | async with Computer(
421 | os_type="windows",
422 | provider_type=VMProviderType.WINDOWS_SANDBOX,
423 | verbosity=logging.INFO,
424 | ) as computer:
425 | ```
426 |
427 | And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.
428 |
429 | </Tab>
430 | </Tabs>
431 |
432 | </Step>
433 |
434 | <Step>
435 |
436 | ### Run Your Script
437 |
438 | Execute your contact extraction automation:
439 |
440 | ```bash
441 | python contact_export.py
442 | ```
443 |
444 | The agent will:
445 |
446 | 1. Navigate to your LinkedIn connections page
447 | 2. Extract data from 20 contacts (first name, last name, role, company, LinkedIn URL)
448 | 3. Save contacts to a timestamped CSV file
449 | 4. Generate messaging compose links for easy follow-up
450 |
451 | Monitor the output to see the agent's progress. The script will show a progress update every 5 contacts.
452 |
453 | </Step>
454 |
455 | </Steps>
456 |
457 | ---
458 |
459 | ## How It Works
460 |
461 | This script demonstrates a practical workflow for extracting LinkedIn connection data:
462 |
463 | 1. **Session Persistence** - Manually log into LinkedIn through the VM once, and the VM saves your session
464 | 2. **Navigation** - The script navigates to your connections page using your saved authenticated session
465 | 3. **Data Extraction** - For each contact, the agent clicks their profile, extracts data, and navigates back
466 | 4. **Python Processing** - Python parses responses, validates data, and writes to CSV incrementally
467 | 5. **Output Files** - Generates a CSV with contact data and a text file with messaging URLs
468 |
469 | ## Next Steps
470 |
471 | - Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
472 | - Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
473 | - Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/)
474 | - Adapt this script for other platforms (Twitter/X, email extraction, etc.)
475 | - Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help
476 |
```
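
The pattern worth lifting out of the cookbook above is its accumulated-history agent loop: every user task and every batch of agent output is appended to a single `history` list, so each later task can refer to what came before (for example, "find the previous contact in the list"). Below is a minimal sketch of just that loop, assuming the `cua-agent` and `cua-computer` packages from this repository and reusing the Docker provider values shown in the tabs; the task list is an illustrative placeholder.

```python
# Minimal sketch (not a file from this repo): the accumulated-history
# agent loop that the contact-export cookbook above is built around.
import asyncio
import logging

from agent import ComputerAgent  # cua-agent
from computer import Computer, VMProviderType  # cua-computer


async def run_tasks(tasks: list[str]) -> list[dict]:
    history: list[dict] = []  # one conversation shared across all tasks
    async with Computer(
        os_type="linux",
        provider_type=VMProviderType.DOCKER,
        image="trycua/cua-xfce:latest",
    ) as computer:
        agent = ComputerAgent(
            model="cua/anthropic/claude-sonnet-4.5",  # routed via the Cua VLM Router
            tools=[computer],
            verbosity=logging.INFO,
        )
        for task in tasks:
            # Later tasks see earlier output, which is how the cookbook's
            # agent can locate the previously extracted contact.
            history.append({"role": "user", "content": task})
            async for result in agent.run(history):
                history += result.get("output", [])
    return history


if __name__ == "__main__":
    asyncio.run(run_tasks(["Open a web browser", "Describe the visible page"]))
```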
--------------------------------------------------------------------------------
/libs/python/mcp-server/mcp_server/server.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import base64
3 | import inspect
4 | import logging
5 | import os
6 | import signal
7 | import sys
8 | import traceback
9 | import uuid
10 | from typing import Any, Dict, List, Optional, Tuple, Union
11 |
12 | import anyio
13 |
14 | # Configure logging to output to stderr for debug visibility
15 | logging.basicConfig(
16 | level=logging.DEBUG, # Changed to DEBUG
17 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
18 | stream=sys.stderr,
19 | )
20 | logger = logging.getLogger("mcp-server")
21 |
22 | # More visible startup message
23 | logger.debug("MCP Server module loading...")
24 |
25 | try:
26 | from mcp.server.fastmcp import Context, FastMCP
27 |
28 | # Use the canonical Image type
29 | from mcp.server.fastmcp.utilities.types import Image
30 |
31 | logger.debug("Successfully imported FastMCP")
32 | except ImportError as e:
33 | logger.error(f"Failed to import FastMCP: {e}")
34 | traceback.print_exc(file=sys.stderr)
35 | sys.exit(1)
36 |
37 | try:
38 | from agent import ComputerAgent
39 | from computer import Computer
40 |
41 | logger.debug("Successfully imported Computer and Agent modules")
42 | except ImportError as e:
43 | logger.error(f"Failed to import Computer/Agent modules: {e}")
44 | traceback.print_exc(file=sys.stderr)
45 | sys.exit(1)
46 |
47 | try:
48 | from .session_manager import (
49 | get_session_manager,
50 | initialize_session_manager,
51 | shutdown_session_manager,
52 | )
53 |
54 | logger.debug("Successfully imported session manager")
55 | except ImportError as e:
56 | logger.error(f"Failed to import session manager: {e}")
57 | traceback.print_exc(file=sys.stderr)
58 | sys.exit(1)
59 |
60 |
61 | def get_env_bool(key: str, default: bool = False) -> bool:
62 | """Get boolean value from environment variable."""
63 | return os.getenv(key, str(default)).lower() in ("true", "1", "yes")
64 |
65 |
66 | async def _maybe_call_ctx_method(ctx: Context, method_name: str, *args, **kwargs) -> None:
67 | """Call a context helper if it exists, awaiting the result when necessary."""
68 | method = getattr(ctx, method_name, None)
69 | if not callable(method):
70 | return
71 | result = method(*args, **kwargs)
72 | if inspect.isawaitable(result):
73 | await result
74 |
75 |
76 | def _normalise_message_content(content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
77 | """Normalise message content to a list of structured parts."""
78 | if isinstance(content, list):
79 | return content
80 | if content is None:
81 | return []
82 | return [{"type": "output_text", "text": str(content)}]
83 |
84 |
85 | def _extract_text_from_content(content: Union[str, List[Dict[str, Any]]]) -> str:
86 | """Extract textual content for inclusion in the aggregated result string."""
87 | if isinstance(content, str):
88 | return content
89 | texts: List[str] = []
90 | for part in content or []:
91 | if not isinstance(part, dict):
92 | continue
93 | if part.get("type") in {"output_text", "text"} and part.get("text"):
94 | texts.append(str(part["text"]))
95 | return "\n".join(texts)
96 |
97 |
98 | def _serialise_tool_content(content: Any) -> str:
99 | """Convert tool outputs into a string for aggregation."""
100 | if isinstance(content, str):
101 | return content
102 | if isinstance(content, list):
103 | texts: List[str] = []
104 | for part in content:
105 | if (
106 | isinstance(part, dict)
107 | and part.get("type") in {"output_text", "text"}
108 | and part.get("text")
109 | ):
110 | texts.append(str(part["text"]))
111 | if texts:
112 | return "\n".join(texts)
113 | if content is None:
114 | return ""
115 | return str(content)
116 |
117 |
118 | def serve() -> FastMCP:
119 | """Create and configure the MCP server."""
120 | # NOTE: Do not pass model_config here; FastMCP 2.12.x doesn't support it.
121 | server = FastMCP(name="cua-agent")
122 |
123 | @server.tool(structured_output=False)
124 | async def screenshot_cua(ctx: Context, session_id: Optional[str] = None) -> Any:
125 | """
126 |         Take a screenshot of the current macOS VM screen and return the image.
127 |
128 | Args:
129 | session_id: Optional session ID for multi-client support. If not provided, a new session will be created.
130 | """
131 | session_manager = get_session_manager()
132 |
133 | async with session_manager.get_session(session_id) as session:
134 | screenshot = await session.computer.interface.screenshot()
135 | # Returning Image object is fine when structured_output=False
136 | return Image(format="png", data=screenshot)
137 |
138 | @server.tool(structured_output=False)
139 | async def run_cua_task(ctx: Context, task: str, session_id: Optional[str] = None) -> Any:
140 | """
141 |         Run a Computer-Use Agent (CUA) task in a macOS VM and return (combined text, final screenshot).
142 |
143 | Args:
144 | task: The task description for the agent to execute
145 | session_id: Optional session ID for multi-client support. If not provided, a new session will be created.
146 | """
147 | session_manager = get_session_manager()
148 | task_id = str(uuid.uuid4())
149 |
150 | try:
151 | logger.info(f"Starting CUA task: {task} (task_id: {task_id})")
152 |
153 | async with session_manager.get_session(session_id) as session:
154 | # Register this task with the session
155 | await session_manager.register_task(session.session_id, task_id)
156 |
157 | try:
158 | # Get model name
159 | model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-sonnet-4-5-20250929")
160 | logger.info(f"Using model: {model_name}")
161 |
162 | # Create agent with the new v0.4.x API
163 | agent = ComputerAgent(
164 | model=model_name,
165 | only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
166 | verbosity=logging.INFO,
167 | tools=[session.computer],
168 | )
169 |
170 | messages = [{"role": "user", "content": task}]
171 |
172 | # Collect all results
173 | aggregated_messages: List[str] = []
174 | async for result in agent.run(messages):
175 | logger.info("Agent processing step")
176 |                 await ctx.info("Agent processing step")
177 |
178 | outputs = result.get("output", [])
179 | for output in outputs:
180 | output_type = output.get("type")
181 |
182 | if output_type == "message":
183 | logger.debug("Streaming assistant message: %s", output)
184 | content = _normalise_message_content(output.get("content"))
185 | aggregated_text = _extract_text_from_content(content)
186 | if aggregated_text:
187 | aggregated_messages.append(aggregated_text)
188 | await _maybe_call_ctx_method(
189 | ctx,
190 | "yield_message",
191 | role=output.get("role", "assistant"),
192 | content=content,
193 | )
194 |
195 | elif output_type in {"tool_use", "computer_call", "function_call"}:
196 | logger.debug("Streaming tool call: %s", output)
197 | call_id = output.get("id") or output.get("call_id")
198 | tool_name = output.get("name") or output.get("action", {}).get(
199 | "type"
200 | )
201 | tool_input = (
202 | output.get("input")
203 | or output.get("arguments")
204 | or output.get("action")
205 | )
206 | if call_id:
207 | await _maybe_call_ctx_method(
208 | ctx,
209 | "yield_tool_call",
210 | name=tool_name,
211 | call_id=call_id,
212 | input=tool_input,
213 | )
214 |
215 | elif output_type in {
216 | "tool_result",
217 | "computer_call_output",
218 | "function_call_output",
219 | }:
220 | logger.debug("Streaming tool output: %s", output)
221 | call_id = output.get("call_id") or output.get("id")
222 | content = output.get("content") or output.get("output")
223 | aggregated_text = _serialise_tool_content(content)
224 | if aggregated_text:
225 | aggregated_messages.append(aggregated_text)
226 | if call_id:
227 | await _maybe_call_ctx_method(
228 | ctx,
229 | "yield_tool_output",
230 | call_id=call_id,
231 | output=content,
232 | is_error=output.get("status") == "failed"
233 | or output.get("is_error", False),
234 | )
235 |
236 | logger.info("CUA task completed successfully")
237 |                     await ctx.info("CUA task completed successfully")
238 |
239 | screenshot_image = Image(
240 | format="png",
241 | data=await session.computer.interface.screenshot(),
242 | )
243 |
244 | return (
245 | "\n".join(aggregated_messages).strip()
246 | or "Task completed with no text output.",
247 | screenshot_image,
248 | )
249 |
250 | finally:
251 | # Unregister the task from the session
252 | await session_manager.unregister_task(session.session_id, task_id)
253 |
254 | except Exception as e:
255 | error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
256 | logger.error(error_msg)
257 |         await ctx.error(error_msg)
258 |
259 | # Try to get a screenshot from the session if available
260 | try:
261 | if session_id:
262 | async with session_manager.get_session(session_id) as session:
263 | screenshot = await session.computer.interface.screenshot()
264 | return (
265 | f"Error during task execution: {str(e)}",
266 | Image(format="png", data=screenshot),
267 | )
268 | except Exception:
269 | pass
270 |
271 | # If we can't get a screenshot, return a placeholder
272 | return (
273 | f"Error during task execution: {str(e)}",
274 | Image(format="png", data=b""),
275 | )
276 |
277 | @server.tool(structured_output=False)
278 | async def run_multi_cua_tasks(
279 | ctx: Context, tasks: List[str], session_id: Optional[str] = None, concurrent: bool = False
280 | ) -> Any:
281 | """
282 | Run multiple CUA tasks and return a list of (combined text, screenshot).
283 |
284 | Args:
285 | tasks: List of task descriptions to execute
286 | session_id: Optional session ID for multi-client support. If not provided, a new session will be created.
287 | concurrent: If True, run tasks concurrently. If False, run sequentially (default).
288 | """
289 | total_tasks = len(tasks)
290 | if total_tasks == 0:
291 |             await ctx.report_progress(1.0)
292 | return []
293 |
294 | session_manager = get_session_manager()
295 |
296 | if concurrent and total_tasks > 1:
297 | # Run tasks concurrently
298 | logger.info(f"Running {total_tasks} tasks concurrently")
299 |         await ctx.info(f"Running {total_tasks} tasks concurrently")
300 |
301 | # Create tasks with progress tracking
302 | async def run_task_with_progress(
303 | task_index: int, task: str
304 | ) -> Tuple[int, Tuple[str, Image]]:
305 |             await ctx.report_progress(task_index / total_tasks)
306 |             result = await run_cua_task(ctx, task, session_id)
307 |             await ctx.report_progress((task_index + 1) / total_tasks)
308 | return task_index, result
309 |
310 | # Create all task coroutines
311 | task_coroutines = [run_task_with_progress(i, task) for i, task in enumerate(tasks)]
312 |
313 | # Wait for all tasks to complete
314 | results_with_indices = await asyncio.gather(*task_coroutines, return_exceptions=True)
315 |
316 | # Sort results by original task order and handle exceptions
317 | results: List[Tuple[str, Image]] = []
318 | for result in results_with_indices:
319 | if isinstance(result, Exception):
320 | logger.error(f"Task failed with exception: {result}")
321 |                 await ctx.error(f"Task failed: {str(result)}")
322 | results.append((f"Task failed: {str(result)}", Image(format="png", data=b"")))
323 | else:
324 | _, task_result = result
325 | results.append(task_result)
326 |
327 | return results
328 | else:
329 | # Run tasks sequentially (original behavior)
330 | logger.info(f"Running {total_tasks} tasks sequentially")
331 |         await ctx.info(f"Running {total_tasks} tasks sequentially")
332 |
333 | results: List[Tuple[str, Image]] = []
334 | for i, task in enumerate(tasks):
335 | logger.info(f"Running task {i+1}/{total_tasks}: {task}")
336 |             await ctx.info(f"Running task {i+1}/{total_tasks}: {task}")
337 |
338 |             await ctx.report_progress(i / total_tasks)
339 |             task_result = await run_cua_task(ctx, task, session_id)
340 |             results.append(task_result)
341 |             await ctx.report_progress((i + 1) / total_tasks)
342 |
343 | return results
344 |
345 | @server.tool(structured_output=False)
346 | async def get_session_stats(ctx: Context) -> Dict[str, Any]:
347 | """
348 | Get statistics about active sessions and resource usage.
349 | """
350 | session_manager = get_session_manager()
351 | return session_manager.get_session_stats()
352 |
353 | @server.tool(structured_output=False)
354 | async def cleanup_session(ctx: Context, session_id: str) -> str:
355 | """
356 | Cleanup a specific session and release its resources.
357 |
358 | Args:
359 | session_id: The session ID to cleanup
360 | """
361 | session_manager = get_session_manager()
362 | await session_manager.cleanup_session(session_id)
363 | return f"Session {session_id} cleanup initiated"
364 |
365 | return server
366 |
367 |
368 | server = serve()
369 |
370 |
371 | async def run_server():
372 | """Run the MCP server with proper lifecycle management."""
373 | session_manager = None
374 | try:
375 | logger.debug("Starting MCP server...")
376 |
377 | # Initialize session manager
378 | session_manager = await initialize_session_manager()
379 | logger.info("Session manager initialized")
380 |
381 | # Set up signal handlers for graceful shutdown
382 | def signal_handler(signum, frame):
383 | logger.info(f"Received signal {signum}, initiating graceful shutdown...")
384 | # Create a task to shutdown gracefully
385 | asyncio.create_task(graceful_shutdown())
386 |
387 | signal.signal(signal.SIGINT, signal_handler)
388 | signal.signal(signal.SIGTERM, signal_handler)
389 |
390 | # Start the server
391 | logger.info("Starting FastMCP server...")
392 | # Use run_stdio_async directly instead of server.run() to avoid nested event loops
393 | await server.run_stdio_async()
394 |
395 | except Exception as e:
396 | logger.error(f"Error starting server: {e}")
397 | traceback.print_exc(file=sys.stderr)
398 | raise
399 | finally:
400 | # Ensure cleanup happens
401 | if session_manager:
402 | logger.info("Shutting down session manager...")
403 | await shutdown_session_manager()
404 |
405 |
406 | async def graceful_shutdown():
407 | """Gracefully shutdown the server and all sessions."""
408 | logger.info("Initiating graceful shutdown...")
409 | try:
410 | await shutdown_session_manager()
411 | logger.info("Graceful shutdown completed")
412 | except Exception as e:
413 | logger.error(f"Error during graceful shutdown: {e}")
414 | finally:
415 | # Exit the process
416 | import os
417 |
418 | os._exit(0)
419 |
420 |
421 | def main():
422 | """Run the MCP server with proper async lifecycle management."""
423 | try:
424 | # Use anyio.run instead of asyncio.run to avoid nested event loop issues
425 | anyio.run(run_server)
426 | except KeyboardInterrupt:
427 | logger.info("Server interrupted by user")
428 | except Exception as e:
429 | logger.error(f"Error starting server: {e}")
430 | traceback.print_exc(file=sys.stderr)
431 | sys.exit(1)
432 |
433 |
434 | if __name__ == "__main__":
435 | main()
436 |
```
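
Because `serve()` registers these tools on a FastMCP server that `run_server()` exposes over stdio via `run_stdio_async()`, any MCP client can drive them. Below is a minimal sketch using the stdio client from the official `mcp` Python SDK; the `python -m mcp_server` launch command is an assumption based on the package's `__main__.py` (the repo also ships `scripts/start_mcp_server.sh`), so adjust it to however you start the server.

```python
# Minimal sketch (not a file from this repo): invoking the run_cua_task tool
# over stdio with the official `mcp` Python SDK client.
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # Assumed launch command for the server defined above.
    params = StdioServerParameters(command="python", args=["-m", "mcp_server"])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "run_cua_task",
                {"task": "Open the calculator and compute 2 + 2"},
            )
            # run_cua_task returns the combined agent text plus a screenshot.
            for item in result.content:
                print(getattr(item, "text", f"<{item.type} content>"))


if __name__ == "__main__":
    asyncio.run(main())
```

The same pattern works for the other tools, e.g. `session.call_tool("run_multi_cua_tasks", {"tasks": [...], "concurrent": True})` or `session.call_tool("get_session_stats", {})`.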
--------------------------------------------------------------------------------
/libs/lume/src/Commands/Logs.swift:
--------------------------------------------------------------------------------
```swift
1 | import ArgumentParser
2 | import Foundation
3 |
4 | struct Logs: ParsableCommand {
5 | static let configuration = CommandConfiguration(
6 | abstract: "View lume serve logs",
7 | subcommands: [Info.self, Error.self, All.self],
8 | defaultSubcommand: All.self
9 | )
10 |
11 | // Common functionality for reading log files
12 | static func readLogFile(path: String, lines: Int? = nil, follow: Bool = false) -> String {
13 | let fileManager = FileManager.default
14 |
15 | // Check if file exists
16 | guard fileManager.fileExists(atPath: path) else {
17 | return "Log file not found at \(path)"
18 | }
19 |
20 | do {
21 | // Read file content
22 | let content = try String(contentsOfFile: path, encoding: .utf8)
23 |
24 | // If lines parameter is provided, return only the specified number of lines from the end
25 | if let lineCount = lines {
26 | let allLines = content.components(separatedBy: .newlines)
27 | let startIndex = max(0, allLines.count - lineCount)
28 | let lastLines = Array(allLines[startIndex...])
29 | return lastLines.joined(separator: "\n")
30 | }
31 |
32 | return content
33 | } catch {
34 | return "Error reading log file: \(error.localizedDescription)"
35 | }
36 | }
37 |
 38 |     // Method for tailing a log file (following new content as it is appended)
39 | static func tailLogFile(path: String, initialLines: Int? = 10) {
40 | let fileManager = FileManager.default
41 |
42 | // Check if file exists
43 | guard fileManager.fileExists(atPath: path) else {
44 | print("Log file not found at \(path)")
45 | return
46 | }
47 |
48 | do {
49 | // Get initial content with only the specified number of lines from the end
50 | var lastPosition: UInt64 = 0
51 | let fileHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: path))
52 |
53 | // First, print the last few lines of the file
54 | if let lines = initialLines {
55 | let content = try String(contentsOfFile: path, encoding: .utf8)
56 | let allLines = content.components(separatedBy: .newlines)
57 | let startIndex = max(0, allLines.count - lines)
58 | let lastLines = Array(allLines[startIndex...])
59 | print(lastLines.joined(separator: "\n"))
60 | }
61 |
62 | // Get current file size
63 | lastPosition = UInt64(try fileManager.attributesOfItem(atPath: path)[.size] as? UInt64 ?? 0)
64 |
65 | // Set up for continuous monitoring
66 | print("\nTailing log file... Press Ctrl+C to stop")
67 |
68 | // Monitor file for changes
69 | while true {
70 | // Brief pause to reduce CPU usage
71 | Thread.sleep(forTimeInterval: 0.5)
72 |
73 | // Get current size
74 | let currentSize = try fileManager.attributesOfItem(atPath: path)[.size] as? UInt64 ?? 0
75 |
76 | // If file has grown
77 | if currentSize > lastPosition {
78 | // Seek to where we last read
79 | fileHandle.seek(toFileOffset: lastPosition)
80 |
81 | // Read new content
82 | if let newData = try? fileHandle.readToEnd() {
83 | if let newContent = String(data: newData, encoding: .utf8) {
84 | // Print new content without trailing newline
85 | if newContent.hasSuffix("\n") {
86 | print(newContent, terminator: "")
87 | } else {
88 | print(newContent)
89 | }
90 | }
91 | }
92 |
93 | // Update position
94 | lastPosition = currentSize
95 | }
96 |
97 | // Handle file rotation (if file became smaller)
98 | else if currentSize < lastPosition {
99 | // File was probably rotated, start from beginning
100 | lastPosition = 0
101 | fileHandle.seek(toFileOffset: 0)
102 |
103 | if let newData = try? fileHandle.readToEnd() {
104 | if let newContent = String(data: newData, encoding: .utf8) {
105 | print(newContent, terminator: "")
106 | }
107 | }
108 |
109 | lastPosition = currentSize
110 | }
111 | }
112 | } catch {
113 | print("Error tailing log file: \(error.localizedDescription)")
114 | }
115 | }
116 |
117 | // MARK: - Info Logs Subcommand
118 |
119 | struct Info: ParsableCommand {
120 | static let configuration = CommandConfiguration(
121 | commandName: "info",
122 | abstract: "View info logs from the daemon"
123 | )
124 |
125 | @Option(name: .shortAndLong, help: "Number of lines to display from the end of the file")
126 | var lines: Int?
127 |
128 | @Flag(name: .shortAndLong, help: "Follow log file continuously (like tail -f)")
129 | var follow: Bool = false
130 |
131 | func run() throws {
132 | let logPath = "/tmp/lume_daemon.log"
133 |
134 | print("=== Info Logs ===")
135 |
136 | if follow {
137 | // Use tailing functionality to continuously monitor the log
138 | Logs.tailLogFile(path: logPath, initialLines: lines ?? 10)
139 | } else {
140 | // Regular one-time viewing of logs
141 | let content = Logs.readLogFile(path: logPath, lines: lines)
142 | print(content)
143 | }
144 | }
145 | }
146 |
147 | // MARK: - Error Logs Subcommand
148 |
149 | struct Error: ParsableCommand {
150 | static let configuration = CommandConfiguration(
151 | commandName: "error",
152 | abstract: "View error logs from the daemon"
153 | )
154 |
155 | @Option(name: .shortAndLong, help: "Number of lines to display from the end of the file")
156 | var lines: Int?
157 |
158 | @Flag(name: .shortAndLong, help: "Follow log file continuously (like tail -f)")
159 | var follow: Bool = false
160 |
161 | func run() throws {
162 | let logPath = "/tmp/lume_daemon.error.log"
163 |
164 | print("=== Error Logs ===")
165 |
166 | if follow {
167 | // Use tailing functionality to continuously monitor the log
168 | Logs.tailLogFile(path: logPath, initialLines: lines ?? 10)
169 | } else {
170 | // Regular one-time viewing of logs
171 | let content = Logs.readLogFile(path: logPath, lines: lines)
172 | print(content)
173 | }
174 | }
175 | }
176 |
177 | // MARK: - All Logs Subcommand
178 |
179 | struct All: ParsableCommand {
180 | static let configuration = CommandConfiguration(
181 | commandName: "all",
182 | abstract: "View both info and error logs from the daemon"
183 | )
184 |
185 | @Option(name: .shortAndLong, help: "Number of lines to display from the end of each file")
186 | var lines: Int?
187 |
188 | @Flag(name: .shortAndLong, help: "Follow log files continuously (like tail -f)")
189 | var follow: Bool = false
190 |
191 | // Custom implementation to tail both logs simultaneously
192 | private func tailBothLogs(infoPath: String, errorPath: String, initialLines: Int? = 10) {
193 | let fileManager = FileManager.default
194 | var infoExists = fileManager.fileExists(atPath: infoPath)
195 | var errorExists = fileManager.fileExists(atPath: errorPath)
196 |
197 | if !infoExists && !errorExists {
198 | print("Neither info nor error log files found")
199 | return
200 | }
201 |
202 | // Print initial content
203 | print("=== Info Logs ===")
204 | if infoExists {
205 | if let lines = initialLines {
206 | let content = (try? String(contentsOfFile: infoPath, encoding: .utf8)) ?? ""
207 | let allLines = content.components(separatedBy: .newlines)
208 | let startIndex = max(0, allLines.count - lines)
209 | let lastLines = Array(allLines[startIndex...])
210 | print(lastLines.joined(separator: "\n"))
211 | }
212 | } else {
213 | print("Info log file not found")
214 | }
215 |
216 | print("\n=== Error Logs ===")
217 | if errorExists {
218 | if let lines = initialLines {
219 | let content = (try? String(contentsOfFile: errorPath, encoding: .utf8)) ?? ""
220 | let allLines = content.components(separatedBy: .newlines)
221 | let startIndex = max(0, allLines.count - lines)
222 | let lastLines = Array(allLines[startIndex...])
223 | print(lastLines.joined(separator: "\n"))
224 | }
225 | } else {
226 | print("Error log file not found")
227 | }
228 |
229 | print("\nTailing both log files... Press Ctrl+C to stop")
230 |
231 | // Initialize file handles and positions
232 | var infoHandle: FileHandle? = nil
233 | var errorHandle: FileHandle? = nil
234 | var infoPosition: UInt64 = 0
235 | var errorPosition: UInt64 = 0
236 |
237 | // Set up file handles
238 | if infoExists {
239 | do {
240 | infoHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: infoPath))
241 | infoPosition = UInt64(try fileManager.attributesOfItem(atPath: infoPath)[.size] as? UInt64 ?? 0)
242 | } catch {
243 | print("Error opening info log file: \(error.localizedDescription)")
244 | }
245 | }
246 |
247 | if errorExists {
248 | do {
249 | errorHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: errorPath))
250 | errorPosition = UInt64(try fileManager.attributesOfItem(atPath: errorPath)[.size] as? UInt64 ?? 0)
251 | } catch {
252 | print("Error opening error log file: \(error.localizedDescription)")
253 | }
254 | }
255 |
256 | // Monitor both files for changes
257 | while true {
258 | Thread.sleep(forTimeInterval: 0.5)
259 |
260 | // Check for new content in info log
261 | if let handle = infoHandle {
262 | do {
263 | // Re-check existence in case file was deleted
264 | infoExists = fileManager.fileExists(atPath: infoPath)
265 | if !infoExists {
266 | print("\n[Info log file was removed]")
267 | infoHandle = nil
268 | continue
269 | }
270 |
271 | let currentSize = try fileManager.attributesOfItem(atPath: infoPath)[.size] as? UInt64 ?? 0
272 |
273 | if currentSize > infoPosition {
274 | handle.seek(toFileOffset: infoPosition)
275 | if let newData = try? handle.readToEnd() {
276 | if let newContent = String(data: newData, encoding: .utf8) {
277 | print("\n--- New Info Log Content ---")
278 | if newContent.hasSuffix("\n") {
279 | print(newContent, terminator: "")
280 | } else {
281 | print(newContent)
282 | }
283 | }
284 | }
285 | infoPosition = currentSize
286 | } else if currentSize < infoPosition {
287 | // File was rotated
288 | print("\n[Info log was rotated]")
289 | infoPosition = 0
290 | handle.seek(toFileOffset: 0)
291 | if let newData = try? handle.readToEnd() {
292 | if let newContent = String(data: newData, encoding: .utf8) {
293 | print("\n--- New Info Log Content ---")
294 | print(newContent, terminator: "")
295 | }
296 | }
297 | infoPosition = currentSize
298 | }
299 | } catch {
300 | print("\nError reading info log: \(error.localizedDescription)")
301 | }
302 | } else if fileManager.fileExists(atPath: infoPath) && !infoExists {
303 | // File exists again after being deleted
304 | do {
305 | infoHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: infoPath))
306 | infoPosition = 0
307 | infoExists = true
308 | print("\n[Info log file reappeared]")
309 | } catch {
310 | print("\nError reopening info log: \(error.localizedDescription)")
311 | }
312 | }
313 |
314 | // Check for new content in error log
315 | if let handle = errorHandle {
316 | do {
317 | // Re-check existence in case file was deleted
318 | errorExists = fileManager.fileExists(atPath: errorPath)
319 | if !errorExists {
320 | print("\n[Error log file was removed]")
321 | errorHandle = nil
322 | continue
323 | }
324 |
325 | let currentSize = try fileManager.attributesOfItem(atPath: errorPath)[.size] as? UInt64 ?? 0
326 |
327 | if currentSize > errorPosition {
328 | handle.seek(toFileOffset: errorPosition)
329 | if let newData = try? handle.readToEnd() {
330 | if let newContent = String(data: newData, encoding: .utf8) {
331 | print("\n--- New Error Log Content ---")
332 | if newContent.hasSuffix("\n") {
333 | print(newContent, terminator: "")
334 | } else {
335 | print(newContent)
336 | }
337 | }
338 | }
339 | errorPosition = currentSize
340 | } else if currentSize < errorPosition {
341 | // File was rotated
342 | print("\n[Error log was rotated]")
343 | errorPosition = 0
344 | handle.seek(toFileOffset: 0)
345 | if let newData = try? handle.readToEnd() {
346 | if let newContent = String(data: newData, encoding: .utf8) {
347 | print("\n--- New Error Log Content ---")
348 | print(newContent, terminator: "")
349 | }
350 | }
351 | errorPosition = currentSize
352 | }
353 | } catch {
354 | print("\nError reading error log: \(error.localizedDescription)")
355 | }
356 | } else if fileManager.fileExists(atPath: errorPath) && !errorExists {
357 | // File exists again after being deleted
358 | do {
359 | errorHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: errorPath))
360 | errorPosition = 0
361 | errorExists = true
362 | print("\n[Error log file reappeared]")
363 | } catch {
364 | print("\nError reopening error log: \(error.localizedDescription)")
365 | }
366 | }
367 | }
368 | }
369 |
370 | func run() throws {
371 | let infoLogPath = "/tmp/lume_daemon.log"
372 | let errorLogPath = "/tmp/lume_daemon.error.log"
373 |
374 | if follow {
375 | // Use custom tailing implementation for both logs
376 | tailBothLogs(infoPath: infoLogPath, errorPath: errorLogPath, initialLines: lines ?? 10)
377 | } else {
378 | // Regular one-time viewing of logs
379 | let infoContent = Logs.readLogFile(path: infoLogPath, lines: lines)
380 | let errorContent = Logs.readLogFile(path: errorLogPath, lines: lines)
381 |
382 | print("=== Info Logs ===")
383 | print(infoContent)
384 | print("\n=== Error Logs ===")
385 | print(errorContent)
386 | }
387 | }
388 | }
389 | }
390 |
```
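
The tailing logic above is a simple poll-and-seek loop: sleep briefly, compare the file's current size with the last read offset, read and print any new bytes, and treat a shrinking file as an in-place rotation. For reference, here is a minimal Python sketch of the same strategy (it omits the existence re-checks the Swift code performs and, like the Swift version, assumes the log is truncated in place rather than replaced):

```python
import os
import time


def tail_file(path: str, interval: float = 0.5) -> None:
    """Follow a log file by polling its size, mirroring the Swift loop above."""
    position = os.path.getsize(path)  # start at the current end of the file
    with open(path, "rb") as f:
        while True:
            time.sleep(interval)  # brief pause to reduce CPU usage
            size = os.path.getsize(path)
            if size < position:
                position = 0  # file shrank: assume rotation, reread from the top
            if size > position:
                f.seek(position)
                print(f.read(size - position).decode("utf-8", "replace"), end="")
                position = size


if __name__ == "__main__":
    tail_file("/tmp/lume_daemon.log")
```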
--------------------------------------------------------------------------------
/examples/som_examples.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Example script demonstrating the usage of OmniParser's UI element detection functionality.
4 | This script shows how to:
5 | 1. Initialize the OmniParser
6 | 2. Load and process images
7 | 3. Visualize detection results
8 | 4. Compare performance between CPU and MPS (Apple Silicon)
9 | """
10 |
11 | import argparse
12 | import base64
13 | import glob
14 | import io
15 | import logging
16 | import os
17 | import sys
18 | import time
19 | from pathlib import Path
20 | from typing import Any, Dict, List, Optional
21 |
22 | import numpy as np
23 | from PIL import Image
24 |
25 | # Load environment variables from .env file
26 | project_root = Path(__file__).parent.parent
27 | env_file = project_root / ".env"
28 | print(f"Loading environment from: {env_file}")
29 | from dotenv import load_dotenv
30 |
31 | load_dotenv(env_file)
32 |
33 | # Add paths to sys.path if needed
34 | pythonpath = os.environ.get("PYTHONPATH", "")
35 | for path in pythonpath.split(":"):
36 | if path and path not in sys.path:
37 | sys.path.append(path)
38 | print(f"Added to sys.path: {path}")
39 |
40 | # Add the libs directory to the path to find som
41 | libs_path = project_root / "libs"
42 | if str(libs_path) not in sys.path:
43 | sys.path.append(str(libs_path))
44 | print(f"Added to sys.path: {libs_path}")
45 |
46 | from som import IconElement, OmniParser, ParseResult, TextElement
47 | from som.models import BoundingBox, ParserMetadata, UIElement
48 |
49 | # Configure logging
50 | logging.basicConfig(
51 | level=logging.INFO,
52 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
53 | datefmt="%Y-%m-%d %H:%M:%S",
54 | )
55 | logger = logging.getLogger(__name__)
56 |
57 |
58 | def setup_logging():
59 | """Configure logging with a nice format."""
60 | logging.basicConfig(
61 | level=logging.INFO,
62 | format="%(asctime)s - %(levelname)s - %(message)s",
63 | datefmt="%Y-%m-%d %H:%M:%S",
64 | )
65 |
66 |
67 | class Timer:
68 | """Enhanced context manager for timing code blocks."""
69 |
70 | def __init__(self, name: str, logger):
71 | self.name = name
72 | self.logger = logger
73 | self.start_time: float = 0.0
74 | self.elapsed_time: float = 0.0
75 |
76 | def __enter__(self):
77 | self.start_time = time.time()
78 | return self
79 |
80 | def __exit__(self, *args):
81 | self.elapsed_time = time.time() - self.start_time
82 | self.logger.info(f"{self.name}: {self.elapsed_time:.3f}s")
83 | return False
84 |
85 |
86 | def image_to_bytes(image: Image.Image) -> bytes:
87 | """Convert PIL Image to PNG bytes."""
88 | buf = io.BytesIO()
89 | image.save(buf, format="PNG")
90 | return buf.getvalue()
91 |
92 |
93 | def process_image(
94 | parser: OmniParser, image_path: str, output_dir: Path, use_ocr: bool = False
95 | ) -> None:
96 | """Process a single image and save the result."""
97 | try:
98 | # Load image
99 | logger.info(f"Processing image: {image_path}")
100 | image = Image.open(image_path).convert("RGB")
101 | logger.info(f"Image loaded successfully, size: {image.size}")
102 |
103 | # Create output filename
104 | input_filename = Path(image_path).stem
105 | output_path = output_dir / f"{input_filename}_analyzed.png"
106 |
107 | # Convert image to PNG bytes
108 | image_bytes = image_to_bytes(image)
109 |
110 | # Process image
111 | with Timer(f"Processing {input_filename}", logger):
112 | result = parser.parse(image_bytes, use_ocr=use_ocr)
113 | logger.info(
114 | f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements"
115 | )
116 |
117 | # Save the annotated image
118 | logger.info(f"Saving annotated image to: {output_path}")
119 | try:
120 | # Save image from base64
121 | img_data = base64.b64decode(result.annotated_image_base64)
122 | img = Image.open(io.BytesIO(img_data))
123 | img.save(output_path)
124 |
125 | # Print detailed results
126 | logger.info("\nDetected Elements:")
127 | for elem in result.elements:
128 | if isinstance(elem, IconElement):
129 | logger.info(
130 | f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
131 | )
132 | elif isinstance(elem, TextElement):
133 | logger.info(
134 | f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
135 | )
136 |
137 | # Verify file exists and log size
138 | if output_path.exists():
139 | logger.info(
140 | f"Successfully saved image. File size: {output_path.stat().st_size} bytes"
141 | )
142 | else:
143 | logger.error(f"Failed to verify file at {output_path}")
144 | except Exception as e:
145 | logger.error(f"Error saving image: {str(e)}", exc_info=True)
146 |
147 | except Exception as e:
148 | logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True)
149 |
150 |
151 | def run_detection_benchmark(
152 | input_path: str,
153 | output_dir: Path,
154 | use_ocr: bool = False,
155 | box_threshold: float = 0.01,
156 | iou_threshold: float = 0.1,
157 | ):
158 | """Run detection benchmark on images."""
159 | logger.info(
160 | f"Starting benchmark with OCR enabled: {use_ocr}, box_threshold: {box_threshold}, iou_threshold: {iou_threshold}"
161 | )
162 |
163 | try:
164 | # Initialize parser
165 | logger.info("Initializing OmniParser...")
166 | parser = OmniParser()
167 |
168 | # Create output directory
169 | output_dir.mkdir(parents=True, exist_ok=True)
170 | logger.info(f"Output directory created at: {output_dir}")
171 |
172 | # Get list of PNG files
173 | if os.path.isdir(input_path):
174 | image_files = glob.glob(os.path.join(input_path, "*.png"))
175 | else:
176 | image_files = [input_path]
177 |
178 | logger.info(f"Found {len(image_files)} images to process")
179 |
180 | # Process each image with specified thresholds
181 | for image_path in image_files:
182 | try:
183 | # Load image
184 | logger.info(f"Processing image: {image_path}")
185 | image = Image.open(image_path).convert("RGB")
186 | logger.info(f"Image loaded successfully, size: {image.size}")
187 |
188 | # Create output filename
189 | input_filename = Path(image_path).stem
190 | output_path = output_dir / f"{input_filename}_analyzed.png"
191 |
192 | # Convert image to PNG bytes
193 | image_bytes = image_to_bytes(image)
194 |
195 | # Process image with specified thresholds
196 | with Timer(f"Processing {input_filename}", logger):
197 | result = parser.parse(
198 | image_bytes,
199 | use_ocr=use_ocr,
200 | box_threshold=box_threshold,
201 | iou_threshold=iou_threshold,
202 | )
203 | logger.info(
204 | f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements"
205 | )
206 |
207 | # Save the annotated image
208 | logger.info(f"Saving annotated image to: {output_path}")
209 | try:
210 | # Save image from base64
211 | img_data = base64.b64decode(result.annotated_image_base64)
212 | img = Image.open(io.BytesIO(img_data))
213 | img.save(output_path)
214 |
215 | # Print detailed results
216 | logger.info("\nDetected Elements:")
217 | for elem in result.elements:
218 | if isinstance(elem, IconElement):
219 | logger.info(
220 | f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
221 | )
222 | elif isinstance(elem, TextElement):
223 | logger.info(
224 | f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
225 | )
226 |
227 | # Verify file exists and log size
228 | if output_path.exists():
229 | logger.info(
230 | f"Successfully saved image. File size: {output_path.stat().st_size} bytes"
231 | )
232 | else:
233 | logger.error(f"Failed to verify file at {output_path}")
234 | except Exception as e:
235 | logger.error(f"Error saving image: {str(e)}", exc_info=True)
236 |
237 | except Exception as e:
238 | logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True)
239 |
240 | except Exception as e:
241 | logger.error(f"Benchmark failed: {str(e)}", exc_info=True)
242 | raise
243 |
244 |
245 | def run_experiments(input_path: str, output_dir: Path, use_ocr: bool = False):
246 | """Run experiments with different threshold combinations."""
247 | # Define threshold values to test
248 | box_thresholds = [0.01, 0.05, 0.1, 0.3]
249 | iou_thresholds = [0.05, 0.1, 0.2, 0.5]
250 |
251 | logger.info("Starting threshold experiments...")
252 | logger.info("Box thresholds to test: %s", box_thresholds)
253 | logger.info("IOU thresholds to test: %s", iou_thresholds)
254 |
255 | # Create results directory for this experiment
256 | timestamp = time.strftime("%Y%m%d-%H%M%S")
257 | ocr_suffix = "_ocr" if use_ocr else "_no_ocr"
258 | exp_dir = output_dir / f"experiment_{timestamp}{ocr_suffix}"
259 | exp_dir.mkdir(parents=True, exist_ok=True)
260 |
261 | # Create a summary file
262 | summary_file = exp_dir / "results_summary.txt"
263 | with open(summary_file, "w") as f:
264 | f.write("Threshold Experiments Results\n")
265 | f.write("==========================\n\n")
266 | f.write(f"Input: {input_path}\n")
267 | f.write(f"OCR Enabled: {use_ocr}\n")
268 | f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
269 | f.write("Results:\n")
270 | f.write("-" * 80 + "\n")
271 | f.write(
272 | f"{'Box Thresh':^10} | {'IOU Thresh':^10} | {'Num Icons':^10} | {'Num Text':^10} | {'Time (s)':^10}\n"
273 | )
274 | f.write("-" * 80 + "\n")
275 |
276 | # Initialize parser once for all experiments
277 | parser = OmniParser()
278 |
279 | # Run experiments with each combination
280 | for box_thresh in box_thresholds:
281 | for iou_thresh in iou_thresholds:
282 | logger.info(f"\nTesting box_threshold={box_thresh}, iou_threshold={iou_thresh}")
283 |
284 | # Create directory for this combination
285 | combo_dir = exp_dir / f"box_{box_thresh}_iou_{iou_thresh}"
286 | combo_dir.mkdir(exist_ok=True)
287 |
288 | try:
289 | # Process each image
290 | if os.path.isdir(input_path):
291 | image_files = glob.glob(os.path.join(input_path, "*.png"))
292 | else:
293 | image_files = [input_path]
294 |
295 | total_icons = 0
296 | total_text = 0
297 | total_time = 0
298 |
299 | for image_path in image_files:
300 | # Load and process image
301 | image = Image.open(image_path).convert("RGB")
302 | image_bytes = image_to_bytes(image)
303 |
304 | # Process with current thresholds
305 | with Timer(f"Processing {Path(image_path).stem}", logger) as t:
306 | result = parser.parse(
307 | image_bytes,
308 | use_ocr=use_ocr,
309 | box_threshold=box_thresh,
310 | iou_threshold=iou_thresh,
311 | )
312 |
313 | # Save annotated image
314 | output_path = combo_dir / f"{Path(image_path).stem}_analyzed.png"
315 | img_data = base64.b64decode(result.annotated_image_base64)
316 | img = Image.open(io.BytesIO(img_data))
317 | img.save(output_path)
318 |
319 | # Update totals
320 | total_icons += result.metadata.num_icons
321 | total_text += result.metadata.num_text
322 |
323 | # Log detailed results
324 | detail_file = combo_dir / f"{Path(image_path).stem}_details.txt"
325 | with open(detail_file, "w") as detail_f:
326 | detail_f.write(f"Results for {Path(image_path).name}\n")
327 | detail_f.write("-" * 40 + "\n")
328 | detail_f.write(f"Number of icons: {result.metadata.num_icons}\n")
329 | detail_f.write(
330 | f"Number of text elements: {result.metadata.num_text}\n\n"
331 | )
332 |
333 | detail_f.write("Icon Detections:\n")
334 | icon_count = 1
335 | text_count = (
336 | result.metadata.num_icons + 1
337 | ) # Text boxes start after icons
338 |
339 | # First list all icons
340 | for elem in result.elements:
341 | if isinstance(elem, IconElement):
342 | detail_f.write(f"Box #{icon_count}: Icon\n")
343 | detail_f.write(f" - Confidence: {elem.confidence:.3f}\n")
344 | detail_f.write(
345 | f" - Coordinates: {elem.bbox.coordinates}\n"
346 | )
347 | icon_count += 1
348 |
349 | if use_ocr:
350 | detail_f.write("\nText Detections:\n")
351 | for elem in result.elements:
352 | if isinstance(elem, TextElement):
353 | detail_f.write(f"Box #{text_count}: Text\n")
354 | detail_f.write(f" - Content: '{elem.content}'\n")
355 | detail_f.write(
356 | f" - Confidence: {elem.confidence:.3f}\n"
357 | )
358 | detail_f.write(
359 | f" - Coordinates: {elem.bbox.coordinates}\n"
360 | )
361 | text_count += 1
362 |
363 | # Update timing totals
364 | total_time += t.elapsed_time
365 |
366 | # Write summary for this combination
367 |                     avg_time = total_time / max(len(image_files), 1)  # guard against an empty image list
368 | f.write(
369 | f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {total_icons:^10d} | {total_text:^10d} | {avg_time:^10.3f}\n"
370 | )
371 |
372 | except Exception as e:
373 | logger.error(
374 | f"Error in experiment box={box_thresh}, iou={iou_thresh}: {str(e)}"
375 | )
376 | f.write(
377 | f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {'ERROR':^10s} | {'ERROR':^10s} | {'ERROR':^10s}\n"
378 | )
379 |
380 | # Write summary footer
381 | f.write("-" * 80 + "\n")
382 | f.write("\nExperiment completed successfully!\n")
383 |
384 | logger.info(f"\nExperiment results saved to {exp_dir}")
385 | logger.info(f"Summary file: {summary_file}")
386 |
387 |
388 | def main():
389 | """Main entry point."""
390 | parser = argparse.ArgumentParser(description="Run OmniParser benchmark")
391 | parser.add_argument("input_path", help="Path to input image or directory containing images")
392 | parser.add_argument(
393 | "--output-dir", default="examples/output", help="Output directory for annotated images"
394 | )
395 | parser.add_argument(
396 | "--ocr",
397 | choices=["none", "easyocr"],
398 | default="none",
399 | help="OCR engine to use (default: none)",
400 | )
401 | parser.add_argument(
402 | "--mode",
403 | choices=["single", "experiment"],
404 | default="single",
405 | help="Run mode: single run or threshold experiments (default: single)",
406 | )
407 | parser.add_argument(
408 | "--box-threshold",
409 | type=float,
410 | default=0.01,
411 | help="Confidence threshold for detection (default: 0.01)",
412 | )
413 | parser.add_argument(
414 | "--iou-threshold",
415 | type=float,
416 | default=0.1,
417 | help="IOU threshold for Non-Maximum Suppression (default: 0.1)",
418 | )
419 | args = parser.parse_args()
420 |
421 | logger.info(f"Starting OmniParser with arguments: {args}")
422 | use_ocr = args.ocr != "none"
423 | output_dir = Path(args.output_dir)
424 |
425 | try:
426 | if args.mode == "experiment":
427 | run_experiments(args.input_path, output_dir, use_ocr)
428 | else:
429 | run_detection_benchmark(
430 | args.input_path, output_dir, use_ocr, args.box_threshold, args.iou_threshold
431 | )
432 | except Exception as e:
433 | logger.error(f"Process failed: {str(e)}", exc_info=True)
434 | return 1
435 |
436 | return 0
437 |
438 |
439 | if __name__ == "__main__":
440 | sys.exit(main())
441 |
```
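
The script above is normally driven through its argparse CLI, but the two entry points can also be called directly. A minimal sketch, assuming the repository's `examples/` directory is importable and the input paths exist (both are illustrative):

```python
import sys
from pathlib import Path

# Make the examples/ directory importable (adjust to your checkout layout).
sys.path.append(str(Path("examples").resolve()))

from som_examples import run_detection_benchmark, run_experiments

# Single pass with explicit thresholds; equivalent to:
#   python examples/som_examples.py screenshots/ --ocr easyocr --mode single
run_detection_benchmark(
    input_path="screenshots",  # illustrative: a .png file or a directory of them
    output_dir=Path("examples/output"),
    use_ocr=True,
    box_threshold=0.01,
    iou_threshold=0.1,
)

# Threshold sweep that writes results_summary.txt; equivalent to:
#   python examples/som_examples.py screenshots/ --mode experiment
run_experiments("screenshots", Path("examples/output"), use_ocr=False)
```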
--------------------------------------------------------------------------------
/libs/python/som/som/detect.py:
--------------------------------------------------------------------------------
```python
1 | import argparse
2 | import base64
3 | import io
4 | import logging
5 | import signal
6 | import time
7 | from contextlib import contextmanager
8 | from pathlib import Path
9 | from typing import Any, Dict, List, Optional, Tuple, Union, cast
10 |
11 | import cv2
12 | import numpy as np
13 | import supervision as sv
14 | import torch
15 | import torchvision.ops
16 | import torchvision.transforms as T
17 | from huggingface_hub import hf_hub_download
18 | from PIL import Image
19 | from supervision.detection.core import Detections
20 | from ultralytics import YOLO
21 |
22 | from .detection import DetectionProcessor
23 | from .models import (
24 | BoundingBox,
25 | IconElement,
26 | ParseResult,
27 | ParserMetadata,
28 | TextElement,
29 | UIElement,
30 | )
31 | from .ocr import OCRProcessor
32 | from .visualization import BoxAnnotator
33 |
34 | logger = logging.getLogger(__name__)
35 |
36 |
37 | class TimeoutException(Exception):
38 | pass
39 |
40 |
41 | @contextmanager
42 | def timeout(seconds: int):
43 | def timeout_handler(signum, frame):
44 | raise TimeoutException("OCR process timed out")
45 |
46 |     # Register the signal handler (SIGALRM is Unix-only and must be set from the main thread)
47 | original_handler = signal.signal(signal.SIGALRM, timeout_handler)
48 | signal.alarm(seconds)
49 |
50 | try:
51 | yield
52 | finally:
53 | signal.alarm(0)
54 | signal.signal(signal.SIGALRM, original_handler)
55 |
56 |
57 | def process_text_box(box, image):
58 | """Process a single text box with OCR."""
59 | try:
60 | from typing import Any, List, Sequence, Tuple
61 |
62 | import easyocr
63 |
64 | x1 = int(min(point[0] for point in box))
65 | y1 = int(min(point[1] for point in box))
66 | x2 = int(max(point[0] for point in box))
67 | y2 = int(max(point[1] for point in box))
68 |
69 | # Add padding
70 | pad = 2
71 | x1 = max(0, x1 - pad)
72 | y1 = max(0, y1 - pad)
73 | x2 = min(image.shape[1], x2 + pad)
74 | y2 = min(image.shape[0], y2 + pad)
75 |
76 | region = image[y1:y2, x1:x2]
77 | if region.size > 0:
78 | reader = easyocr.Reader(["en"])
79 | results = reader.readtext(region)
80 | if results and len(results) > 0:
81 | # EasyOCR returns a list of tuples (bbox, text, confidence)
82 | first_result = results[0]
83 | if isinstance(first_result, (list, tuple)) and len(first_result) >= 3:
84 | text = str(first_result[1])
85 | confidence = float(first_result[2])
86 | if confidence > 0.5:
87 | return text, [x1, y1, x2, y2], confidence
88 | except Exception:
89 | pass
90 | return None
91 |
92 |
93 | def check_ocr_box(image_path: Union[str, Path]) -> Tuple[List[str], List[List[float]]]:
94 | """Check OCR box using EasyOCR."""
95 | # Read image once
96 | if isinstance(image_path, str):
97 | image_path = Path(image_path)
98 |
99 | # Read image into memory
100 | image_cv = cv2.imread(str(image_path))
101 | if image_cv is None:
102 | logger.error(f"Failed to read image: {image_path}")
103 | return [], []
104 |
105 | # Get image dimensions
106 | img_height, img_width = image_cv.shape[:2]
107 | confidence_threshold = 0.5
108 |
109 | # Use EasyOCR
110 | import ssl
111 |
112 | import easyocr
113 |
114 | # Create unverified SSL context for development
115 | ssl._create_default_https_context = ssl._create_unverified_context
116 | try:
117 | reader = easyocr.Reader(["en"])
118 | with timeout(5): # 5 second timeout for EasyOCR
119 | results = reader.readtext(image_cv, paragraph=False, text_threshold=0.5)
120 | except TimeoutException:
121 | logger.warning("EasyOCR timed out, returning no results")
122 | return [], []
123 | except Exception as e:
124 | logger.warning(f"EasyOCR failed: {str(e)}")
125 | return [], []
126 | finally:
127 | # Restore default SSL context
128 | ssl._create_default_https_context = ssl.create_default_context
129 |
130 | texts = []
131 | boxes = []
132 |
133 | for box, text, conf in results:
134 | # Convert box format to [x1, y1, x2, y2]
135 | x1 = min(point[0] for point in box)
136 | y1 = min(point[1] for point in box)
137 | x2 = max(point[0] for point in box)
138 | y2 = max(point[1] for point in box)
139 |
140 |         if float(conf) > confidence_threshold:  # only keep higher-confidence detections
141 | texts.append(text)
142 | boxes.append([x1, y1, x2, y2])
143 |
144 | return texts, boxes
145 |
146 |
147 | class OmniParser:
148 | """Enhanced UI parser using computer vision and OCR for detecting interactive elements."""
149 |
150 | def __init__(
151 | self,
152 | model_path: Optional[Union[str, Path]] = None,
153 | cache_dir: Optional[Union[str, Path]] = None,
154 | force_device: Optional[str] = None,
155 | ):
156 | """Initialize the OmniParser.
157 |
158 | Args:
159 | model_path: Optional path to the YOLO model
160 | cache_dir: Optional directory to cache model files
161 | force_device: Force specific device (cpu/cuda/mps)
162 | """
163 | self.detector = DetectionProcessor(
164 | model_path=Path(model_path) if model_path else None,
165 | cache_dir=Path(cache_dir) if cache_dir else None,
166 | force_device=force_device,
167 | )
168 | self.ocr = OCRProcessor()
169 | self.visualizer = BoxAnnotator()
170 |
171 | def process_image(
172 | self,
173 | image: Image.Image,
174 | box_threshold: float = 0.3,
175 | iou_threshold: float = 0.1,
176 | use_ocr: bool = True,
177 | ) -> Tuple[Image.Image, List[UIElement]]:
178 | """Process an image to detect UI elements and optionally text.
179 |
180 | Args:
181 | image: Input PIL Image
182 | box_threshold: Confidence threshold for detection
183 | iou_threshold: IOU threshold for NMS
184 | use_ocr: Whether to enable OCR processing
185 |
186 | Returns:
187 | Tuple of (annotated image, list of detections)
188 | """
189 | try:
190 | logger.info("Starting UI element detection...")
191 |
192 | # Detect icons
193 | icon_detections = self.detector.detect_icons(
194 | image=image, box_threshold=box_threshold, iou_threshold=iou_threshold
195 | )
196 | logger.info(f"Found {len(icon_detections)} interactive elements")
197 |
198 | # Convert icon detections to typed objects
199 | elements: List[UIElement] = cast(
200 | List[UIElement],
201 | [
202 | IconElement(
203 | id=i + 1,
204 | bbox=BoundingBox(
205 | x1=det["bbox"][0],
206 | y1=det["bbox"][1],
207 | x2=det["bbox"][2],
208 | y2=det["bbox"][3],
209 | ),
210 | confidence=det["confidence"],
211 | scale=det.get("scale"),
212 | )
213 | for i, det in enumerate(icon_detections)
214 | ],
215 | )
216 |
217 | # Run OCR if enabled
218 | if use_ocr:
219 | logger.info("Running OCR detection...")
220 | text_detections = self.ocr.detect_text(image=image, confidence_threshold=0.5)
221 | if text_detections is None:
222 | text_detections = []
223 | logger.info(f"Found {len(text_detections)} text regions")
224 |
225 | # Convert text detections to typed objects
226 | text_elements = cast(
227 | List[UIElement],
228 | [
229 | TextElement(
230 | id=len(elements) + i + 1,
231 | bbox=BoundingBox(
232 | x1=det["bbox"][0],
233 | y1=det["bbox"][1],
234 | x2=det["bbox"][2],
235 | y2=det["bbox"][3],
236 | ),
237 | content=det["content"],
238 | confidence=det["confidence"],
239 | )
240 | for i, det in enumerate(text_detections)
241 | ],
242 | )
243 |
244 | if elements and text_elements:
245 |                 # Drop icon (non-OCR) elements whose bounding box contains the center of any OCR text element
246 | filtered_elements = []
247 | for elem in elements: # elements at this point contains only non-OCR elements
248 | should_keep = True
249 | for text_elem in text_elements:
250 | # Calculate center point of the text element
251 | center_x = (text_elem.bbox.x1 + text_elem.bbox.x2) / 2
252 | center_y = (text_elem.bbox.y1 + text_elem.bbox.y2) / 2
253 |
254 | # Check if this center point is inside the non-OCR element
255 | if (
256 | center_x >= elem.bbox.x1
257 | and center_x <= elem.bbox.x2
258 | and center_y >= elem.bbox.y1
259 | and center_y <= elem.bbox.y2
260 | ):
261 | should_keep = False
262 | break
263 |
264 | if should_keep:
265 | filtered_elements.append(elem)
266 | elements = filtered_elements
267 |
268 | # Merge detections using NMS
269 | all_elements = elements + text_elements
270 | boxes = torch.tensor([elem.bbox.coordinates for elem in all_elements])
271 | scores = torch.tensor([elem.confidence for elem in all_elements])
272 | keep_indices = torchvision.ops.nms(boxes, scores, iou_threshold)
273 | elements = [all_elements[i] for i in keep_indices]
274 | else:
275 |                     # Nothing to merge via NMS; just append the text elements
276 | elements.extend(text_elements)
277 |
278 | # Calculate drawing parameters based on image size
279 | box_overlay_ratio = max(image.size) / 3200
280 | draw_config = {
281 | "font_size": int(12 * box_overlay_ratio),
282 | "box_thickness": max(int(2 * box_overlay_ratio), 1),
283 | "text_padding": max(int(3 * box_overlay_ratio), 1),
284 | }
285 |
286 | # Convert elements back to dict format for visualization
287 | detection_dicts = [
288 | {
289 | "type": elem.type,
290 | "bbox": elem.bbox.coordinates,
291 | "confidence": elem.confidence,
292 | "content": elem.content if isinstance(elem, TextElement) else None,
293 | }
294 | for elem in elements
295 | ]
296 |
297 | # Create visualization
298 | logger.info("Creating visualization...")
299 | annotated_image = self.visualizer.draw_boxes(
300 | image=image.copy(), detections=detection_dicts, draw_config=draw_config
301 | )
302 | logger.info("Visualization complete")
303 |
304 | return annotated_image, elements
305 |
306 | except Exception as e:
307 | logger.error(f"Error in process_image: {str(e)}")
308 | import traceback
309 |
310 | logger.error(traceback.format_exc())
311 | raise
312 |
313 | def parse(
314 | self,
315 | screenshot_data: Union[bytes, str],
316 | box_threshold: float = 0.3,
317 | iou_threshold: float = 0.1,
318 | use_ocr: bool = True,
319 | ) -> ParseResult:
320 | """Parse a UI screenshot to detect interactive elements and text.
321 |
322 | Args:
323 | screenshot_data: Raw bytes or base64 string of the screenshot
324 | box_threshold: Confidence threshold for detection
325 | iou_threshold: IOU threshold for NMS
326 | use_ocr: Whether to enable OCR processing
327 |
328 | Returns:
329 | ParseResult object containing elements, annotated image, and metadata
330 | """
331 | try:
332 | start_time = time.time()
333 |
334 | # Convert input to PIL Image
335 | if isinstance(screenshot_data, str):
336 | screenshot_data = base64.b64decode(screenshot_data)
337 | image = Image.open(io.BytesIO(screenshot_data)).convert("RGB")
338 |
339 | # Process image
340 | annotated_image, elements = self.process_image(
341 | image=image,
342 | box_threshold=box_threshold,
343 | iou_threshold=iou_threshold,
344 | use_ocr=use_ocr,
345 | )
346 |
347 | # Convert annotated image to base64
348 | buffered = io.BytesIO()
349 | annotated_image.save(buffered, format="PNG")
350 | annotated_image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
351 |
352 | # Generate screen info text
353 | screen_info = []
354 | parsed_content_list = []
355 |
356 | # Set element IDs and generate human-readable descriptions
357 | for i, elem in enumerate(elements):
358 | # Set the ID (1-indexed)
359 | elem.id = i + 1
360 |
361 | if isinstance(elem, IconElement):
362 | screen_info.append(
363 | f"Box #{i+1}: Icon (confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates})"
364 | )
365 | parsed_content_list.append(
366 | {
367 | "id": i + 1,
368 | "type": "icon",
369 | "bbox": elem.bbox.coordinates,
370 | "confidence": elem.confidence,
371 | "content": None,
372 | }
373 | )
374 | elif isinstance(elem, TextElement):
375 | screen_info.append(
376 | f"Box #{i+1}: Text '{elem.content}' (confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates})"
377 | )
378 | parsed_content_list.append(
379 | {
380 | "id": i + 1,
381 | "type": "text",
382 | "bbox": elem.bbox.coordinates,
383 | "confidence": elem.confidence,
384 | "content": elem.content,
385 | }
386 | )
387 |
388 | # Calculate metadata
389 | latency = time.time() - start_time
390 | width, height = image.size
391 |
392 | # Create ParseResult object with enhanced properties
393 | result = ParseResult(
394 | elements=elements,
395 | annotated_image_base64=annotated_image_base64,
396 | screen_info=screen_info,
397 | parsed_content_list=parsed_content_list,
398 | metadata=ParserMetadata(
399 | image_size=(width, height),
400 | num_icons=len([e for e in elements if isinstance(e, IconElement)]),
401 | num_text=len([e for e in elements if isinstance(e, TextElement)]),
402 | device=self.detector.device,
403 | ocr_enabled=use_ocr,
404 | latency=latency,
405 | ),
406 | )
407 |
408 | # Return the ParseResult object directly
409 | return result
410 |
411 | except Exception as e:
412 | logger.error(f"Error in parse: {str(e)}")
413 | import traceback
414 |
415 | logger.error(traceback.format_exc())
416 | raise
417 |
418 |
419 | def main():
420 | """Command line interface for UI element detection."""
421 | parser = argparse.ArgumentParser(description="Detect UI elements and text in images")
422 | parser.add_argument("image_path", help="Path to the input image")
423 | parser.add_argument("--model-path", help="Path to YOLO model")
424 | parser.add_argument(
425 | "--box-threshold", type=float, default=0.3, help="Box confidence threshold (default: 0.3)"
426 | )
427 | parser.add_argument(
428 | "--iou-threshold", type=float, default=0.1, help="IOU threshold (default: 0.1)"
429 | )
430 | parser.add_argument(
431 | "--ocr", action="store_true", default=True, help="Enable OCR processing (default: True)"
432 | )
433 | parser.add_argument("--output", help="Output path for annotated image")
434 | args = parser.parse_args()
435 |
436 | # Setup logging
437 | logging.basicConfig(level=logging.INFO)
438 |
439 | try:
440 | # Initialize parser
441 | parser = OmniParser(model_path=args.model_path)
442 |
443 | # Load and process image
444 | logger.info(f"Loading image from: {args.image_path}")
445 | image = Image.open(args.image_path).convert("RGB")
446 | logger.info(f"Image loaded successfully, size: {image.size}")
447 |
448 | # Process image
449 | annotated_image, elements = parser.process_image(
450 | image=image,
451 | box_threshold=args.box_threshold,
452 | iou_threshold=args.iou_threshold,
453 | use_ocr=args.ocr,
454 | )
455 |
456 | # Save output image
457 | output_path = args.output or str(
458 | Path(args.image_path).parent
459 | / f"{Path(args.image_path).stem}_analyzed{Path(args.image_path).suffix}"
460 | )
461 | logger.info(f"Saving annotated image to: {output_path}")
462 |
463 | Path(output_path).parent.mkdir(parents=True, exist_ok=True)
464 | annotated_image.save(output_path)
465 | logger.info(f"Image saved successfully to {output_path}")
466 |
467 | # Print detections
468 | logger.info("\nDetections:")
469 | for i, elem in enumerate(elements):
470 | if isinstance(elem, IconElement):
471 | logger.info(
472 | f"Interactive element {i}: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
473 | )
474 | elif isinstance(elem, TextElement):
475 | logger.info(f"Text {i}: '{elem.content}', bbox={elem.bbox.coordinates}")
476 |
477 | except Exception as e:
478 | logger.error(f"Error processing image: {str(e)}")
479 | import traceback
480 |
481 | logger.error(traceback.format_exc())
482 | return 1
483 |
484 | return 0
485 |
486 |
487 | if __name__ == "__main__":
488 | import sys
489 |
490 | sys.exit(main())
491 |
```
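
For a sense of how the pieces above fit together, here is a minimal usage sketch of the `OmniParser` API (the screenshot path is illustrative; `parse` accepts raw bytes or a base64 string, as its docstring notes):

```python
import base64
from pathlib import Path

from som import OmniParser

parser = OmniParser()  # wires up the detector, OCR processor, and annotator
image_bytes = Path("screenshot.png").read_bytes()  # illustrative input

result = parser.parse(image_bytes, box_threshold=0.3, iou_threshold=0.1, use_ocr=True)
print(
    f"{result.metadata.num_icons} icons, {result.metadata.num_text} text elements "
    f"in {result.metadata.latency:.2f}s on {result.metadata.device}"
)

# The annotated screenshot comes back base64-encoded.
Path("screenshot_analyzed.png").write_bytes(
    base64.b64decode(result.annotated_image_base64)
)
```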
--------------------------------------------------------------------------------
/libs/python/agent/agent/cli.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | CLI chat interface for agent - Computer Use Agent
3 |
4 | Usage:
5 | python -m agent.cli <model_string>
6 |
7 | Examples:
8 | python -m agent.cli openai/computer-use-preview
9 | python -m agent.cli anthropic/claude-sonnet-4-5-20250929
10 | python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
11 | """
12 |
13 | try:
14 | import argparse
15 | import asyncio
16 | import base64
17 | import json
18 | import os
19 | import platform
20 | import sys
21 | import time
22 | from pathlib import Path
23 | from typing import Any, Dict, List
24 |
25 | import dotenv
26 |
27 | try:
28 | from PIL import Image, ImageDraw
29 |
30 | PIL_AVAILABLE = True
31 | except Exception:
32 | PIL_AVAILABLE = False
33 | from yaspin import yaspin
34 | except ImportError:
35 | if __name__ == "__main__":
36 | raise ImportError(
37 | "CLI dependencies not found. " 'Please install with: pip install "cua-agent[cli]"'
38 | )
39 |
40 | # Load environment variables
41 | dotenv.load_dotenv()
42 |
43 |
44 | # Color codes for terminal output
45 | class Colors:
46 | RESET = "\033[0m"
47 | BOLD = "\033[1m"
48 | DIM = "\033[2m"
49 |
50 | # Text colors
51 | RED = "\033[31m"
52 | GREEN = "\033[32m"
53 | YELLOW = "\033[33m"
54 | BLUE = "\033[34m"
55 | MAGENTA = "\033[35m"
56 | CYAN = "\033[36m"
57 | WHITE = "\033[37m"
58 | GRAY = "\033[90m"
59 |
60 | # Background colors
61 | BG_RED = "\033[41m"
62 | BG_GREEN = "\033[42m"
63 | BG_YELLOW = "\033[43m"
64 | BG_BLUE = "\033[44m"
65 |
66 |
67 | def print_colored(
68 | text: str,
69 | color: str = "",
70 | bold: bool = False,
71 | dim: bool = False,
72 | end: str = "\n",
73 | right: str = "",
74 | ):
75 | """Print colored text to terminal with optional right-aligned text."""
76 | prefix = ""
77 | if bold:
78 | prefix += Colors.BOLD
79 | if dim:
80 | prefix += Colors.DIM
81 | if color:
82 | prefix += color
83 |
84 | if right:
85 | # Get terminal width (default to 80 if unable to determine)
86 | try:
87 | import shutil
88 |
89 | terminal_width = shutil.get_terminal_size().columns
90 |         except Exception:
91 | terminal_width = 80
92 |
93 | # Add right margin
94 | terminal_width -= 1
95 |
96 | # Calculate padding needed
97 | # Account for ANSI escape codes not taking visual space
98 | visible_left_len = len(text)
99 | visible_right_len = len(right)
100 | padding = terminal_width - visible_left_len - visible_right_len
101 |
102 | if padding > 0:
103 | output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
104 | else:
105 | # If not enough space, just put a single space between
106 | output = f"{prefix}{text} {right}{Colors.RESET}"
107 | else:
108 | output = f"{prefix}{text}{Colors.RESET}"
109 |
110 | print(output, end=end)
111 |
112 |
113 | def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
114 | """Print computer action with nice formatting."""
115 | # Format action details
116 | args_str = ""
117 | if action_type == "click" and "x" in details and "y" in details:
118 | args_str = f"_{details.get('button', 'left')}({details['x']}, {details['y']})"
119 | elif action_type == "type" and "text" in details:
120 | text = details["text"]
121 | if len(text) > 50:
122 | text = text[:47] + "..."
123 | args_str = f'("{text}")'
124 | elif action_type == "key" and "text" in details:
125 | args_str = f"('{details['text']}')"
126 | elif action_type == "scroll" and "x" in details and "y" in details:
127 | args_str = f"({details['x']}, {details['y']})"
128 |
129 | if total_cost > 0:
130 | print_colored(f"🛠️ {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
131 | else:
132 | print_colored(f"🛠️ {action_type}{args_str}", dim=True)
133 |
134 |
135 | def print_welcome(model: str, agent_loop: str, container_name: str):
136 | """Print welcome message."""
137 | print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
138 | print_colored("Type 'exit' to quit.", dim=True)
139 |
140 |
141 | async def ainput(prompt: str = ""):
142 | return await asyncio.to_thread(input, prompt)
143 |
144 |
145 | async def chat_loop(
146 | agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True
147 | ):
148 | """Main chat loop with the agent."""
149 | print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
150 |
151 | history = []
152 |
153 | if initial_prompt:
154 | history.append({"role": "user", "content": initial_prompt})
155 |
156 | total_cost = 0
157 |
158 | while True:
159 | if len(history) == 0 or history[-1].get("role") != "user":
160 | # Get user input with prompt
161 | print_colored("> ", end="")
162 | user_input = await ainput()
163 |
164 | if user_input.lower() in ["exit", "quit", "q"]:
165 | print_colored("\n👋 Goodbye!")
166 | break
167 |
168 | if not user_input:
169 | continue
170 |
171 | # Add user message to history
172 | history.append({"role": "user", "content": user_input})
173 |
174 | # Stream responses from the agent with spinner
175 | with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
176 | spinner.hide()
177 |
178 | async for result in agent.run(history):
179 | # Add agent responses to history
180 | history.extend(result.get("output", []))
181 |
182 | if show_usage:
183 | total_cost += result.get("usage", {}).get("response_cost", 0)
184 |
185 | # Process and display the output
186 | for item in result.get("output", []):
187 | if item.get("type") == "message" and item.get("role") == "assistant":
188 | # Display agent text response
189 | content = item.get("content", [])
190 | for content_part in content:
191 | if content_part.get("text"):
192 | text = content_part.get("text", "").strip()
193 | if text:
194 | spinner.hide()
195 | print_colored(text)
196 |
197 | elif item.get("type") == "computer_call":
198 | # Display computer action
199 | action = item.get("action", {})
200 | action_type = action.get("type", "")
201 | if action_type:
202 | spinner.hide()
203 | print_action(action_type, action, total_cost)
204 | spinner.text = f"Performing {action_type}..."
205 | spinner.show()
206 |
207 | elif item.get("type") == "function_call":
208 | # Display function call
209 | function_name = item.get("name", "")
210 | spinner.hide()
211 | print_colored(f"🔧 Calling function: {function_name}", dim=True)
212 | spinner.text = f"Calling {function_name}..."
213 | spinner.show()
214 |
215 | elif item.get("type") == "function_call_output":
216 | # Display function output (dimmed)
217 | output = item.get("output", "")
218 | if output and len(output.strip()) > 0:
219 | spinner.hide()
220 | print_colored(f"📤 {output}", dim=True)
221 |
222 | spinner.hide()
223 | if show_usage and total_cost > 0:
224 | print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
225 |
226 |
227 | async def main():
228 | """Main CLI function."""
229 | parser = argparse.ArgumentParser(
230 | description="CUA Agent CLI - Interactive computer use assistant",
231 | formatter_class=argparse.RawDescriptionHelpFormatter,
232 | epilog="""
233 | Examples:
234 | python -m agent.cli openai/computer-use-preview
235 | python -m agent.cli anthropic/claude-sonnet-4-5-20250929
236 | python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
237 | python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
238 | """,
239 | )
240 |
241 | parser.add_argument(
242 | "model",
243 | help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-sonnet-4-5-20250929')",
244 | )
245 |
246 | parser.add_argument(
247 | "--provider",
248 | choices=["cloud", "lume", "winsandbox", "docker"],
249 | default="cloud",
250 | help="Computer provider to use: cloud (default), lume, winsandbox, or docker",
251 | )
252 |
253 | parser.add_argument(
254 | "--images",
255 | type=int,
256 | default=3,
257 | help="Number of recent images to keep in context (default: 3)",
258 | )
259 |
260 | parser.add_argument("--trajectory", action="store_true", help="Save trajectory for debugging")
261 |
262 | parser.add_argument("--budget", type=float, help="Maximum budget for the session (in dollars)")
263 |
264 | parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
265 |
266 | parser.add_argument(
267 | "-p",
268 | "--prompt",
269 | type=str,
270 | help="Initial prompt to send to the agent. Leave blank for interactive mode.",
271 | )
272 |
273 | parser.add_argument(
274 | "--prompt-file",
275 | type=Path,
276 | help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt.",
277 | )
278 |
279 | parser.add_argument(
280 | "--predict-click",
281 | dest="predict_click",
282 | type=str,
283 | help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it.",
284 | )
285 |
286 | parser.add_argument("-c", "--cache", action="store_true", help="Tell the API to enable caching")
287 |
288 | parser.add_argument(
289 | "-u", "--usage", action="store_true", help="Show total cost of the agent runs"
290 | )
291 |
292 | parser.add_argument(
293 | "-r",
294 | "--max-retries",
295 | type=int,
296 | default=3,
297 | help="Maximum number of retries for the LLM API calls",
298 | )
299 |
300 | # Provider override credentials
301 | parser.add_argument(
302 | "--api-key",
303 | dest="api_key",
304 | type=str,
305 | help="API key override for the model provider (passed to ComputerAgent)",
306 | )
307 | parser.add_argument(
308 | "--api-base",
309 | dest="api_base",
310 | type=str,
311 | help="API base URL override for the model provider (passed to ComputerAgent)",
312 | )
313 |
314 | args = parser.parse_args()
315 |
316 | # Check for required environment variables
317 | container_name = os.getenv("CUA_CONTAINER_NAME")
318 | cua_api_key = os.getenv("CUA_API_KEY")
319 |
320 |     # Prompt for missing environment variables (cloud requires a container name; other providers fall back to a default)
321 | if not container_name:
322 | if args.provider == "cloud":
323 | print_colored("CUA_CONTAINER_NAME not set.", dim=True)
324 | print_colored("You can get a CUA container at https://cua.ai/", dim=True)
325 | container_name = input("Enter your CUA container name: ").strip()
326 | if not container_name:
327 | print_colored("❌ Container name is required.")
328 | sys.exit(1)
329 | else:
330 | container_name = "cli-sandbox"
331 |
332 | # Only require API key for cloud provider
333 | if args.provider == "cloud" and not cua_api_key:
334 | print_colored("CUA_API_KEY not set.", dim=True)
335 | cua_api_key = input("Enter your CUA API key: ").strip()
336 | if not cua_api_key:
337 | print_colored("❌ API key is required for cloud provider.")
338 | sys.exit(1)
339 |
340 | # Check for provider-specific API keys based on model
341 | provider_api_keys = {
342 | "openai/": "OPENAI_API_KEY",
343 | "anthropic/": "ANTHROPIC_API_KEY",
344 | }
345 |
346 | # Find matching provider and check for API key
347 | for prefix, env_var in provider_api_keys.items():
348 | if prefix in args.model:
349 | if not os.getenv(env_var):
350 | print_colored(f"{env_var} not set.", dim=True)
351 | api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
352 | if not api_key:
353 | print_colored(f"❌ {env_var.replace('_', ' ').title()} is required.")
354 | sys.exit(1)
355 | # Set the environment variable for the session
356 | os.environ[env_var] = api_key
357 | break
358 |
359 | # Import here to avoid import errors if dependencies are missing
360 | try:
361 | from agent import ComputerAgent
362 | from computer import Computer
363 | except ImportError as e:
364 | print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
365 | print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
366 | sys.exit(1)
367 |
368 | # Resolve provider -> os_type, provider_type, api key requirement
369 | provider_map = {
370 | "cloud": ("linux", "cloud", True),
371 | "lume": ("macos", "lume", False),
372 | "winsandbox": ("windows", "winsandbox", False),
373 | "docker": ("linux", "docker", False),
374 | }
375 | os_type, provider_type, needs_api_key = provider_map[args.provider]
376 |
377 | computer_kwargs = {
378 | "os_type": os_type,
379 | "provider_type": provider_type,
380 | "name": container_name,
381 | }
382 | if needs_api_key:
383 | computer_kwargs["api_key"] = cua_api_key # type: ignore
384 |
385 | # Create computer instance
386 | async with Computer(**computer_kwargs) as computer: # type: ignore
387 |
388 | # Create agent
389 | agent_kwargs = {
390 | "model": args.model,
391 | "tools": [computer],
392 | "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
393 | "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
394 | "max_retries": args.max_retries,
395 | }
396 |
397 | # Thread API credentials to agent if provided
398 | if args.api_key:
399 | agent_kwargs["api_key"] = args.api_key
400 | if args.api_base:
401 | agent_kwargs["api_base"] = args.api_base
402 |
403 | if args.images > 0:
404 | agent_kwargs["only_n_most_recent_images"] = args.images
405 |
406 | if args.trajectory:
407 | agent_kwargs["trajectory_dir"] = "trajectories"
408 |
409 | if args.budget:
410 | agent_kwargs["max_trajectory_budget"] = {
411 | "max_budget": args.budget,
412 | "raise_error": True,
413 | "reset_after_each_run": False,
414 | }
415 |
416 | if args.cache:
417 | agent_kwargs["use_prompt_caching"] = True
418 |
419 | agent = ComputerAgent(**agent_kwargs)
420 |
421 | # If predict-click mode is requested, run once and exit
422 | if args.predict_click:
423 | if not PIL_AVAILABLE:
424 | print_colored(
425 | "❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow",
426 | Colors.RED,
427 | bold=True,
428 | )
429 | sys.exit(1)
430 |
431 | instruction = args.predict_click
432 | print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN)
433 |
434 | # Take a fresh screenshot FIRST
435 | try:
436 | img_bytes = await computer.interface.screenshot()
437 | except Exception as e:
438 | print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True)
439 | sys.exit(1)
440 |
441 | # Encode screenshot to base64 for predict_click
442 | try:
443 | image_b64 = base64.b64encode(img_bytes).decode("utf-8")
444 | except Exception as e:
445 | print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True)
446 | sys.exit(1)
447 |
448 | try:
449 | coords = await agent.predict_click(instruction, image_b64=image_b64)
450 | except Exception as e:
451 | print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True)
452 | sys.exit(1)
453 |
454 | if not coords:
455 | print_colored("⚠️ No coordinates returned.", Colors.YELLOW)
456 | sys.exit(2)
457 |
458 | x, y = coords
459 | print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN)
460 |
461 | try:
462 | from io import BytesIO
463 |
464 | with Image.open(BytesIO(img_bytes)) as img:
465 | img = img.convert("RGB")
466 | draw = ImageDraw.Draw(img)
467 | # Draw crosshair
468 | size = 12
469 | color = (255, 0, 0)
470 | draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
471 | draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
472 | # Optional small circle
473 | r = 6
474 | draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
475 |
476 | out_path = Path.cwd() / f"predict_click_{int(time.time())}.png"
477 | img.save(out_path)
478 | print_colored(f"🖼️ Saved to {out_path}")
479 |
480 | # Open the image with default viewer
481 | try:
482 | system = platform.system().lower()
483 | if system == "windows":
484 | os.startfile(str(out_path)) # type: ignore[attr-defined]
485 | elif system == "darwin":
486 | os.system(f'open "{out_path}"')
487 | else:
488 | os.system(f'xdg-open "{out_path}"')
489 | except Exception:
490 | pass
491 | except Exception as e:
492 | print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True)
493 | sys.exit(1)
494 |
495 | # Done
496 | sys.exit(0)
497 |
498 | # Resolve initial prompt from --prompt-file or --prompt
499 | initial_prompt = args.prompt or ""
500 | if args.prompt_file:
501 | try:
502 | initial_prompt = args.prompt_file.read_text(encoding="utf-8")
503 | except Exception as e:
504 | print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
505 | sys.exit(1)
506 |
507 | # Start chat loop (default interactive mode)
508 | await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
509 |
510 |
511 | if __name__ == "__main__":
512 | try:
513 | asyncio.run(main())
514 | except (KeyboardInterrupt, EOFError) as _:
515 | print_colored("\n\n👋 Goodbye!")
516 |
```
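
The same loop the CLI runs interactively can also be driven programmatically. A minimal sketch mirroring the CLI's default cloud setup (the model string and task are illustrative; `CUA_CONTAINER_NAME` and `CUA_API_KEY` must be set, as above):

```python
import asyncio
import os

from agent import ComputerAgent
from computer import Computer


async def main() -> None:
    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=os.environ["CUA_CONTAINER_NAME"],
        api_key=os.environ["CUA_API_KEY"],
    ) as computer:
        agent = ComputerAgent(
            model="anthropic/claude-sonnet-4-5-20250929",  # illustrative model
            tools=[computer],
            only_n_most_recent_images=3,
            max_retries=3,
        )
        history = [{"role": "user", "content": "Open the browser"}]  # illustrative task
        async for result in agent.run(history):
            history.extend(result.get("output", []))
            for item in result.get("output", []):
                if item.get("type") == "message" and item.get("role") == "assistant":
                    for part in item.get("content", []):
                        if part.get("text"):
                            print(part["text"])


asyncio.run(main())
```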
--------------------------------------------------------------------------------
/libs/python/computer/computer/helpers.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Helper functions and decorators for the Computer module.
3 | """
4 |
5 | import ast
6 | import asyncio
7 | import builtins
8 | import importlib.util
9 | import inspect
10 | import logging
11 | import os
12 | import sys
13 | from functools import wraps
14 | from inspect import getsource
15 | from textwrap import dedent
16 | from types import FunctionType, ModuleType
17 | from typing import Any, Awaitable, Callable, Dict, List, Set, TypedDict, TypeVar
18 |
19 | try:
20 | # Python 3.12+ has ParamSpec in typing
21 | from typing import ParamSpec
22 | except ImportError: # pragma: no cover
23 | # Fallback for environments without ParamSpec in typing
24 | from typing_extensions import ParamSpec # type: ignore
25 |
26 | P = ParamSpec("P")
27 | R = TypeVar("R")
28 |
29 |
30 | class DependencyInfo(TypedDict):
31 | import_statements: List[str]
32 | definitions: List[tuple[str, Any]]
33 |
34 |
35 | # Global reference to the default computer instance
36 | _default_computer = None
37 |
38 | # Global cache for function dependency analysis
39 | _function_dependency_map: Dict[FunctionType, DependencyInfo] = {}
40 |
41 | logger = logging.getLogger(__name__)
42 |
43 |
44 | def set_default_computer(computer: Any) -> None:
45 | """
46 | Set the default computer instance to be used by the remote decorator.
47 |
48 | Args:
49 | computer: The computer instance to use as default
50 | """
51 | global _default_computer
52 | _default_computer = computer
53 |
54 |
55 | def sandboxed(
56 | venv_name: str = "default",
57 | computer: str = "default",
58 | max_retries: int = 3,
59 | ) -> Callable[[Callable[P, R]], Callable[P, Awaitable[R]]]:
60 | """
61 |     Decorator that wraps a function to be executed remotely via computer.venv_exec.
62 |
63 | The function is automatically analyzed for dependencies (imports, helper functions,
64 | constants, etc.) and reconstructed with all necessary code in the remote sandbox.
65 |
66 | Args:
67 | venv_name: Name of the virtual environment to execute in
68 | computer: The computer instance to use, or "default" to use the globally set default
69 | max_retries: Maximum number of retries for the remote execution
70 | """
71 |
72 | def decorator(func: Callable[P, R]) -> Callable[P, Awaitable[R]]:
73 | @wraps(func)
74 | async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
75 | # Determine which computer instance to use
76 | comp = computer if computer != "default" else _default_computer
77 |
78 | if comp is None:
79 | raise RuntimeError(
80 | "No computer instance available. Either specify a computer instance or call set_default_computer() first."
81 | )
82 |
83 | for i in range(max_retries):
84 | try:
85 | return await comp.venv_exec(venv_name, func, *args, **kwargs)
86 | except Exception as e:
87 |                     logger.error(f"Attempt {i+1} failed: {e}")
88 |                     if i == max_retries - 1:
89 |                         raise
90 |                     await asyncio.sleep(1)
91 |
92 | # Should be unreachable because we either returned or raised
93 | raise RuntimeError("sandboxed wrapper reached unreachable code path")
94 |
95 | return wrapper
96 |
97 | return decorator
98 |
99 |
100 | def _extract_import_statement(name: str, module: ModuleType) -> str:
101 | """Extract the original import statement for a module."""
102 | module_name = module.__name__
103 |
104 | if name == module_name.split(".")[0]:
105 | return f"import {module_name}"
106 | else:
107 | return f"import {module_name} as {name}"
108 |
109 |
110 | def _is_third_party_module(module_name: str) -> bool:
111 | """Check if a module is a third-party module."""
112 | stdlib_modules = set(sys.stdlib_module_names) if hasattr(sys, "stdlib_module_names") else set()
113 |
114 | if module_name in stdlib_modules:
115 | return False
116 |
117 | try:
118 | spec = importlib.util.find_spec(module_name)
119 | if spec is None:
120 | return False
121 |
122 | if spec.origin and ("site-packages" in spec.origin or "dist-packages" in spec.origin):
123 | return True
124 |
125 | return False
126 | except (ImportError, ModuleNotFoundError, ValueError):
127 | return False
128 |
129 |
130 | def _is_project_import(module_name: str) -> bool:
131 | """Check if a module is a project-level import."""
132 | if module_name.startswith("__relative_import_level_"):
133 | return True
134 |
135 | if module_name in sys.modules:
136 | module = sys.modules[module_name]
137 | if hasattr(module, "__file__") and module.__file__:
138 | if "site-packages" not in module.__file__ and "dist-packages" not in module.__file__:
139 | cwd = os.getcwd()
140 | if module.__file__.startswith(cwd):
141 | return True
142 |
143 | return False
144 |
145 |
146 | def _categorize_module(module_name: str) -> str:
147 | """Categorize a module as stdlib, third-party, or project."""
148 | if module_name.startswith("__relative_import_level_"):
149 | return "project"
150 | elif module_name in (
151 | set(sys.stdlib_module_names) if hasattr(sys, "stdlib_module_names") else set()
152 | ):
153 | return "stdlib"
154 | elif _is_third_party_module(module_name):
155 | return "third_party"
156 | elif _is_project_import(module_name):
157 | return "project"
158 | else:
159 | return "unknown"
160 |
161 |
162 | class _DependencyVisitor(ast.NodeVisitor):
163 | """AST visitor to extract imports and name references from a function."""
164 |
165 | def __init__(self, function_name: str) -> None:
166 | self.function_name = function_name
167 | self.internal_imports: Set[str] = set()
168 | self.internal_import_statements: List[str] = []
169 | self.name_references: Set[str] = set()
170 | self.local_names: Set[str] = set()
171 | self.inside_function = False
172 |
173 | def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
174 | if node.name == self.function_name and not self.inside_function:
175 | self.inside_function = True
176 |
177 | for arg in node.args.args + node.args.posonlyargs + node.args.kwonlyargs:
178 | self.local_names.add(arg.arg)
179 | if node.args.vararg:
180 | self.local_names.add(node.args.vararg.arg)
181 | if node.args.kwarg:
182 | self.local_names.add(node.args.kwarg.arg)
183 |
184 | for child in node.body:
185 | self.visit(child)
186 |
187 | self.inside_function = False
188 | else:
189 | if self.inside_function:
190 | self.local_names.add(node.name)
191 | for child in node.body:
192 | self.visit(child)
193 |
194 | def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
195 | self.visit_FunctionDef(node) # type: ignore
196 |
197 | def visit_Import(self, node: ast.Import) -> None:
198 | if self.inside_function:
199 | for alias in node.names:
200 | module_name = alias.name.split(".")[0]
201 | self.internal_imports.add(module_name)
202 | imported_as = alias.asname if alias.asname else alias.name.split(".")[0]
203 | self.local_names.add(imported_as)
204 | self.internal_import_statements.append(ast.unparse(node))
205 | self.generic_visit(node)
206 |
207 | def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
208 | if self.inside_function:
209 | if node.level == 0 and node.module:
210 | module_name = node.module.split(".")[0]
211 | self.internal_imports.add(module_name)
212 | elif node.level > 0:
213 | self.internal_imports.add(f"__relative_import_level_{node.level}__")
214 |
215 | for alias in node.names:
216 | imported_as = alias.asname if alias.asname else alias.name
217 | self.local_names.add(imported_as)
218 | self.internal_import_statements.append(ast.unparse(node))
219 |
220 | self.generic_visit(node)
221 |
222 | def visit_Name(self, node: ast.Name) -> None:
223 | if self.inside_function:
224 | if isinstance(node.ctx, ast.Load):
225 | self.name_references.add(node.id)
226 | elif isinstance(node.ctx, ast.Store):
227 | self.local_names.add(node.id)
228 | self.generic_visit(node)
229 |
230 | def visit_ClassDef(self, node: ast.ClassDef) -> None:
231 | if self.inside_function:
232 | self.local_names.add(node.name)
233 | self.generic_visit(node)
234 |
235 | def visit_For(self, node: ast.For) -> None:
236 | if self.inside_function and isinstance(node.target, ast.Name):
237 | self.local_names.add(node.target.id)
238 | self.generic_visit(node)
239 |
240 | def visit_comprehension(self, node: ast.comprehension) -> None:
241 | if self.inside_function and isinstance(node.target, ast.Name):
242 | self.local_names.add(node.target.id)
243 | self.generic_visit(node)
244 |
245 | def visit_ExceptHandler(self, node: ast.ExceptHandler) -> None:
246 | if self.inside_function and node.name:
247 | self.local_names.add(node.name)
248 | self.generic_visit(node)
249 |
250 | def visit_With(self, node: ast.With) -> None:
251 | if self.inside_function:
252 | for item in node.items:
253 | if item.optional_vars and isinstance(item.optional_vars, ast.Name):
254 | self.local_names.add(item.optional_vars.id)
255 | self.generic_visit(node)
256 |
257 |
258 | def _traverse_and_collect_dependencies(func: FunctionType) -> DependencyInfo:
259 | """
260 | Traverse a function and collect its dependencies.
261 |
262 | Returns a dict with:
263 | - import_statements: List of import statements needed
264 | - definitions: List of (name, obj) tuples for helper functions/classes/constants
265 | """
266 | source = dedent(getsource(func))
267 | tree = ast.parse(source)
268 |
269 | visitor = _DependencyVisitor(func.__name__)
270 | visitor.visit(tree)
271 |
272 | builtin_names = set(dir(builtins))
273 | external_refs = (visitor.name_references - visitor.local_names) - builtin_names
274 |
275 | import_statements = []
276 | definitions = []
277 | visited = set()
278 |
279 | # Include all internal import statements
280 | import_statements.extend(visitor.internal_import_statements)
281 |
282 | # Analyze external references recursively
283 | def analyze_object(obj: Any, name: str, depth: int = 0) -> None:
284 | if depth > 20:
285 | return
286 |
287 | obj_id = id(obj)
288 | if obj_id in visited:
289 | return
290 | visited.add(obj_id)
291 |
292 | # Handle modules
293 | if inspect.ismodule(obj):
294 | import_stmt = _extract_import_statement(name, obj)
295 | import_statements.append(import_stmt)
296 | return
297 |
298 | # Handle functions and classes
299 | if (
300 | inspect.isfunction(obj)
301 | or inspect.isclass(obj)
302 | or inspect.isbuiltin(obj)
303 | or inspect.ismethod(obj)
304 | ):
305 | obj_module = getattr(obj, "__module__", None)
306 | if obj_module:
307 | base_module = obj_module.split(".")[0]
308 | module_category = _categorize_module(base_module)
309 |
310 | # If from stdlib/third-party, just add import
311 | if module_category in ("stdlib", "third_party"):
312 | obj_name = getattr(obj, "__name__", name)
313 |
314 | # Check if object is accessible by 'name' (in globals or closures)
315 | is_accessible = False
316 | if name in func.__globals__ and func.__globals__[name] is obj:
317 | is_accessible = True
318 | elif func.__closure__ and hasattr(func, "__code__"):
319 | freevars = func.__code__.co_freevars
320 | for i, var_name in enumerate(freevars):
321 | if var_name == name and i < len(func.__closure__):
322 | try:
323 | if func.__closure__[i].cell_contents is obj:
324 | is_accessible = True
325 | break
326 | except (ValueError, AttributeError):
327 | pass
328 |
329 | if is_accessible and name == obj_name:
330 | # Direct import: from requests import get, from math import sqrt
331 | import_statements.append(f"from {base_module} import {name}")
332 | else:
333 | # Module import: import requests
334 | import_statements.append(f"import {base_module}")
335 | return
336 |
337 | try:
338 | obj_tree = ast.parse(dedent(getsource(obj)))
339 | obj_visitor = _DependencyVisitor(obj.__name__)
340 | obj_visitor.visit(obj_tree)
341 |
342 | obj_external_refs = obj_visitor.name_references - obj_visitor.local_names
343 | obj_external_refs = obj_external_refs - builtin_names
344 |
345 | # Add internal imports from this object
346 | import_statements.extend(obj_visitor.internal_import_statements)
347 |
348 | # Recursively analyze its dependencies
349 | obj_globals = getattr(obj, "__globals__", None)
350 | obj_closure = getattr(obj, "__closure__", None)
351 | obj_code = getattr(obj, "__code__", None)
352 | if obj_globals:
353 | for ref_name in obj_external_refs:
354 | ref_obj = None
355 |
356 | # Check globals first
357 | if ref_name in obj_globals:
358 | ref_obj = obj_globals[ref_name]
359 | # Check closure variables using co_freevars
360 | elif obj_closure and obj_code:
361 | freevars = obj_code.co_freevars
362 | for i, var_name in enumerate(freevars):
363 | if var_name == ref_name and i < len(obj_closure):
364 | try:
365 | ref_obj = obj_closure[i].cell_contents
366 | break
367 | except (ValueError, AttributeError):
368 | pass
369 |
370 | if ref_obj is not None:
371 | analyze_object(ref_obj, ref_name, depth + 1)
372 |
373 | # Add this object to definitions
374 | if not inspect.ismodule(obj):
375 | ref_module = getattr(obj, "__module__", None)
376 | if ref_module:
377 | ref_base_module = ref_module.split(".")[0]
378 | ref_category = _categorize_module(ref_base_module)
379 | if ref_category not in ("stdlib", "third_party"):
380 | definitions.append((name, obj))
381 | else:
382 | definitions.append((name, obj))
383 |
384 | except (OSError, TypeError):
385 | pass
386 | return
387 |
388 | if isinstance(obj, (int, float, str, bool, list, dict, tuple, set, frozenset, type(None))):
389 | definitions.append((name, obj))
390 |
391 | # Analyze all external references
392 | for name in external_refs:
393 | obj = None
394 |
395 | # First check globals
396 | if name in func.__globals__:
397 | obj = func.__globals__[name]
398 | # Then check closure variables (sibling functions in enclosing scope)
399 | elif func.__closure__ and func.__code__.co_freevars:
400 | # Match closure variable names with cell contents
401 | freevars = func.__code__.co_freevars
402 | for i, var_name in enumerate(freevars):
403 | if var_name == name and i < len(func.__closure__):
404 | try:
405 | obj = func.__closure__[i].cell_contents
406 | break
407 | except (ValueError, AttributeError):
408 | # Cell is empty or doesn't have contents
409 | pass
410 |
411 | if obj is not None:
412 | analyze_object(obj, name)
413 |
414 | # Remove duplicate import statements
415 | unique_imports = []
416 | seen = set()
417 | for stmt in import_statements:
418 | if stmt not in seen:
419 | seen.add(stmt)
420 | unique_imports.append(stmt)
421 |
422 | # Remove duplicate definitions
423 | unique_definitions = []
424 | seen_names = set()
425 | for name, obj in definitions:
426 | if name not in seen_names:
427 | seen_names.add(name)
428 | unique_definitions.append((name, obj))
429 |
430 | return {
431 | "import_statements": unique_imports,
432 | "definitions": unique_definitions,
433 | }
434 |
435 |
436 | def generate_source_code(func: FunctionType) -> str:
437 | """
438 | Generate complete source code for a function with all dependencies.
439 |
440 | Args:
441 | func: The function to generate source code for
442 |
443 | Returns:
444 | Complete Python source code as a string
445 | """
446 |
447 | if func in _function_dependency_map:
448 | info = _function_dependency_map[func]
449 | else:
450 | info = _traverse_and_collect_dependencies(func)
451 | _function_dependency_map[func] = info
452 |
453 | # Build source code
454 | parts = []
455 |
456 | # 1. Add imports
457 | if info["import_statements"]:
458 | parts.append("\n".join(info["import_statements"]))
459 |
460 | # 2. Add definitions
461 | for name, obj in info["definitions"]:
462 | try:
463 | if inspect.isfunction(obj):
464 | source = dedent(getsource(obj))
465 | tree = ast.parse(source)
466 | if tree.body and isinstance(tree.body[0], (ast.FunctionDef, ast.AsyncFunctionDef)):
467 | tree.body[0].decorator_list = []
468 | source = ast.unparse(tree)
469 | parts.append(source)
470 | elif inspect.isclass(obj):
471 | source = dedent(getsource(obj))
472 | tree = ast.parse(source)
473 | if tree.body and isinstance(tree.body[0], ast.ClassDef):
474 | tree.body[0].decorator_list = []
475 | source = ast.unparse(tree)
476 | parts.append(source)
477 | else:
478 | parts.append(f"{name} = {repr(obj)}")
479 | except (OSError, TypeError):
480 | pass
481 |
482 | # 3. Add main function (without decorators)
483 | func_source = dedent(getsource(func))
484 | tree = ast.parse(func_source)
485 | if tree.body and isinstance(tree.body[0], (ast.FunctionDef, ast.AsyncFunctionDef)):
486 | tree.body[0].decorator_list = []
487 | func_source = ast.unparse(tree)
488 | parts.append(func_source)
489 |
490 | return "\n\n".join(parts)
491 |
```
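A minimal usage sketch for the `@sandboxed` decorator above, assuming an already-connected `Computer` instance and a `demo` venv on the remote machine (both illustrative, as is the function body):

```python
import asyncio

from computer.helpers import generate_source_code, sandboxed, set_default_computer


@sandboxed(venv_name="demo")
def add(a: int, b: int) -> int:
    # Executed remotely via computer.venv_exec; the imports, helper
    # functions, and constants it references are collected automatically.
    return a + b


async def main(computer) -> None:
    set_default_computer(computer)
    result = await add(2, 3)  # the wrapper is async even though add() is not
    print(result)  # 5
    # Inspect the exact source that gets shipped to the sandbox:
    print(generate_source_code(add.__wrapped__))
```

Note that `generate_source_code` expects the undecorated function, which `functools.wraps` exposes as `add.__wrapped__`.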
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/moondream3.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Moondream3+ composed-grounded agent loop implementation.
3 | Grounding is handled by a local Moondream3 preview model via Transformers.
4 | Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>".
5 |
6 | Differences from composed_grounded:
7 | - Provides a singleton Moondream3 client outside the class.
8 | - predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates.
9 | - If the last image was a screenshot (or we take one), run model.detect(image, "all ui elements") to get bboxes, then
10 |    run model.query on each cropped bbox to caption it. Overlay labels on the screenshot and emit via _on_screenshot.
11 | - Add a user message listing all detected form UI names so the thinker can reference them.
12 | - If the thinking model doesn't support vision, filter out image content before calling litellm.
13 | """
14 |
15 | from __future__ import annotations
16 |
17 | import base64
18 | import io
19 | import uuid
20 | from typing import Any, Dict, List, Optional, Tuple
21 |
22 | import litellm
23 | from PIL import Image, ImageDraw, ImageFont
24 |
25 | from ..decorators import register_agent
26 | from ..loops.base import AsyncAgentConfig
27 | from ..responses import (
28 | convert_completion_messages_to_responses_items,
29 | convert_computer_calls_desc2xy,
30 | convert_computer_calls_xy2desc,
31 | convert_responses_items_to_completion_messages,
32 | get_all_element_descriptions,
33 | )
34 | from ..types import AgentCapability
35 |
36 | _MOONDREAM_SINGLETON = None
37 |
38 |
39 | def get_moondream_model() -> Any:
40 | """Get a singleton instance of the Moondream3 preview model."""
41 | global _MOONDREAM_SINGLETON
42 | if _MOONDREAM_SINGLETON is None:
43 | try:
44 | import torch
45 | from transformers import AutoModelForCausalLM
46 |
47 | _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
48 | "moondream/moondream3-preview",
49 | trust_remote_code=True,
50 | torch_dtype=torch.bfloat16,
51 | device_map="cuda",
52 | )
53 | except ImportError as e:
54 | raise RuntimeError(
55 | "moondream3 requires torch and transformers. Install with: pip install cua-agent[moondream3]"
56 | ) from e
57 | return _MOONDREAM_SINGLETON
58 |
59 |
60 | def _decode_image_b64(image_b64: str) -> Image.Image:
61 | data = base64.b64decode(image_b64)
62 | return Image.open(io.BytesIO(data)).convert("RGB")
63 |
64 |
65 | def _image_to_b64(img: Image.Image) -> str:
66 | buf = io.BytesIO()
67 | img.save(buf, format="PNG")
68 | return base64.b64encode(buf.getvalue()).decode("utf-8")
69 |
70 |
71 | def _supports_vision(model: str) -> bool:
72 | """Heuristic vision support detection for thinking model."""
73 | m = model.lower()
74 | vision_markers = [
75 | "gpt-4o",
76 | "gpt-4.1",
77 | "o1",
78 | "o3",
79 | "claude-3",
80 | "claude-3.5",
81 | "sonnet",
82 | "haiku",
83 | "opus",
84 | "gemini-1.5",
85 | "llava",
86 | ]
87 | return any(v in m for v in vision_markers)
88 |
89 |
90 | def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
91 | filtered: List[Dict[str, Any]] = []
92 | for msg in messages:
93 | msg_copy = {**msg}
94 | content = msg_copy.get("content")
95 | if isinstance(content, list):
96 | msg_copy["content"] = [c for c in content if c.get("type") != "image_url"]
97 | filtered.append(msg_copy)
98 | return filtered
99 |
100 |
101 | def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
102 | """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
103 |
104 | Args:
105 | base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally.
106 | model_md: Moondream model instance with .detect() and .query() methods.
107 |
108 | Returns:
109 | A tuple of (annotated_image_base64_png, detected_names)
110 | """
111 | # Ensure RGBA for semi-transparent fills
112 | if base_img.mode != "RGBA":
113 | base_img = base_img.convert("RGBA")
114 | W, H = base_img.width, base_img.height
115 |
116 | # Detect objects
117 | try:
118 | detect_result = model_md.detect(base_img, "all ui elements")
119 | objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else []
120 | except Exception:
121 | objects = []
122 |
123 | draw = ImageDraw.Draw(base_img)
124 | try:
125 | font = ImageFont.load_default()
126 | except Exception:
127 | font = None
128 |
129 | detected_names: List[str] = []
130 |
131 | for i, obj in enumerate(objects):
132 | try:
133 | # Clamp normalized coords and crop
134 | x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0))))
135 | y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
136 | x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
137 | y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
138 | left, top, right, bottom = (
139 | int(x_min * W),
140 | int(y_min * H),
141 | int(x_max * W),
142 | int(y_max * H),
143 | )
144 | left, top = max(0, left), max(0, top)
145 | right, bottom = min(W - 1, right), min(H - 1, bottom)
146 | crop = base_img.crop((left, top, right, bottom))
147 |
148 | # Prompted short caption
149 | try:
150 | result = model_md.query(crop, "Caption this UI element in few words.")
151 | caption_text = (result or {}).get("answer", "")
152 | except Exception:
153 | caption_text = ""
154 |
155 | name = (caption_text or "").strip() or f"element_{i+1}"
156 | detected_names.append(name)
157 |
158 | # Draw bbox
159 | draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2)
160 |
161 | # Label background with padding and rounded corners
162 | label = f"{i+1}. {name}"
163 | padding = 3
164 | if font:
165 | text_bbox = draw.textbbox((0, 0), label, font=font)
166 | else:
167 | text_bbox = draw.textbbox((0, 0), label)
168 | text_w = text_bbox[2] - text_bbox[0]
169 | text_h = text_bbox[3] - text_bbox[1]
170 |
171 | tx = left + 3
172 | ty = top - (text_h + 2 * padding + 4)
173 | if ty < 0:
174 | ty = top + 3
175 |
176 | bg_left = tx - padding
177 | bg_top = ty - padding
178 | bg_right = tx + text_w + padding
179 | bg_bottom = ty + text_h + padding
180 | try:
181 | draw.rounded_rectangle(
182 | [bg_left, bg_top, bg_right, bg_bottom],
183 | radius=4,
184 | fill=(0, 0, 0, 160),
185 | outline=(255, 215, 0, 200),
186 | width=1,
187 | )
188 | except Exception:
189 | draw.rectangle(
190 | [bg_left, bg_top, bg_right, bg_bottom],
191 | fill=(0, 0, 0, 160),
192 | outline=(255, 215, 0, 200),
193 | width=1,
194 | )
195 |
196 | text_fill = (255, 255, 255, 255)
197 | if font:
198 | draw.text((tx, ty), label, fill=text_fill, font=font)
199 | else:
200 | draw.text((tx, ty), label, fill=text_fill)
201 | except Exception:
202 | continue
203 |
204 | # Encode PNG base64
205 | annotated = base_img
206 | if annotated.mode not in ("RGBA", "RGB"):
207 | annotated = annotated.convert("RGBA")
208 | annotated_b64 = _image_to_b64(annotated)
209 | return annotated_b64, detected_names
210 |
211 |
212 | GROUNDED_COMPUTER_TOOL_SCHEMA = {
213 | "type": "function",
214 | "function": {
215 | "name": "computer",
216 | "description": (
217 | "Control a computer by taking screenshots and interacting with UI elements. "
218 | "The screenshot action will include a list of detected form UI element names when available. "
219 | "Use element descriptions to locate and interact with UI elements on the screen."
220 | ),
221 | "parameters": {
222 | "type": "object",
223 | "properties": {
224 | "action": {
225 | "type": "string",
226 | "enum": [
227 | "screenshot",
228 | "click",
229 | "double_click",
230 | "drag",
231 | "type",
232 | "keypress",
233 | "scroll",
234 | "move",
235 | "wait",
236 | "get_current_url",
237 | "get_dimensions",
238 | "get_environment",
239 | ],
240 | "description": "The action to perform (required for all actions)",
241 | },
242 | "element_description": {
243 | "type": "string",
244 | "description": "Description of the element to interact with (required for click/double_click/move/scroll)",
245 | },
246 | "start_element_description": {
247 | "type": "string",
248 | "description": "Description of the element to start dragging from (required for drag)",
249 | },
250 | "end_element_description": {
251 | "type": "string",
252 | "description": "Description of the element to drag to (required for drag)",
253 | },
254 | "text": {
255 | "type": "string",
256 | "description": "The text to type (required for type)",
257 | },
258 | "keys": {
259 | "type": "array",
260 | "items": {"type": "string"},
261 | "description": "Key(s) to press (required for keypress)",
262 | },
263 | "button": {
264 | "type": "string",
265 | "enum": ["left", "right", "wheel", "back", "forward"],
266 | "description": "The mouse button to use for click/double_click",
267 | },
268 | "scroll_x": {
269 | "type": "integer",
270 | "description": "Horizontal scroll amount (required for scroll)",
271 | },
272 | "scroll_y": {
273 | "type": "integer",
274 | "description": "Vertical scroll amount (required for scroll)",
275 | },
276 | },
277 | "required": ["action"],
278 | },
279 | },
280 | }
281 |
282 |
283 | @register_agent(r"moondream3\+.*", priority=2)
284 | class Moondream3PlusConfig(AsyncAgentConfig):
285 | def __init__(self):
286 | self.desc2xy: Dict[str, Tuple[float, float]] = {}
287 |
288 | async def predict_step(
289 | self,
290 | messages: List[Dict[str, Any]],
291 | model: str,
292 | tools: Optional[List[Dict[str, Any]]] = None,
293 | max_retries: Optional[int] = None,
294 | stream: bool = False,
295 | computer_handler=None,
296 | use_prompt_caching: Optional[bool] = False,
297 | _on_api_start=None,
298 | _on_api_end=None,
299 | _on_usage=None,
300 | _on_screenshot=None,
301 | **kwargs,
302 | ) -> Dict[str, Any]:
303 | # Parse composed model: moondream3+<thinking_model>
304 | if "+" not in model:
305 | raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}")
306 | _, thinking_model = model.split("+", 1)
307 |
308 | pre_output_items: List[Dict[str, Any]] = []
309 |
310 | # Acquire last screenshot; if missing, take one
311 | last_image_b64: Optional[str] = None
312 | for message in reversed(messages):
313 | if (
314 | isinstance(message, dict)
315 | and message.get("type") == "computer_call_output"
316 | and isinstance(message.get("output"), dict)
317 | and message["output"].get("type") == "input_image"
318 | ):
319 | image_url = message["output"].get("image_url", "")
320 | if image_url.startswith("data:image/png;base64,"):
321 | last_image_b64 = image_url.split(",", 1)[1]
322 | break
323 |
324 | if last_image_b64 is None and computer_handler is not None:
325 | # Take a screenshot
326 | screenshot_b64 = await computer_handler.screenshot() # type: ignore
327 | if screenshot_b64:
328 | call_id = uuid.uuid4().hex
329 | pre_output_items += [
330 | {
331 | "type": "message",
332 | "role": "assistant",
333 | "content": [
334 | {
335 | "type": "output_text",
336 | "text": "Taking a screenshot to analyze the current screen.",
337 | }
338 | ],
339 | },
340 | {
341 | "type": "computer_call",
342 | "call_id": call_id,
343 | "status": "completed",
344 | "action": {"type": "screenshot"},
345 | },
346 | {
347 | "type": "computer_call_output",
348 | "call_id": call_id,
349 | "output": {
350 | "type": "input_image",
351 | "image_url": f"data:image/png;base64,{screenshot_b64}",
352 | },
353 | },
354 | ]
355 | last_image_b64 = screenshot_b64
356 | if _on_screenshot:
357 | await _on_screenshot(screenshot_b64)
358 |
359 | # If we have a last screenshot, run Moondream detection and labeling
360 | detected_names: List[str] = []
361 | if last_image_b64 is not None:
362 | base_img = _decode_image_b64(last_image_b64)
363 | model_md = get_moondream_model()
364 | annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md)
365 | if _on_screenshot:
366 | await _on_screenshot(annotated_b64, "annotated_form_ui")
367 |
368 | # Also push a user message listing all detected names
369 | if detected_names:
370 | names_text = "\n".join(f"- {n}" for n in detected_names)
371 | pre_output_items.append(
372 | {
373 | "type": "message",
374 | "role": "user",
375 | "content": [
376 | {"type": "input_text", "text": "Detected form UI elements on screen:"},
377 | {"type": "input_text", "text": names_text},
378 | {
379 | "type": "input_text",
380 | "text": "Please continue with the next action needed to perform your task.",
381 | },
382 | ],
383 | }
384 | )
385 |
386 | tool_schemas = []
387 | for schema in tools or []:
388 | if schema.get("type") == "computer":
389 | tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
390 | else:
391 | tool_schemas.append(schema)
392 |
393 | # Step 1: Convert computer calls from xy to descriptions
394 | input_messages = messages + pre_output_items
395 | messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
396 |
397 | # Step 2: Convert responses items to completion messages
398 | completion_messages = convert_responses_items_to_completion_messages(
399 | messages_with_descriptions,
400 | allow_images_in_tool_results=False,
401 | )
402 |
403 | # Optionally filter images if model lacks vision
404 | if not _supports_vision(thinking_model):
405 | completion_messages = _filter_images_from_completion_messages(completion_messages)
406 |
407 | # Step 3: Call thinking model with litellm.acompletion
408 | api_kwargs = {
409 | "model": thinking_model,
410 | "messages": completion_messages,
411 | "tools": tool_schemas,
412 | "max_retries": max_retries,
413 | "stream": stream,
414 | **kwargs,
415 | }
416 | if use_prompt_caching:
417 | api_kwargs["use_prompt_caching"] = use_prompt_caching
418 |
419 | if _on_api_start:
420 | await _on_api_start(api_kwargs)
421 |
422 | response = await litellm.acompletion(**api_kwargs)
423 |
424 | if _on_api_end:
425 | await _on_api_end(api_kwargs, response)
426 |
427 | usage = {
428 | **response.usage.model_dump(), # type: ignore
429 | "response_cost": response._hidden_params.get("response_cost", 0.0),
430 | }
431 | if _on_usage:
432 | await _on_usage(usage)
433 |
434 | # Step 4: Convert completion messages back to responses items format
435 | response_dict = response.model_dump() # type: ignore
436 | choice_messages = [choice["message"] for choice in response_dict["choices"]]
437 | thinking_output_items: List[Dict[str, Any]] = []
438 | for choice_message in choice_messages:
439 | thinking_output_items.extend(
440 | convert_completion_messages_to_responses_items([choice_message])
441 | )
442 |
443 | # Step 5: Use Moondream to get coordinates for each description
444 | element_descriptions = get_all_element_descriptions(thinking_output_items)
445 | if element_descriptions and last_image_b64:
446 | for desc in element_descriptions:
447 | for _ in range(3): # try 3 times
448 | coords = await self.predict_click(
449 | model=model,
450 | image_b64=last_image_b64,
451 | instruction=desc,
452 | )
453 | if coords:
454 | self.desc2xy[desc] = coords
455 | break
456 |
457 | # Step 6: Convert computer calls from descriptions back to xy coordinates
458 | final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
459 |
460 | # Step 7: Return output and usage
461 | return {"output": pre_output_items + final_output_items, "usage": usage}
462 |
463 | async def predict_click(
464 | self,
465 | model: str,
466 | image_b64: str,
467 | instruction: str,
468 | **kwargs,
469 | ) -> Optional[Tuple[float, float]]:
470 | """Predict click coordinates using Moondream3's point API.
471 |
472 | Returns pixel coordinates (x, y) as floats.
473 | """
474 | img = _decode_image_b64(image_b64)
475 | W, H = img.width, img.height
476 | model_md = get_moondream_model()
477 | try:
478 | result = model_md.point(img, instruction, settings={"max_objects": 1})
479 | except Exception:
480 | return None
481 |
482 | try:
483 | pt = (result or {}).get("points", [])[0]
484 | x_norm = float(pt.get("x", 0.0))
485 | y_norm = float(pt.get("y", 0.0))
486 | x_px = max(0.0, min(float(W - 1), x_norm * W))
487 | y_px = max(0.0, min(float(H - 1), y_norm * H))
488 | return (x_px, y_px)
489 | except Exception:
490 | return None
491 |
492 | def get_capabilities(self) -> List[AgentCapability]:
493 | return ["click", "step"]
494 |
```
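A minimal grounding-only sketch for this config, assuming a CUDA machine with the `cua-agent[moondream3]` extras installed; `screenshot.png` and the instruction text are illustrative:

```python
import asyncio
import base64

from agent.loops.moondream3 import Moondream3PlusConfig


async def main() -> None:
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    config = Moondream3PlusConfig()
    coords = await config.predict_click(
        model="moondream3+gpt-4o",  # the thinking half is unused by predict_click
        image_b64=image_b64,
        instruction="the Submit button",
    )
    print(coords)  # pixel (x, y) floats, or None if pointing failed


asyncio.run(main())
```

The first call is slow because `get_moondream_model()` downloads and loads the preview weights onto the GPU; later calls reuse the module-level singleton.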