tokens: 47470/50000 8/617 files (page 16/28)
This is page 16 of 28. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── bump-version.yml
│       ├── ci-lume.yml
│       ├── docker-publish-cua-linux.yml
│       ├── docker-publish-cua-windows.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── link-check.yml
│       ├── lint.yml
│       ├── npm-publish-cli.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       ├── python-tests.yml
│       ├── test-cua-models.yml
│       └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│   ├── docs.code-workspace
│   ├── extensions.json
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   ├── py.code-workspace
│   └── settings.json
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── cloud-windows-ga-macos-preview.md
│   ├── composite-agents.md
│   ├── computer-use-agents-for-growth-hacking.md
│   ├── cua-hackathon.md
│   ├── cua-playground-preview.md
│   ├── cua-vlm-router.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cli.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── neurips-2025-cua-papers.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .env.example
│   ├── .gitignore
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── observability.mdx
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── cua-vlm-router.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   ├── telemetry.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── cli-playbook
│   │       │   ├── commands.mdx
│   │       │   ├── index.mdx
│   │       │   └── meta.json
│   │       ├── computer-sdk
│   │       │   ├── cloud-vm-management.mdx
│   │       │   ├── commands.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── meta.json
│   │       │   ├── sandboxed-python.mdx
│   │       │   └── tracing-api.mdx
│   │       ├── example-usecases
│   │       │   ├── form-filling.mdx
│   │       │   ├── gemini-complex-ui-navigation.mdx
│   │       │   ├── meta.json
│   │       │   ├── post-event-contact-export.mdx
│   │       │   └── windows-app-behind-vpn.mdx
│   │       ├── get-started
│   │       │   ├── meta.json
│   │       │   └── quickstart.mdx
│   │       ├── index.mdx
│   │       ├── macos-vm-cli-playbook
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   └── meta.json
│   │       └── meta.json
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── bg-dark.jpg
│   │       ├── bg-light.jpg
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── grounding-with-gemini3.gif
│   │       ├── hero.png
│   │       ├── laminar_trace_example.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   ├── posthog
│   │   │   │   │   └── [...path]
│   │   │   │   │       └── route.ts
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   ├── llms.txt
│   │   │   │   └── route.ts
│   │   │   ├── robots.ts
│   │   │   └── sitemap.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── analytics-tracker.tsx
│   │   │   ├── cookie-consent.tsx
│   │   │   ├── doc-actions-menu.tsx
│   │   │   ├── editable-code-block.tsx
│   │   │   ├── footer.tsx
│   │   │   ├── hero.tsx
│   │   │   ├── iou.tsx
│   │   │   ├── mermaid.tsx
│   │   │   └── page-feedback.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   ├── mdx-components.tsx
│   │   └── providers
│   │       └── posthog-provider.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── browser_tool_example.py
│   ├── cloud_api_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── tracing_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── cua_adapter.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── gelato.py
│   │   │   │   │   ├── gemini.py
│   │   │   │   │   ├── generic_vlm.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   ├── uiins.py
│   │   │   │   │   ├── uitars.py
│   │   │   │   │   └── uitars2.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── tools
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── browser_tool.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_computer_agent.py
│   │   ├── bench-ui
│   │   │   ├── bench_ui
│   │   │   │   ├── __init__.py
│   │   │   │   ├── api.py
│   │   │   │   └── child.py
│   │   │   ├── examples
│   │   │   │   ├── folder_example.py
│   │   │   │   ├── gui
│   │   │   │   │   ├── index.html
│   │   │   │   │   ├── logo.svg
│   │   │   │   │   └── styles.css
│   │   │   │   ├── output_overlay.png
│   │   │   │   └── simple_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       └── test_port_detection.py
│   │   ├── computer
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── types.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── tracing_wrapper.py
│   │   │   │   ├── tracing.py
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       ├── test_computer.py
│   │   │       └── test_helpers.py
│   │   ├── computer-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── browser.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   ├── utils
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── wallpaper.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   ├── test_connection.py
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_server.py
│   │   ├── core
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_telemetry.py
│   │   ├── mcp-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── build-extension.py
│   │   │   ├── CONCURRENT_SESSIONS.md
│   │   │   ├── desktop-extension
│   │   │   │   ├── cua-extension.mcpb
│   │   │   │   ├── desktop_extension.png
│   │   │   │   ├── manifest.json
│   │   │   │   ├── README.md
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── run_server.sh
│   │   │   │   └── setup.py
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── server.py
│   │   │   │   └── session_manager.py
│   │   │   ├── pdm.lock
│   │   │   ├── pyproject.toml
│   │   │   ├── QUICK_TEST_COMMANDS.sh
│   │   │   ├── quick_test_local_option.py
│   │   │   ├── README.md
│   │   │   ├── scripts
│   │   │   │   ├── install_mcp_server.sh
│   │   │   │   └── start_mcp_server.sh
│   │   │   ├── test_mcp_server_local_option.py
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_mcp_server.py
│   │   ├── pylume
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_pylume.py
│   │   └── som
│   │       ├── .bumpversion.cfg
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           ├── conftest.py
│   │           └── test_omniparser.py
│   ├── qemu-docker
│   │   ├── linux
│   │   │   ├── Dockerfile
│   │   │   ├── README.md
│   │   │   └── src
│   │   │       ├── entry.sh
│   │   │       └── vm
│   │   │           ├── image
│   │   │           │   └── README.md
│   │   │           └── setup
│   │   │               ├── install.sh
│   │   │               ├── setup-cua-server.sh
│   │   │               └── setup.sh
│   │   ├── README.md
│   │   └── windows
│   │       ├── Dockerfile
│   │       ├── README.md
│   │       └── src
│   │           ├── entry.sh
│   │           └── vm
│   │               ├── image
│   │               │   └── README.md
│   │               └── setup
│   │                   ├── install.bat
│   │                   ├── on-logon.ps1
│   │                   ├── setup-cua-server.ps1
│   │                   ├── setup-utils.psm1
│   │                   └── setup.ps1
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── cua-cli
│   │   │   ├── .gitignore
│   │   │   ├── .prettierrc
│   │   │   ├── bun.lock
│   │   │   ├── CLAUDE.md
│   │   │   ├── index.ts
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── auth.ts
│   │   │   │   ├── cli.ts
│   │   │   │   ├── commands
│   │   │   │   │   ├── auth.ts
│   │   │   │   │   └── sandbox.ts
│   │   │   │   ├── config.ts
│   │   │   │   ├── http.ts
│   │   │   │   ├── storage.ts
│   │   │   │   └── util.ts
│   │   │   └── tsconfig.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Development.md
│       ├── Dockerfile
│       ├── Dockerfile.dev
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│   ├── install-cli.ps1
│   ├── install-cli.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   ├── run-docker-dev.sh
│   └── typescript-typecheck.js
├── TESTING.md
├── tests
│   ├── agent_loop_testing
│   │   ├── agent_test.py
│   │   └── README.md
│   ├── pytest.ini
│   ├── shell_cmd.py
│   ├── test_files.py
│   ├── test_mcp_server_session_management.py
│   ├── test_mcp_server_streaming.py
│   ├── test_shell_bash.py
│   ├── test_telemetry.py
│   ├── test_tracing.py
│   ├── test_venv.py
│   └── test_watchdog.py
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/docs/content/docs/example-usecases/post-event-contact-export.mdx:
--------------------------------------------------------------------------------

```markdown
---
title: Post-Event Contact Export
description: Run overnight contact extraction from LinkedIn, X, or other social platforms after networking events
---

import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';

## Overview

After networking events, you need to export new connections from LinkedIn, X, or other platforms into your CRM. This automation handles it for you.

**The workflow**: Kick off the script after an event and let it run overnight. Wake up to a clean CSV ready for your CRM or email tool.

This example focuses on LinkedIn but works across platforms. It uses [Cua Computer](/computer-sdk/computers) to interact with web interfaces and [Agent Loops](/agent-sdk/agent-loops) to iterate through connections with conversation history.
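
At its core, the script drives a single agent through many small tasks, appending each task and the agent's output to one growing message list so every step sees prior context. A minimal sketch of that pattern, assuming the `cua-agent` package used in the full script below (the `run_tasks` helper name is illustrative):

```python
from agent import ComputerAgent  # cua-agent package

async def run_tasks(agent: ComputerAgent, tasks: list[str]) -> list:
    """Replay the growing history on each call so every task sees prior context."""
    history = []
    for task in tasks:
        history.append({"role": "user", "content": task})
        async for result in agent.run(history, stream=False):
            history += result.get("output", [])  # carry the agent's output forward
    return history
```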

### Why Cua is Perfect for This

**Cua's VMs save your session data**, bypassing bot detection entirely:

- **Log in once manually** through the VM browser
- **Session persists** - you appear as a regular user, not a bot
- **No captchas** - the platform treats automation like normal browsing
- **No login code** - the script doesn't handle authentication
- **Run overnight** - kick off and forget

Traditional web scraping triggers anti-bot measures immediately. Cua's approach works across all platforms.

### What You Get

The script generates two files with your extracted connections:

**CSV Export** (`linkedin_connections_20250116_143022.csv`):

```csv
first,last,role,company,met_at,linkedin
John,Smith,Software Engineer,Acme Corp,Google Devfest Toronto,https://www.linkedin.com/in/johnsmith
Sarah,Johnson,Product Manager,Tech Inc,Google Devfest Toronto,https://www.linkedin.com/in/sarahjohnson
```

**Messaging Links** (`linkedin_messaging_links_20250116_143022.txt`):

```
LinkedIn Messaging Compose Links
================================================================================

1. https://www.linkedin.com/messaging/compose/?recipient=johnsmith
2. https://www.linkedin.com/messaging/compose/?recipient=sarahjohnson
```
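
Each compose link is derived from the profile URL's public ID (the path segment after `/in/`), so `https://www.linkedin.com/in/johnsmith` becomes `recipient=johnsmith`; the `extract_public_id_from_linkedin_url` helper in the script below does exactly this.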

---

<Steps>

<Step>

### Set Up Your Environment

First, install the required dependencies.

Create a `requirements.txt` file:

```text
cua-agent
cua-computer
python-dotenv>=1.0.0
```

Install the dependencies:

```bash
pip install -r requirements.txt
```

Create a `.env` file with your API keys:

```text
ANTHROPIC_API_KEY=your-anthropic-api-key # optional (bring your own key); by default, this cookbook uses the CUA VLM Router
CUA_API_KEY=sk_cua-api01...
CUA_CONTAINER_NAME=m-linux-...
```
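
Optionally, you can confirm the keys load before kicking off a long run. A quick sanity check using `python-dotenv`, mirroring the validation the script performs later:

```python
# Optional sanity check: confirm .env is picked up before the overnight run.
import os

from dotenv import load_dotenv

load_dotenv()
for key in ("CUA_API_KEY", "CUA_CONTAINER_NAME"):
    print(f"{key} set:", key in os.environ)
```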

Finally, set up your VM. Refer to the [quickstart guide](https://cua.ai/docs/get-started/quickstart) for how to set up the computer environment.
</Step>

<Step>

### Log Into LinkedIn Manually

**Important**: Before running the script, manually log into LinkedIn through your VM:

1. Access your VM through the Cua dashboard
2. Open a browser and navigate to LinkedIn
3. Log in with your credentials (handle any captchas manually)
4. Close the browser but leave the VM running
5. Your session is now saved and ready for automation!

This one-time manual login bypasses bot detection for subsequent runs.

</Step>

<Step>

### Configure and Create Your Script

Create a Python file (e.g., `contact_export.py`). You can customize:

```python
# Where you met these connections (automatically added to CSV)
MET_AT_REASON = "Google Devfest Toronto"

# Number of contacts to extract (in the main loop);
# range(1, 21) covers contacts 1-20: change 21 to extract more or fewer
for contact_num in range(1, 21):
```
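
The extraction prompts below ask the agent to reply in a fixed `KEY: value` format, which `extract_contact_from_response` then parses line by line. An illustrative reply, reusing the sample contact from above:

```text
FIRST: John
LAST: Smith
ROLE: Software Engineer
COMPANY: Acme Corp
LINKEDIN: https://www.linkedin.com/in/johnsmith
```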

Select your environment:

<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox']}>
  <Tab value="Cloud Sandbox">

```python
import asyncio
import csv
import logging
import os
import signal
import sys
import traceback
from datetime import datetime

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration: Define where you met these connections
MET_AT_REASON = "Google Devfest Toronto"

def handle_sigint(sig, frame):
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    sys.exit(0)

def extract_public_id_from_linkedin_url(linkedin_url):
    """Extract the public ID from a LinkedIn profile URL."""
    if not linkedin_url:
        return None

    url = linkedin_url.split('?')[0].rstrip('/')

    if '/in/' in url:
        public_id = url.split('/in/')[-1]
        return public_id

    return None

def extract_contact_from_response(result_output):
    """
    Extract contact information from the agent's response.
    Expects format:
    FIRST: value
    LAST: value
    ROLE: value
    COMPANY: value
    LINKEDIN: value
    """
    contact = {
        'first': '',
        'last': '',
        'role': '',
        'company': '',
        'met_at': MET_AT_REASON,
        'linkedin': ''
    }

    for item in result_output:
        if item.get("type") == "message":
            content = item.get("content", [])
            for content_part in content:
                text = content_part.get("text", "")
                if text:
                    for line in text.split('\n'):
                        line = line.strip()
                        line_upper = line.upper()

                        if line_upper.startswith("FIRST:"):
                            value = line[6:].strip()
                            if value and value.upper() != "N/A":
                                contact['first'] = value
                        elif line_upper.startswith("LAST:"):
                            value = line[5:].strip()
                            if value and value.upper() != "N/A":
                                contact['last'] = value
                        elif line_upper.startswith("ROLE:"):
                            value = line[5:].strip()
                            if value and value.upper() != "N/A":
                                contact['role'] = value
                        elif line_upper.startswith("COMPANY:"):
                            value = line[8:].strip()
                            if value and value.upper() != "N/A":
                                contact['company'] = value
                        elif line_upper.startswith("LINKEDIN:"):
                            value = line[9:].strip()
                            if value and value.upper() != "N/A":
                                contact['linkedin'] = value

    return contact

async def scrape_linkedin_connections():
    """Scrape LinkedIn connections and export them to CSV."""

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"linkedin_connections_{timestamp}.csv"
    csv_path = os.path.join(os.getcwd(), csv_filename)

    # Initialize the CSV file with a header row
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['first', 'last', 'role', 'company', 'met_at', 'linkedin'])
        writer.writeheader()

    print("\n🚀 Starting LinkedIn connections scraper")
    print(f"📁 Output file: {csv_path}")
    print(f"📍 Met at: {MET_AT_REASON}")
    print("=" * 80)

    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.CLOUD,
            name=os.environ["CUA_CONTAINER_NAME"],  # Your sandbox name
            api_key=os.environ["CUA_API_KEY"],
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                model="cua/anthropic/claude-sonnet-4.5",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=True,
                max_trajectory_budget=10.0,
            )

            history = []

            # Task 1: Navigate to the LinkedIn connections page
            navigation_task = (
                "STEP 1 - NAVIGATE TO LINKEDIN CONNECTIONS PAGE:\n"
                "1. Open a web browser (Chrome or Firefox)\n"
                "2. Navigate to https://www.linkedin.com/mynetwork/invite-connect/connections/\n"
                "3. Wait for the page to fully load\n"
                "4. Confirm you can see the list of connections\n"
                "5. Ready to start extracting contacts"
            )

            print("\n[Task 1/21] Navigating to LinkedIn...")
            history.append({"role": "user", "content": navigation_task})

            async for result in agent.run(history, stream=False):
                history += result.get("output", [])

            print("✅ Navigation completed\n")

            # Extract 20 contacts
            contacts_extracted = 0
            linkedin_urls = []
            previous_contact_name = None

            for contact_num in range(1, 21):
                # Build the extraction task
                if contact_num == 1:
                    extraction_task = (
                        f"STEP {contact_num + 1} - EXTRACT CONTACT {contact_num} OF 20:\n"
                        f"1. Click on the first connection's profile\n"
                        f"2. Extract: FIRST, LAST, ROLE, COMPANY, LINKEDIN URL\n"
                        f"3. Return in exact format:\n"
                        f"FIRST: [value]\n"
                        f"LAST: [value]\n"
                        f"ROLE: [value]\n"
                        f"COMPANY: [value]\n"
                        f"LINKEDIN: [value]\n"
                        f"4. Navigate back to connections list"
                    )
                else:
                    extraction_task = (
                        f"STEP {contact_num + 1} - EXTRACT CONTACT {contact_num} OF 20:\n"
                        f"1. Find '{previous_contact_name}' in the list\n"
                        f"2. Click on the contact BELOW them\n"
                        f"3. Extract: FIRST, LAST, ROLE, COMPANY, LINKEDIN URL\n"
                        f"4. Return in exact format:\n"
                        f"FIRST: [value]\n"
                        f"LAST: [value]\n"
                        f"ROLE: [value]\n"
                        f"COMPANY: [value]\n"
                        f"LINKEDIN: [value]\n"
                        f"5. Navigate back"
                    )

                print(f"[Task {contact_num + 1}/21] Extracting contact {contact_num}/20...")
                history.append({"role": "user", "content": extraction_task})

                all_output = []
                async for result in agent.run(history, stream=False):
                    output = result.get("output", [])
                    history += output
                    all_output.extend(output)

                contact_data = extract_contact_from_response(all_output)

                has_name = bool(contact_data['first'] and contact_data['last'])
                has_linkedin = bool(contact_data['linkedin'] and 'linkedin.com' in contact_data['linkedin'])

                if has_name or has_linkedin:
                    with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=['first', 'last', 'role', 'company', 'met_at', 'linkedin'])
                        writer.writerow(contact_data)
                    contacts_extracted += 1

                    if contact_data['linkedin']:
                        linkedin_urls.append(contact_data['linkedin'])

                    if has_name:
                        previous_contact_name = f"{contact_data['first']} {contact_data['last']}".strip()

                    name_str = f"{contact_data['first']} {contact_data['last']}" if has_name else "[No name]"
                    print(f"✅ Contact {contact_num}/20 saved: {name_str}")
                else:
                    print(f"⚠️  Could not extract valid data for contact {contact_num}")

                if contact_num % 5 == 0:
                    print(f"\n📈 Progress: {contacts_extracted}/{contact_num} contacts extracted\n")

            # Create the messaging links file
            messaging_filename = f"linkedin_messaging_links_{timestamp}.txt"
            messaging_path = os.path.join(os.getcwd(), messaging_filename)

            with open(messaging_path, 'w', encoding='utf-8') as txtfile:
                txtfile.write("LinkedIn Messaging Compose Links\n")
                txtfile.write("=" * 80 + "\n\n")

                for i, linkedin_url in enumerate(linkedin_urls, 1):
                    public_id = extract_public_id_from_linkedin_url(linkedin_url)
                    if public_id:
                        messaging_url = f"https://www.linkedin.com/messaging/compose/?recipient={public_id}"
                        txtfile.write(f"{i}. {messaging_url}\n")

            print("\n" + "=" * 80)
            print("🎉 All tasks completed!")
            print(f"📁 CSV file saved to: {csv_path}")
            print(f"📊 Total contacts extracted: {contacts_extracted}/20")
            print(f"💬 Messaging links saved to: {messaging_path}")
            print("=" * 80)

    except Exception as e:
        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        raise

def main():
    try:
        load_dotenv()

        # ANTHROPIC_API_KEY is optional: it is only needed if you bring your
        # own key instead of routing through the CUA VLM Router.

        if "CUA_API_KEY" not in os.environ:
            raise RuntimeError("Please set CUA_API_KEY in .env")

        if "CUA_CONTAINER_NAME" not in os.environ:
            raise RuntimeError("Please set CUA_CONTAINER_NAME in .env")

        signal.signal(signal.SIGINT, handle_sigint)

        asyncio.run(scrape_linkedin_connections())

    except Exception as e:
        print(f"\n❌ Error: {e}")
        traceback.print_exc()

if __name__ == "__main__":
    main()
```

  </Tab>
  <Tab value="Linux on Docker">

```python
# Same code as Cloud Sandbox, but change Computer initialization to:
async with Computer(
    os_type="linux",
    provider_type=VMProviderType.DOCKER,
    image="trycua/cua-xfce:latest",
    verbosity=logging.INFO,
) as computer:
```

And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.

  </Tab>
  <Tab value="macOS Sandbox">

```python
# Same code as Cloud Sandbox, but change Computer initialization to:
async with Computer(
    os_type="macos",
    provider_type=VMProviderType.LUME,
    name="macos-sequoia-cua:latest",
    verbosity=logging.INFO,
) as computer:
```

And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.

  </Tab>
  <Tab value="Windows Sandbox">

```python
# Same code as Cloud Sandbox, but change Computer initialization to:
async with Computer(
    os_type="windows",
    provider_type=VMProviderType.WINDOWS_SANDBOX,
    verbosity=logging.INFO,
) as computer:
```

And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.

  </Tab>
</Tabs>

</Step>

<Step>

### Run Your Script

Execute your contact extraction automation:

```bash
python contact_export.py
```

The agent will:

1. Navigate to your LinkedIn connections page
2. Extract data from 20 contacts (first name, last name, role, company, LinkedIn URL)
3. Save contacts to a timestamped CSV file
4. Generate messaging compose links for easy follow-up

Monitor the output to follow the agent's progress; the script prints a progress update every five contacts.

</Step>

</Steps>

---

## How It Works

This script demonstrates a practical workflow for extracting LinkedIn connection data:

1. **Session Persistence** - Manually log into LinkedIn through the VM once, and the VM saves your session
2. **Navigation** - The script navigates to your connections page using your saved authenticated session
3. **Data Extraction** - For each contact, the agent clicks their profile, extracts data, and navigates back
4. **Python Processing** - Python parses responses, validates data, and writes to CSV incrementally (see the sketch below)
5. **Output Files** - Generates a CSV with contact data and a text file with messaging URLs
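
For reference, the incremental write in step 4 amounts to appending one validated row per iteration, so an interrupted overnight run still leaves a usable CSV. A condensed sketch of what the full script does:

```python
import csv

FIELDS = ["first", "last", "role", "company", "met_at", "linkedin"]

def append_contact(csv_path: str, contact: dict) -> None:
    # The header is written once at startup; each contact is appended
    # as soon as it is extracted.
    with open(csv_path, "a", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDS).writerow(contact)
```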

## Next Steps

- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
- Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/)
- Adapt this script for other platforms (Twitter/X, email extraction, etc.)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help

```

--------------------------------------------------------------------------------
/libs/python/mcp-server/mcp_server/server.py:
--------------------------------------------------------------------------------

```python
import asyncio
import base64
import inspect
import logging
import os
import signal
import sys
import traceback
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union

import anyio

# Configure logging to output to stderr for debug visibility
logging.basicConfig(
    level=logging.DEBUG,  # DEBUG so import/startup problems are visible
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    stream=sys.stderr,
)
logger = logging.getLogger("mcp-server")

# More visible startup message
logger.debug("MCP Server module loading...")

try:
    from mcp.server.fastmcp import Context, FastMCP

    # Use the canonical Image type
    from mcp.server.fastmcp.utilities.types import Image

    logger.debug("Successfully imported FastMCP")
except ImportError as e:
    logger.error(f"Failed to import FastMCP: {e}")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

try:
    from agent import ComputerAgent
    from computer import Computer

    logger.debug("Successfully imported Computer and Agent modules")
except ImportError as e:
    logger.error(f"Failed to import Computer/Agent modules: {e}")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

try:
    from .session_manager import (
        get_session_manager,
        initialize_session_manager,
        shutdown_session_manager,
    )

    logger.debug("Successfully imported session manager")
except ImportError as e:
    logger.error(f"Failed to import session manager: {e}")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)


def get_env_bool(key: str, default: bool = False) -> bool:
    """Get a boolean value from an environment variable."""
    return os.getenv(key, str(default)).lower() in ("true", "1", "yes")


async def _maybe_call_ctx_method(ctx: Context, method_name: str, *args, **kwargs) -> None:
    """Call a context helper if it exists, awaiting the result when necessary."""
    method = getattr(ctx, method_name, None)
    if not callable(method):
        return
    result = method(*args, **kwargs)
    if inspect.isawaitable(result):
        await result


def _normalise_message_content(content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Normalise message content to a list of structured parts."""
    if isinstance(content, list):
        return content
    if content is None:
        return []
    return [{"type": "output_text", "text": str(content)}]


def _extract_text_from_content(content: Union[str, List[Dict[str, Any]]]) -> str:
    """Extract textual content for inclusion in the aggregated result string."""
    if isinstance(content, str):
        return content
    texts: List[str] = []
    for part in content or []:
        if not isinstance(part, dict):
            continue
        if part.get("type") in {"output_text", "text"} and part.get("text"):
            texts.append(str(part["text"]))
    return "\n".join(texts)


def _serialise_tool_content(content: Any) -> str:
    """Convert tool outputs into a string for aggregation."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        texts: List[str] = []
        for part in content:
            if (
                isinstance(part, dict)
                and part.get("type") in {"output_text", "text"}
                and part.get("text")
            ):
                texts.append(str(part["text"]))
        if texts:
            return "\n".join(texts)
    if content is None:
        return ""
    return str(content)

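# Illustrative shapes of the agent output items these helpers normalise,
# inferred from the branches handled in run_cua_task below:
#   {"type": "message", "role": "assistant",
#    "content": [{"type": "output_text", "text": "..."}]}
#   {"type": "computer_call", "call_id": "...",
#    "action": {"type": "click", ...}}
#   {"type": "computer_call_output", "call_id": "...",
#    "output": [{"type": "text", "text": "..."}]}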
117 | 
118 | def serve() -> FastMCP:
119 |     """Create and configure the MCP server."""
120 |     # NOTE: Do not pass model_config here; FastMCP 2.12.x doesn't support it.
121 |     server = FastMCP(name="cua-agent")
122 | 
123 |     @server.tool(structured_output=False)
124 |     async def screenshot_cua(ctx: Context, session_id: Optional[str] = None) -> Any:
125 |         """
126 |         Take a screenshot of the current MacOS VM screen and return the image.
127 | 
128 |         Args:
129 |             session_id: Optional session ID for multi-client support. If not provided, a new session will be created.
130 |         """
131 |         session_manager = get_session_manager()
132 | 
133 |         async with session_manager.get_session(session_id) as session:
134 |             screenshot = await session.computer.interface.screenshot()
135 |             # Returning Image object is fine when structured_output=False
136 |             return Image(format="png", data=screenshot)
137 | 
138 |     @server.tool(structured_output=False)
139 |     async def run_cua_task(ctx: Context, task: str, session_id: Optional[str] = None) -> Any:
140 |         """
141 |         Run a Computer-Use Agent (CUA) task in a MacOS VM and return (combined text, final screenshot).
142 | 
143 |         Args:
144 |             task: The task description for the agent to execute
145 |             session_id: Optional session ID for multi-client support. If not provided, a new session will be created.
146 |         """
147 |         session_manager = get_session_manager()
148 |         task_id = str(uuid.uuid4())
149 | 
150 |         try:
151 |             logger.info(f"Starting CUA task: {task} (task_id: {task_id})")
152 | 
153 |             async with session_manager.get_session(session_id) as session:
154 |                 # Register this task with the session
155 |                 await session_manager.register_task(session.session_id, task_id)
156 | 
157 |                 try:
158 |                     # Get model name
159 |                     model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-sonnet-4-5-20250929")
160 |                     logger.info(f"Using model: {model_name}")
161 | 
162 |                     # Create agent with the new v0.4.x API
163 |                     agent = ComputerAgent(
164 |                         model=model_name,
165 |                         only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
166 |                         verbosity=logging.INFO,
167 |                         tools=[session.computer],
168 |                     )
169 | 
170 |                     messages = [{"role": "user", "content": task}]
171 | 
172 |                     # Collect all results
173 |                     aggregated_messages: List[str] = []
174 |                     async for result in agent.run(messages):
175 |                         logger.info("Agent processing step")
176 |                         ctx.info("Agent processing step")
177 | 
178 |                         outputs = result.get("output", [])
179 |                         for output in outputs:
180 |                             output_type = output.get("type")
181 | 
182 |                             if output_type == "message":
183 |                                 logger.debug("Streaming assistant message: %s", output)
184 |                                 content = _normalise_message_content(output.get("content"))
185 |                                 aggregated_text = _extract_text_from_content(content)
186 |                                 if aggregated_text:
187 |                                     aggregated_messages.append(aggregated_text)
188 |                                 await _maybe_call_ctx_method(
189 |                                     ctx,
190 |                                     "yield_message",
191 |                                     role=output.get("role", "assistant"),
192 |                                     content=content,
193 |                                 )
194 | 
195 |                             elif output_type in {"tool_use", "computer_call", "function_call"}:
196 |                                 logger.debug("Streaming tool call: %s", output)
197 |                                 call_id = output.get("id") or output.get("call_id")
198 |                                 tool_name = output.get("name") or output.get("action", {}).get(
199 |                                     "type"
200 |                                 )
201 |                                 tool_input = (
202 |                                     output.get("input")
203 |                                     or output.get("arguments")
204 |                                     or output.get("action")
205 |                                 )
206 |                                 if call_id:
207 |                                     await _maybe_call_ctx_method(
208 |                                         ctx,
209 |                                         "yield_tool_call",
210 |                                         name=tool_name,
211 |                                         call_id=call_id,
212 |                                         input=tool_input,
213 |                                     )
214 | 
215 |                             elif output_type in {
216 |                                 "tool_result",
217 |                                 "computer_call_output",
218 |                                 "function_call_output",
219 |                             }:
220 |                                 logger.debug("Streaming tool output: %s", output)
221 |                                 call_id = output.get("call_id") or output.get("id")
222 |                                 content = output.get("content") or output.get("output")
223 |                                 aggregated_text = _serialise_tool_content(content)
224 |                                 if aggregated_text:
225 |                                     aggregated_messages.append(aggregated_text)
226 |                                 if call_id:
227 |                                     await _maybe_call_ctx_method(
228 |                                         ctx,
229 |                                         "yield_tool_output",
230 |                                         call_id=call_id,
231 |                                         output=content,
232 |                                         is_error=output.get("status") == "failed"
233 |                                         or output.get("is_error", False),
234 |                                     )
235 | 
236 |                     logger.info("CUA task completed successfully")
237 |                     await ctx.info("CUA task completed successfully")
238 | 
239 |                     screenshot_image = Image(
240 |                         format="png",
241 |                         data=await session.computer.interface.screenshot(),
242 |                     )
243 | 
244 |                     return (
245 |                         "\n".join(aggregated_messages).strip()
246 |                         or "Task completed with no text output.",
247 |                         screenshot_image,
248 |                     )
249 | 
250 |                 finally:
251 |                     # Unregister the task from the session
252 |                     await session_manager.unregister_task(session.session_id, task_id)
253 | 
254 |         except Exception as e:
255 |             error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
256 |             logger.error(error_msg)
257 |             await ctx.error(error_msg)
258 | 
259 |             # Try to get a screenshot from the session if available
260 |             try:
261 |                 if session_id:
262 |                     async with session_manager.get_session(session_id) as session:
263 |                         screenshot = await session.computer.interface.screenshot()
264 |                         return (
265 |                             f"Error during task execution: {str(e)}",
266 |                             Image(format="png", data=screenshot),
267 |                         )
268 |             except Exception:
269 |                 pass
270 | 
271 |             # If we can't get a screenshot, return a placeholder
272 |             return (
273 |                 f"Error during task execution: {str(e)}",
274 |                 Image(format="png", data=b""),
275 |             )
276 | 
277 |     @server.tool(structured_output=False)
278 |     async def run_multi_cua_tasks(
279 |         ctx: Context, tasks: List[str], session_id: Optional[str] = None, concurrent: bool = False
280 |     ) -> Any:
281 |         """
282 |         Run multiple CUA tasks and return a list of (combined text, screenshot).
283 | 
284 |         Args:
285 |             tasks: List of task descriptions to execute
286 |             session_id: Optional session ID for multi-client support. If not provided, a new session will be created.
287 |             concurrent: If True, run tasks concurrently. If False, run sequentially (default).
288 |         """
289 |         total_tasks = len(tasks)
290 |         if total_tasks == 0:
291 |             await ctx.report_progress(1.0)
292 |             return []
293 | 
294 |         session_manager = get_session_manager()
295 | 
296 |         if concurrent and total_tasks > 1:
297 |             # Run tasks concurrently
298 |             logger.info(f"Running {total_tasks} tasks concurrently")
299 |             await ctx.info(f"Running {total_tasks} tasks concurrently")
300 | 
301 |             # Create tasks with progress tracking
302 |             async def run_task_with_progress(
303 |                 task_index: int, task: str
304 |             ) -> Tuple[int, Tuple[str, Image]]:
305 |                 await ctx.report_progress(task_index / total_tasks)
306 |                 result = await run_cua_task(ctx, task, session_id)
307 |                 await ctx.report_progress((task_index + 1) / total_tasks)
308 |                 return task_index, result
309 | 
310 |             # Create all task coroutines
311 |             task_coroutines = [run_task_with_progress(i, task) for i, task in enumerate(tasks)]
312 | 
313 |             # Wait for all tasks to complete
314 |             results_with_indices = await asyncio.gather(*task_coroutines, return_exceptions=True)
315 | 
316 |             # Sort results by original task order and handle exceptions
317 |             results: List[Tuple[str, Image]] = []
318 |             for result in results_with_indices:
319 |                 if isinstance(result, Exception):
320 |                     logger.error(f"Task failed with exception: {result}")
321 |                     await ctx.error(f"Task failed: {str(result)}")
322 |                     results.append((f"Task failed: {str(result)}", Image(format="png", data=b"")))
323 |                 else:
324 |                     _, task_result = result
325 |                     results.append(task_result)
326 | 
327 |             return results
328 |         else:
329 |             # Run tasks sequentially (original behavior)
330 |             logger.info(f"Running {total_tasks} tasks sequentially")
331 |             await ctx.info(f"Running {total_tasks} tasks sequentially")
332 | 
333 |             results: List[Tuple[str, Image]] = []
334 |             for i, task in enumerate(tasks):
335 |                 logger.info(f"Running task {i+1}/{total_tasks}: {task}")
336 |                 await ctx.info(f"Running task {i+1}/{total_tasks}: {task}")
337 | 
338 |                 await ctx.report_progress(i / total_tasks)
339 |                 task_result = await run_cua_task(ctx, task, session_id)
340 |                 results.append(task_result)
341 |                 await ctx.report_progress((i + 1) / total_tasks)
342 | 
343 |             return results
344 | 
345 |     @server.tool(structured_output=False)
346 |     async def get_session_stats(ctx: Context) -> Dict[str, Any]:
347 |         """
348 |         Get statistics about active sessions and resource usage.
349 |         """
350 |         session_manager = get_session_manager()
351 |         return session_manager.get_session_stats()
352 | 
353 |     @server.tool(structured_output=False)
354 |     async def cleanup_session(ctx: Context, session_id: str) -> str:
355 |         """
356 |         Cleanup a specific session and release its resources.
357 | 
358 |         Args:
359 |             session_id: The session ID to cleanup
360 |         """
361 |         session_manager = get_session_manager()
362 |         await session_manager.cleanup_session(session_id)
363 |         return f"Session {session_id} cleanup initiated"
364 | 
365 |     return server
366 | 
367 | 
368 | server = serve()
369 | 
370 | 
371 | async def run_server():
372 |     """Run the MCP server with proper lifecycle management."""
373 |     session_manager = None
374 |     try:
375 |         logger.debug("Starting MCP server...")
376 | 
377 |         # Initialize session manager
378 |         session_manager = await initialize_session_manager()
379 |         logger.info("Session manager initialized")
380 | 
381 |         # Set up signal handlers for graceful shutdown
382 |         def signal_handler(signum, frame):
383 |             logger.info(f"Received signal {signum}, initiating graceful shutdown...")
384 |             # Create a task to shutdown gracefully
385 |             asyncio.create_task(graceful_shutdown())
386 | 
387 |         signal.signal(signal.SIGINT, signal_handler)
388 |         signal.signal(signal.SIGTERM, signal_handler)
389 | 
390 |         # Start the server
391 |         logger.info("Starting FastMCP server...")
392 |         # Use run_stdio_async directly instead of server.run() to avoid nested event loops
393 |         await server.run_stdio_async()
394 | 
395 |     except Exception as e:
396 |         logger.error(f"Error starting server: {e}")
397 |         traceback.print_exc(file=sys.stderr)
398 |         raise
399 |     finally:
400 |         # Ensure cleanup happens
401 |         if session_manager:
402 |             logger.info("Shutting down session manager...")
403 |             await shutdown_session_manager()
404 | 
405 | 
406 | async def graceful_shutdown():
407 |     """Gracefully shutdown the server and all sessions."""
408 |     logger.info("Initiating graceful shutdown...")
409 |     try:
410 |         await shutdown_session_manager()
411 |         logger.info("Graceful shutdown completed")
412 |     except Exception as e:
413 |         logger.error(f"Error during graceful shutdown: {e}")
414 |     finally:
415 |         # Exit the process
416 |         import os
417 | 
418 |         os._exit(0)
419 | 
420 | 
421 | def main():
422 |     """Run the MCP server with proper async lifecycle management."""
423 |     try:
424 |         # Use anyio.run instead of asyncio.run to avoid nested event loop issues
425 |         anyio.run(run_server)
426 |     except KeyboardInterrupt:
427 |         logger.info("Server interrupted by user")
428 |     except Exception as e:
429 |         logger.error(f"Error starting server: {e}")
430 |         traceback.print_exc(file=sys.stderr)
431 |         sys.exit(1)
432 | 
433 | 
434 | if __name__ == "__main__":
435 |     main()
436 | 
```
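
The tools registered above are ordinary MCP tools, so any MCP client can drive them. Below is a minimal sketch using the official `mcp` Python client over stdio; the `command`/`args` used to spawn the server are placeholders for however this package's entry point is installed in your environment, not something this file defines.

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # Placeholder launch command: point this at the installed server entry point.
    params = StdioServerParameters(command="python", args=["-m", "mcp_server.server"])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Invoke the run_cua_task tool defined above by name.
            result = await session.call_tool(
                "run_cua_task", {"task": "Open a browser and search for 'trycua'"}
            )
            for item in result.content:
                if item.type == "text":
                    print(item.text)


asyncio.run(main())
```

Because `run_cua_task` returns a `(text, Image)` tuple, the final screenshot should come back as an image content item alongside the text in `result.content`.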

--------------------------------------------------------------------------------
/libs/lume/src/Commands/Logs.swift:
--------------------------------------------------------------------------------

```swift
  1 | import ArgumentParser
  2 | import Foundation
  3 | 
  4 | struct Logs: ParsableCommand {
  5 |     static let configuration = CommandConfiguration(
  6 |         abstract: "View lume serve logs",
  7 |         subcommands: [Info.self, Error.self, All.self],
  8 |         defaultSubcommand: All.self
  9 |     )
 10 |     
 11 |     // Common functionality for reading log files
 12 |     static func readLogFile(path: String, lines: Int? = nil) -> String {
 13 |         let fileManager = FileManager.default
 14 |         
 15 |         // Check if file exists
 16 |         guard fileManager.fileExists(atPath: path) else {
 17 |             return "Log file not found at \(path)"
 18 |         }
 19 |         
 20 |         do {
 21 |             // Read file content
 22 |             let content = try String(contentsOfFile: path, encoding: .utf8)
 23 |             
 24 |             // If lines parameter is provided, return only the specified number of lines from the end
 25 |             if let lineCount = lines {
 26 |                 let allLines = content.components(separatedBy: .newlines)
 27 |                 let startIndex = max(0, allLines.count - lineCount)
 28 |                 let lastLines = Array(allLines[startIndex...])
 29 |                 return lastLines.joined(separator: "\n")
 30 |             }
 31 |             
 32 |             return content
 33 |         } catch {
 34 |             return "Error reading log file: \(error.localizedDescription)"
 35 |         }
 36 |     }
 37 |     
 38 |     // Method for tailing a log file (following new changes)
 39 |     static func tailLogFile(path: String, initialLines: Int? = 10) {
 40 |         let fileManager = FileManager.default
 41 |         
 42 |         // Check if file exists
 43 |         guard fileManager.fileExists(atPath: path) else {
 44 |             print("Log file not found at \(path)")
 45 |             return
 46 |         }
 47 |         
 48 |         do {
 49 |             // Get initial content with only the specified number of lines from the end
 50 |             var lastPosition: UInt64 = 0
 51 |             let fileHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: path))
 52 |             
 53 |             // First, print the last few lines of the file
 54 |             if let lines = initialLines {
 55 |                 let content = try String(contentsOfFile: path, encoding: .utf8)
 56 |                 let allLines = content.components(separatedBy: .newlines)
 57 |                 let startIndex = max(0, allLines.count - lines)
 58 |                 let lastLines = Array(allLines[startIndex...])
 59 |                 print(lastLines.joined(separator: "\n"))
 60 |             }
 61 |             
 62 |             // Get current file size
 63 |             lastPosition = UInt64(try fileManager.attributesOfItem(atPath: path)[.size] as? UInt64 ?? 0)
 64 |             
 65 |             // Set up for continuous monitoring
 66 |             print("\nTailing log file... Press Ctrl+C to stop")
 67 |             
 68 |             // Monitor file for changes
 69 |             while true {
 70 |                 // Brief pause to reduce CPU usage
 71 |                 Thread.sleep(forTimeInterval: 0.5)
 72 |                 
 73 |                 // Get current size
 74 |                 let currentSize = try fileManager.attributesOfItem(atPath: path)[.size] as? UInt64 ?? 0
 75 |                 
 76 |                 // If file has grown
 77 |                 if currentSize > lastPosition {
 78 |                     // Seek to where we last read
 79 |                     fileHandle.seek(toFileOffset: lastPosition)
 80 |                     
 81 |                     // Read new content
 82 |                     if let newData = try? fileHandle.readToEnd() {
 83 |                         if let newContent = String(data: newData, encoding: .utf8) {
 84 |                             // Print new content without trailing newline
 85 |                             if newContent.hasSuffix("\n") {
 86 |                                 print(newContent, terminator: "")
 87 |                             } else {
 88 |                                 print(newContent)
 89 |                             }
 90 |                         }
 91 |                     }
 92 |                     
 93 |                     // Update position
 94 |                     lastPosition = currentSize
 95 |                 }
 96 |                 
 97 |                 // Handle file rotation (if file became smaller)
 98 |                 else if currentSize < lastPosition {
 99 |                     // File was probably rotated, start from beginning
100 |                     lastPosition = 0
101 |                     fileHandle.seek(toFileOffset: 0)
102 |                     
103 |                     if let newData = try? fileHandle.readToEnd() {
104 |                         if let newContent = String(data: newData, encoding: .utf8) {
105 |                             print(newContent, terminator: "")
106 |                         }
107 |                     }
108 |                     
109 |                     lastPosition = currentSize
110 |                 }
111 |             }
112 |         } catch {
113 |             print("Error tailing log file: \(error.localizedDescription)")
114 |         }
115 |     }
116 |     
117 |     // MARK: - Info Logs Subcommand
118 |     
119 |     struct Info: ParsableCommand {
120 |         static let configuration = CommandConfiguration(
121 |             commandName: "info",
122 |             abstract: "View info logs from the daemon"
123 |         )
124 |         
125 |         @Option(name: .shortAndLong, help: "Number of lines to display from the end of the file")
126 |         var lines: Int?
127 |         
128 |         @Flag(name: .shortAndLong, help: "Follow log file continuously (like tail -f)")
129 |         var follow: Bool = false
130 |         
131 |         func run() throws {
132 |             let logPath = "/tmp/lume_daemon.log"
133 |             
134 |             print("=== Info Logs ===")
135 |             
136 |             if follow {
137 |                 // Use tailing functionality to continuously monitor the log
138 |                 Logs.tailLogFile(path: logPath, initialLines: lines ?? 10)
139 |             } else {
140 |                 // Regular one-time viewing of logs
141 |                 let content = Logs.readLogFile(path: logPath, lines: lines)
142 |                 print(content)
143 |             }
144 |         }
145 |     }
146 |     
147 |     // MARK: - Error Logs Subcommand
148 |     
149 |     struct Error: ParsableCommand {
150 |         static let configuration = CommandConfiguration(
151 |             commandName: "error",
152 |             abstract: "View error logs from the daemon"
153 |         )
154 |         
155 |         @Option(name: .shortAndLong, help: "Number of lines to display from the end of the file")
156 |         var lines: Int?
157 |         
158 |         @Flag(name: .shortAndLong, help: "Follow log file continuously (like tail -f)")
159 |         var follow: Bool = false
160 |         
161 |         func run() throws {
162 |             let logPath = "/tmp/lume_daemon.error.log"
163 |             
164 |             print("=== Error Logs ===")
165 |             
166 |             if follow {
167 |                 // Use tailing functionality to continuously monitor the log
168 |                 Logs.tailLogFile(path: logPath, initialLines: lines ?? 10)
169 |             } else {
170 |                 // Regular one-time viewing of logs
171 |                 let content = Logs.readLogFile(path: logPath, lines: lines)
172 |                 print(content)
173 |             }
174 |         }
175 |     }
176 |     
177 |     // MARK: - All Logs Subcommand
178 |     
179 |     struct All: ParsableCommand {
180 |         static let configuration = CommandConfiguration(
181 |             commandName: "all",
182 |             abstract: "View both info and error logs from the daemon"
183 |         )
184 |         
185 |         @Option(name: .shortAndLong, help: "Number of lines to display from the end of each file")
186 |         var lines: Int?
187 |         
188 |         @Flag(name: .shortAndLong, help: "Follow log files continuously (like tail -f)")
189 |         var follow: Bool = false
190 |         
191 |         // Custom implementation to tail both logs simultaneously
192 |         private func tailBothLogs(infoPath: String, errorPath: String, initialLines: Int? = 10) {
193 |             let fileManager = FileManager.default
194 |             var infoExists = fileManager.fileExists(atPath: infoPath)
195 |             var errorExists = fileManager.fileExists(atPath: errorPath)
196 |             
197 |             if !infoExists && !errorExists {
198 |                 print("Neither info nor error log files found")
199 |                 return
200 |             }
201 |             
202 |             // Print initial content
203 |             print("=== Info Logs ===")
204 |             if infoExists {
205 |                 if let lines = initialLines {
206 |                     let content = (try? String(contentsOfFile: infoPath, encoding: .utf8)) ?? ""
207 |                     let allLines = content.components(separatedBy: .newlines)
208 |                     let startIndex = max(0, allLines.count - lines)
209 |                     let lastLines = Array(allLines[startIndex...])
210 |                     print(lastLines.joined(separator: "\n"))
211 |                 }
212 |             } else {
213 |                 print("Info log file not found")
214 |             }
215 |             
216 |             print("\n=== Error Logs ===")
217 |             if errorExists {
218 |                 if let lines = initialLines {
219 |                     let content = (try? String(contentsOfFile: errorPath, encoding: .utf8)) ?? ""
220 |                     let allLines = content.components(separatedBy: .newlines)
221 |                     let startIndex = max(0, allLines.count - lines)
222 |                     let lastLines = Array(allLines[startIndex...])
223 |                     print(lastLines.joined(separator: "\n"))
224 |                 }
225 |             } else {
226 |                 print("Error log file not found")
227 |             }
228 |             
229 |             print("\nTailing both log files... Press Ctrl+C to stop")
230 |             
231 |             // Initialize file handles and positions
232 |             var infoHandle: FileHandle? = nil
233 |             var errorHandle: FileHandle? = nil
234 |             var infoPosition: UInt64 = 0
235 |             var errorPosition: UInt64 = 0
236 |             
237 |             // Set up file handles
238 |             if infoExists {
239 |                 do {
240 |                     infoHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: infoPath))
241 |                     infoPosition = UInt64(try fileManager.attributesOfItem(atPath: infoPath)[.size] as? UInt64 ?? 0)
242 |                 } catch {
243 |                     print("Error opening info log file: \(error.localizedDescription)")
244 |                 }
245 |             }
246 |             
247 |             if errorExists {
248 |                 do {
249 |                     errorHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: errorPath))
250 |                     errorPosition = UInt64(try fileManager.attributesOfItem(atPath: errorPath)[.size] as? UInt64 ?? 0)
251 |                 } catch {
252 |                     print("Error opening error log file: \(error.localizedDescription)")
253 |                 }
254 |             }
255 |             
256 |             // Monitor both files for changes
257 |             while true {
258 |                 Thread.sleep(forTimeInterval: 0.5)
259 |                 
260 |                 // Check for new content in info log
261 |                 if let handle = infoHandle {
262 |                     do {
263 |                         // Re-check existence in case file was deleted
264 |                         infoExists = fileManager.fileExists(atPath: infoPath)
265 |                         if !infoExists {
266 |                             print("\n[Info log file was removed]")
267 |                             infoHandle = nil
268 |                             continue
269 |                         }
270 |                         
271 |                         let currentSize = try fileManager.attributesOfItem(atPath: infoPath)[.size] as? UInt64 ?? 0
272 |                         
273 |                         if currentSize > infoPosition {
274 |                             handle.seek(toFileOffset: infoPosition)
275 |                             if let newData = try? handle.readToEnd() {
276 |                                 if let newContent = String(data: newData, encoding: .utf8) {
277 |                                     print("\n--- New Info Log Content ---")
278 |                                     if newContent.hasSuffix("\n") {
279 |                                         print(newContent, terminator: "")
280 |                                     } else {
281 |                                         print(newContent)
282 |                                     }
283 |                                 }
284 |                             }
285 |                             infoPosition = currentSize
286 |                         } else if currentSize < infoPosition {
287 |                             // File was rotated
288 |                             print("\n[Info log was rotated]")
289 |                             infoPosition = 0
290 |                             handle.seek(toFileOffset: 0)
291 |                             if let newData = try? handle.readToEnd() {
292 |                                 if let newContent = String(data: newData, encoding: .utf8) {
293 |                                     print("\n--- New Info Log Content ---")
294 |                                     print(newContent, terminator: "")
295 |                                 }
296 |                             }
297 |                             infoPosition = currentSize
298 |                         }
299 |                     } catch {
300 |                         print("\nError reading info log: \(error.localizedDescription)")
301 |                     }
302 |                 } else if fileManager.fileExists(atPath: infoPath) && !infoExists {
303 |                     // File exists again after being deleted
304 |                     do {
305 |                         infoHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: infoPath))
306 |                         infoPosition = 0
307 |                         infoExists = true
308 |                         print("\n[Info log file reappeared]")
309 |                     } catch {
310 |                         print("\nError reopening info log: \(error.localizedDescription)")
311 |                     }
312 |                 }
313 |                 
314 |                 // Check for new content in error log
315 |                 if let handle = errorHandle {
316 |                     do {
317 |                         // Re-check existence in case file was deleted
318 |                         errorExists = fileManager.fileExists(atPath: errorPath)
319 |                         if !errorExists {
320 |                             print("\n[Error log file was removed]")
321 |                             errorHandle = nil
322 |                             continue
323 |                         }
324 |                         
325 |                         let currentSize = try fileManager.attributesOfItem(atPath: errorPath)[.size] as? UInt64 ?? 0
326 |                         
327 |                         if currentSize > errorPosition {
328 |                             handle.seek(toFileOffset: errorPosition)
329 |                             if let newData = try? handle.readToEnd() {
330 |                                 if let newContent = String(data: newData, encoding: .utf8) {
331 |                                     print("\n--- New Error Log Content ---")
332 |                                     if newContent.hasSuffix("\n") {
333 |                                         print(newContent, terminator: "")
334 |                                     } else {
335 |                                         print(newContent)
336 |                                     }
337 |                                 }
338 |                             }
339 |                             errorPosition = currentSize
340 |                         } else if currentSize < errorPosition {
341 |                             // File was rotated
342 |                             print("\n[Error log was rotated]")
343 |                             errorPosition = 0
344 |                             handle.seek(toFileOffset: 0)
345 |                             if let newData = try? handle.readToEnd() {
346 |                                 if let newContent = String(data: newData, encoding: .utf8) {
347 |                                     print("\n--- New Error Log Content ---")
348 |                                     print(newContent, terminator: "")
349 |                                 }
350 |                             }
351 |                             errorPosition = currentSize
352 |                         }
353 |                     } catch {
354 |                         print("\nError reading error log: \(error.localizedDescription)")
355 |                     }
356 |                 } else if fileManager.fileExists(atPath: errorPath) && !errorExists {
357 |                     // File exists again after being deleted
358 |                     do {
359 |                         errorHandle = try FileHandle(forReadingFrom: URL(fileURLWithPath: errorPath))
360 |                         errorPosition = 0
361 |                         errorExists = true
362 |                         print("\n[Error log file reappeared]")
363 |                     } catch {
364 |                         print("\nError reopening error log: \(error.localizedDescription)")
365 |                     }
366 |                 }
367 |             }
368 |         }
369 |         
370 |         func run() throws {
371 |             let infoLogPath = "/tmp/lume_daemon.log"
372 |             let errorLogPath = "/tmp/lume_daemon.error.log"
373 |             
374 |             if follow {
375 |                 // Use custom tailing implementation for both logs
376 |                 tailBothLogs(infoPath: infoLogPath, errorPath: errorLogPath, initialLines: lines ?? 10)
377 |             } else {
378 |                 // Regular one-time viewing of logs
379 |                 let infoContent = Logs.readLogFile(path: infoLogPath, lines: lines)
380 |                 let errorContent = Logs.readLogFile(path: errorLogPath, lines: lines)
381 |                 
382 |                 print("=== Info Logs ===")
383 |                 print(infoContent)
384 |                 print("\n=== Error Logs ===")
385 |                 print(errorContent)
386 |             }
387 |         }
388 |     }
389 | }
390 | 
```
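
`Logs.swift` implements `--follow` by polling the file size, reading only the appended bytes, and resetting to offset zero when the file shrinks (the usual sign of rotation). Here is a minimal Python sketch of the same strategy for reference; the 0.5 s interval mirrors the Swift code:

```python
import os
import time


def tail_file(path: str, interval: float = 0.5) -> None:
    """Print bytes appended to `path`, polling like Logs.swift does.

    Tracks the last read offset; if the file shrinks (likely rotated),
    re-reads from the beginning. Stop with Ctrl+C.
    """
    position = os.path.getsize(path)
    while True:
        time.sleep(interval)
        size = os.path.getsize(path)
        if size < position:
            position = 0  # file was truncated or rotated; re-read from the start
        if size > position:
            with open(path, "rb") as f:
                f.seek(position)
                print(f.read().decode("utf-8", errors="replace"), end="")
            position = size


if __name__ == "__main__":
    tail_file("/tmp/lume_daemon.log")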

--------------------------------------------------------------------------------
/examples/som_examples.py:
--------------------------------------------------------------------------------

```python
  1 | #!/usr/bin/env python3
  2 | """
  3 | Example script demonstrating the usage of OmniParser's UI element detection functionality.
  4 | This script shows how to:
  5 | 1. Initialize the OmniParser
  6 | 2. Load and process images
  7 | 3. Visualize detection results
  8 | 4. Run detection benchmarks and threshold-tuning experiments
  9 | """
 10 | 
 11 | import argparse
 12 | import base64
 13 | import glob
 14 | import io
 15 | import logging
 16 | import os
 17 | import sys
 18 | import time
 19 | from pathlib import Path
 20 | from typing import Any, Dict, List, Optional
 21 | 
 22 | import numpy as np
 23 | from PIL import Image
 24 | 
 25 | # Load environment variables from .env file
 26 | project_root = Path(__file__).parent.parent
 27 | env_file = project_root / ".env"
 28 | print(f"Loading environment from: {env_file}")
 29 | from dotenv import load_dotenv
 30 | 
 31 | load_dotenv(env_file)
 32 | 
 33 | # Add paths to sys.path if needed
 34 | pythonpath = os.environ.get("PYTHONPATH", "")
 35 | for path in pythonpath.split(":"):
 36 |     if path and path not in sys.path:
 37 |         sys.path.append(path)
 38 |         print(f"Added to sys.path: {path}")
 39 | 
 40 | # Add the libs directory to the path to find som
 41 | libs_path = project_root / "libs"
 42 | if str(libs_path) not in sys.path:
 43 |     sys.path.append(str(libs_path))
 44 |     print(f"Added to sys.path: {libs_path}")
 45 | 
 46 | from som import IconElement, OmniParser, ParseResult, TextElement
 47 | from som.models import BoundingBox, ParserMetadata, UIElement
 48 | 
 49 | # Configure logging
 50 | logging.basicConfig(
 51 |     level=logging.INFO,
 52 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
 53 |     datefmt="%Y-%m-%d %H:%M:%S",
 54 | )
 55 | logger = logging.getLogger(__name__)
 56 | 
 57 | 
 58 | def setup_logging():
 59 |     """Configure logging with a nice format."""
 60 |     logging.basicConfig(
 61 |         level=logging.INFO,
 62 |         format="%(asctime)s - %(levelname)s - %(message)s",
 63 |         datefmt="%Y-%m-%d %H:%M:%S",
 64 |     )
 65 | 
 66 | 
 67 | class Timer:
 68 |     """Enhanced context manager for timing code blocks."""
 69 | 
 70 |     def __init__(self, name: str, logger):
 71 |         self.name = name
 72 |         self.logger = logger
 73 |         self.start_time: float = 0.0
 74 |         self.elapsed_time: float = 0.0
 75 | 
 76 |     def __enter__(self):
 77 |         self.start_time = time.time()
 78 |         return self
 79 | 
 80 |     def __exit__(self, *args):
 81 |         self.elapsed_time = time.time() - self.start_time
 82 |         self.logger.info(f"{self.name}: {self.elapsed_time:.3f}s")
 83 |         return False
 84 | 
 85 | 
 86 | def image_to_bytes(image: Image.Image) -> bytes:
 87 |     """Convert PIL Image to PNG bytes."""
 88 |     buf = io.BytesIO()
 89 |     image.save(buf, format="PNG")
 90 |     return buf.getvalue()
 91 | 
 92 | 
 93 | def process_image(
 94 |     parser: OmniParser, image_path: str, output_dir: Path, use_ocr: bool = False
 95 | ) -> None:
 96 |     """Process a single image and save the result."""
 97 |     try:
 98 |         # Load image
 99 |         logger.info(f"Processing image: {image_path}")
100 |         image = Image.open(image_path).convert("RGB")
101 |         logger.info(f"Image loaded successfully, size: {image.size}")
102 | 
103 |         # Create output filename
104 |         input_filename = Path(image_path).stem
105 |         output_path = output_dir / f"{input_filename}_analyzed.png"
106 | 
107 |         # Convert image to PNG bytes
108 |         image_bytes = image_to_bytes(image)
109 | 
110 |         # Process image
111 |         with Timer(f"Processing {input_filename}", logger):
112 |             result = parser.parse(image_bytes, use_ocr=use_ocr)
113 |             logger.info(
114 |                 f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements"
115 |             )
116 | 
117 |             # Save the annotated image
118 |             logger.info(f"Saving annotated image to: {output_path}")
119 |             try:
120 |                 # Save image from base64
121 |                 img_data = base64.b64decode(result.annotated_image_base64)
122 |                 img = Image.open(io.BytesIO(img_data))
123 |                 img.save(output_path)
124 | 
125 |                 # Print detailed results
126 |                 logger.info("\nDetected Elements:")
127 |                 for elem in result.elements:
128 |                     if isinstance(elem, IconElement):
129 |                         logger.info(
130 |                             f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
131 |                         )
132 |                     elif isinstance(elem, TextElement):
133 |                         logger.info(
134 |                             f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
135 |                         )
136 | 
137 |                 # Verify file exists and log size
138 |                 if output_path.exists():
139 |                     logger.info(
140 |                         f"Successfully saved image. File size: {output_path.stat().st_size} bytes"
141 |                     )
142 |                 else:
143 |                     logger.error(f"Failed to verify file at {output_path}")
144 |             except Exception as e:
145 |                 logger.error(f"Error saving image: {str(e)}", exc_info=True)
146 | 
147 |     except Exception as e:
148 |         logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True)
149 | 
150 | 
151 | def run_detection_benchmark(
152 |     input_path: str,
153 |     output_dir: Path,
154 |     use_ocr: bool = False,
155 |     box_threshold: float = 0.01,
156 |     iou_threshold: float = 0.1,
157 | ):
158 |     """Run detection benchmark on images."""
159 |     logger.info(
160 |         f"Starting benchmark with OCR enabled: {use_ocr}, box_threshold: {box_threshold}, iou_threshold: {iou_threshold}"
161 |     )
162 | 
163 |     try:
164 |         # Initialize parser
165 |         logger.info("Initializing OmniParser...")
166 |         parser = OmniParser()
167 | 
168 |         # Create output directory
169 |         output_dir.mkdir(parents=True, exist_ok=True)
170 |         logger.info(f"Output directory created at: {output_dir}")
171 | 
172 |         # Get list of PNG files
173 |         if os.path.isdir(input_path):
174 |             image_files = glob.glob(os.path.join(input_path, "*.png"))
175 |         else:
176 |             image_files = [input_path]
177 | 
178 |         logger.info(f"Found {len(image_files)} images to process")
179 | 
180 |         # Process each image with specified thresholds
181 |         for image_path in image_files:
182 |             try:
183 |                 # Load image
184 |                 logger.info(f"Processing image: {image_path}")
185 |                 image = Image.open(image_path).convert("RGB")
186 |                 logger.info(f"Image loaded successfully, size: {image.size}")
187 | 
188 |                 # Create output filename
189 |                 input_filename = Path(image_path).stem
190 |                 output_path = output_dir / f"{input_filename}_analyzed.png"
191 | 
192 |                 # Convert image to PNG bytes
193 |                 image_bytes = image_to_bytes(image)
194 | 
195 |                 # Process image with specified thresholds
196 |                 with Timer(f"Processing {input_filename}", logger):
197 |                     result = parser.parse(
198 |                         image_bytes,
199 |                         use_ocr=use_ocr,
200 |                         box_threshold=box_threshold,
201 |                         iou_threshold=iou_threshold,
202 |                     )
203 |                     logger.info(
204 |                         f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements"
205 |                     )
206 | 
207 |                     # Save the annotated image
208 |                     logger.info(f"Saving annotated image to: {output_path}")
209 |                     try:
210 |                         # Save image from base64
211 |                         img_data = base64.b64decode(result.annotated_image_base64)
212 |                         img = Image.open(io.BytesIO(img_data))
213 |                         img.save(output_path)
214 | 
215 |                         # Print detailed results
216 |                         logger.info("\nDetected Elements:")
217 |                         for elem in result.elements:
218 |                             if isinstance(elem, IconElement):
219 |                                 logger.info(
220 |                                     f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
221 |                                 )
222 |                             elif isinstance(elem, TextElement):
223 |                                 logger.info(
224 |                                     f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
225 |                                 )
226 | 
227 |                         # Verify file exists and log size
228 |                         if output_path.exists():
229 |                             logger.info(
230 |                                 f"Successfully saved image. File size: {output_path.stat().st_size} bytes"
231 |                             )
232 |                         else:
233 |                             logger.error(f"Failed to verify file at {output_path}")
234 |                     except Exception as e:
235 |                         logger.error(f"Error saving image: {str(e)}", exc_info=True)
236 | 
237 |             except Exception as e:
238 |                 logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True)
239 | 
240 |     except Exception as e:
241 |         logger.error(f"Benchmark failed: {str(e)}", exc_info=True)
242 |         raise
243 | 
244 | 
245 | def run_experiments(input_path: str, output_dir: Path, use_ocr: bool = False):
246 |     """Run experiments with different threshold combinations."""
247 |     # Define threshold values to test
248 |     box_thresholds = [0.01, 0.05, 0.1, 0.3]
249 |     iou_thresholds = [0.05, 0.1, 0.2, 0.5]
250 | 
251 |     logger.info("Starting threshold experiments...")
252 |     logger.info("Box thresholds to test: %s", box_thresholds)
253 |     logger.info("IOU thresholds to test: %s", iou_thresholds)
254 | 
255 |     # Create results directory for this experiment
256 |     timestamp = time.strftime("%Y%m%d-%H%M%S")
257 |     ocr_suffix = "_ocr" if use_ocr else "_no_ocr"
258 |     exp_dir = output_dir / f"experiment_{timestamp}{ocr_suffix}"
259 |     exp_dir.mkdir(parents=True, exist_ok=True)
260 | 
261 |     # Create a summary file
262 |     summary_file = exp_dir / "results_summary.txt"
263 |     with open(summary_file, "w") as f:
264 |         f.write("Threshold Experiments Results\n")
265 |         f.write("==========================\n\n")
266 |         f.write(f"Input: {input_path}\n")
267 |         f.write(f"OCR Enabled: {use_ocr}\n")
268 |         f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
269 |         f.write("Results:\n")
270 |         f.write("-" * 80 + "\n")
271 |         f.write(
272 |             f"{'Box Thresh':^10} | {'IOU Thresh':^10} | {'Num Icons':^10} | {'Num Text':^10} | {'Time (s)':^10}\n"
273 |         )
274 |         f.write("-" * 80 + "\n")
275 | 
276 |         # Initialize parser once for all experiments
277 |         parser = OmniParser()
278 | 
279 |         # Run experiments with each combination
280 |         for box_thresh in box_thresholds:
281 |             for iou_thresh in iou_thresholds:
282 |                 logger.info(f"\nTesting box_threshold={box_thresh}, iou_threshold={iou_thresh}")
283 | 
284 |                 # Create directory for this combination
285 |                 combo_dir = exp_dir / f"box_{box_thresh}_iou_{iou_thresh}"
286 |                 combo_dir.mkdir(exist_ok=True)
287 | 
288 |                 try:
289 |                     # Process each image
290 |                     if os.path.isdir(input_path):
291 |                         image_files = glob.glob(os.path.join(input_path, "*.png"))
292 |                     else:
293 |                         image_files = [input_path]
294 | 
295 |                     total_icons = 0
296 |                     total_text = 0
297 |                     total_time = 0
298 | 
299 |                     for image_path in image_files:
300 |                         # Load and process image
301 |                         image = Image.open(image_path).convert("RGB")
302 |                         image_bytes = image_to_bytes(image)
303 | 
304 |                         # Process with current thresholds
305 |                         with Timer(f"Processing {Path(image_path).stem}", logger) as t:
306 |                             result = parser.parse(
307 |                                 image_bytes,
308 |                                 use_ocr=use_ocr,
309 |                                 box_threshold=box_thresh,
310 |                                 iou_threshold=iou_thresh,
311 |                             )
312 | 
313 |                             # Save annotated image
314 |                             output_path = combo_dir / f"{Path(image_path).stem}_analyzed.png"
315 |                             img_data = base64.b64decode(result.annotated_image_base64)
316 |                             img = Image.open(io.BytesIO(img_data))
317 |                             img.save(output_path)
318 | 
319 |                             # Update totals
320 |                             total_icons += result.metadata.num_icons
321 |                             total_text += result.metadata.num_text
322 | 
323 |                             # Log detailed results
324 |                             detail_file = combo_dir / f"{Path(image_path).stem}_details.txt"
325 |                             with open(detail_file, "w") as detail_f:
326 |                                 detail_f.write(f"Results for {Path(image_path).name}\n")
327 |                                 detail_f.write("-" * 40 + "\n")
328 |                                 detail_f.write(f"Number of icons: {result.metadata.num_icons}\n")
329 |                                 detail_f.write(
330 |                                     f"Number of text elements: {result.metadata.num_text}\n\n"
331 |                                 )
332 | 
333 |                                 detail_f.write("Icon Detections:\n")
334 |                                 icon_count = 1
335 |                                 text_count = (
336 |                                     result.metadata.num_icons + 1
337 |                                 )  # Text boxes start after icons
338 | 
339 |                                 # First list all icons
340 |                                 for elem in result.elements:
341 |                                     if isinstance(elem, IconElement):
342 |                                         detail_f.write(f"Box #{icon_count}: Icon\n")
343 |                                         detail_f.write(f"  - Confidence: {elem.confidence:.3f}\n")
344 |                                         detail_f.write(
345 |                                             f"  - Coordinates: {elem.bbox.coordinates}\n"
346 |                                         )
347 |                                         icon_count += 1
348 | 
349 |                                 if use_ocr:
350 |                                     detail_f.write("\nText Detections:\n")
351 |                                     for elem in result.elements:
352 |                                         if isinstance(elem, TextElement):
353 |                                             detail_f.write(f"Box #{text_count}: Text\n")
354 |                                             detail_f.write(f"  - Content: '{elem.content}'\n")
355 |                                             detail_f.write(
356 |                                                 f"  - Confidence: {elem.confidence:.3f}\n"
357 |                                             )
358 |                                             detail_f.write(
359 |                                                 f"  - Coordinates: {elem.bbox.coordinates}\n"
360 |                                             )
361 |                                             text_count += 1
362 | 
363 |                         # Update timing totals
364 |                         total_time += t.elapsed_time
365 | 
366 |                     # Write summary for this combination
367 |                     avg_time = total_time / len(image_files)
368 |                     f.write(
369 |                         f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {total_icons:^10d} | {total_text:^10d} | {avg_time:^10.3f}\n"
370 |                     )
371 | 
372 |                 except Exception as e:
373 |                     logger.error(
374 |                         f"Error in experiment box={box_thresh}, iou={iou_thresh}: {str(e)}"
375 |                     )
376 |                     f.write(
377 |                         f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {'ERROR':^10s} | {'ERROR':^10s} | {'ERROR':^10s}\n"
378 |                     )
379 | 
380 |         # Write summary footer
381 |         f.write("-" * 80 + "\n")
382 |         f.write("\nExperiment completed successfully!\n")
383 | 
384 |     logger.info(f"\nExperiment results saved to {exp_dir}")
385 |     logger.info(f"Summary file: {summary_file}")
386 | 
387 | 
388 | def main():
389 |     """Main entry point."""
390 |     parser = argparse.ArgumentParser(description="Run OmniParser benchmark")
391 |     parser.add_argument("input_path", help="Path to input image or directory containing images")
392 |     parser.add_argument(
393 |         "--output-dir", default="examples/output", help="Output directory for annotated images"
394 |     )
395 |     parser.add_argument(
396 |         "--ocr",
397 |         choices=["none", "easyocr"],
398 |         default="none",
399 |         help="OCR engine to use (default: none)",
400 |     )
401 |     parser.add_argument(
402 |         "--mode",
403 |         choices=["single", "experiment"],
404 |         default="single",
405 |         help="Run mode: single run or threshold experiments (default: single)",
406 |     )
407 |     parser.add_argument(
408 |         "--box-threshold",
409 |         type=float,
410 |         default=0.01,
411 |         help="Confidence threshold for detection (default: 0.01)",
412 |     )
413 |     parser.add_argument(
414 |         "--iou-threshold",
415 |         type=float,
416 |         default=0.1,
417 |         help="IOU threshold for Non-Maximum Suppression (default: 0.1)",
418 |     )
419 |     args = parser.parse_args()
420 | 
421 |     logger.info(f"Starting OmniParser with arguments: {args}")
422 |     use_ocr = args.ocr != "none"
423 |     output_dir = Path(args.output_dir)
424 | 
425 |     try:
426 |         if args.mode == "experiment":
427 |             run_experiments(args.input_path, output_dir, use_ocr)
428 |         else:
429 |             run_detection_benchmark(
430 |                 args.input_path, output_dir, use_ocr, args.box_threshold, args.iou_threshold
431 |             )
432 |     except Exception as e:
433 |         logger.error(f"Process failed: {str(e)}", exc_info=True)
434 |         return 1
435 | 
436 |     return 0
437 | 
438 | 
439 | if __name__ == "__main__":
440 |     sys.exit(main())
441 | 
```
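
Outside this CLI wrapper, the core flow the script exercises is small: encode a PIL image to PNG bytes, call `OmniParser.parse`, and decode the base64 annotated screenshot it returns. A condensed sketch (the file paths here are hypothetical):

```python
import base64
import io

from PIL import Image

from som import OmniParser

parser = OmniParser()
image = Image.open("screenshot.png").convert("RGB")  # hypothetical input path

# Encode the image to PNG bytes, as the script's image_to_bytes() does.
buf = io.BytesIO()
image.save(buf, format="PNG")

result = parser.parse(buf.getvalue(), use_ocr=False, box_threshold=0.01, iou_threshold=0.1)
print(f"{result.metadata.num_icons} icons, {result.metadata.num_text} text elements")

# The annotated screenshot comes back base64-encoded.
annotated = Image.open(io.BytesIO(base64.b64decode(result.annotated_image_base64)))
annotated.save("screenshot_analyzed.png")
```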

--------------------------------------------------------------------------------
/libs/python/som/som/detect.py:
--------------------------------------------------------------------------------

```python
  1 | import argparse
  2 | import base64
  3 | import io
  4 | import logging
  5 | import signal
  6 | import time
  7 | from contextlib import contextmanager
  8 | from pathlib import Path
  9 | from typing import Any, Dict, List, Optional, Tuple, Union, cast
 10 | 
 11 | import cv2
 12 | import numpy as np
 13 | import supervision as sv
 14 | import torch
 15 | import torchvision.ops
 16 | import torchvision.transforms as T
 17 | from huggingface_hub import hf_hub_download
 18 | from PIL import Image
 19 | from supervision.detection.core import Detections
 20 | from ultralytics import YOLO
 21 | 
 22 | from .detection import DetectionProcessor
 23 | from .models import (
 24 |     BoundingBox,
 25 |     IconElement,
 26 |     ParseResult,
 27 |     ParserMetadata,
 28 |     TextElement,
 29 |     UIElement,
 30 | )
 31 | from .ocr import OCRProcessor
 32 | from .visualization import BoxAnnotator
 33 | 
 34 | logger = logging.getLogger(__name__)
 35 | 
 36 | 
 37 | class TimeoutException(Exception):
 38 |     pass
 39 | 
 40 | 
 41 | @contextmanager
 42 | def timeout(seconds: int):
 43 |     def timeout_handler(signum, frame):
 44 |         raise TimeoutException("OCR process timed out")
 45 | 
 46 |     # Register the signal handler (SIGALRM is Unix-only and must be set from the main thread)
 47 |     original_handler = signal.signal(signal.SIGALRM, timeout_handler)
 48 |     signal.alarm(seconds)
 49 | 
 50 |     try:
 51 |         yield
 52 |     finally:
 53 |         signal.alarm(0)
 54 |         signal.signal(signal.SIGALRM, original_handler)
 55 | 
 56 | 
 57 | def process_text_box(box, image):
 58 |     """Process a single text box with OCR."""
 59 |     try:
 60 |         from typing import Any, List, Sequence, Tuple
 61 | 
 62 |         import easyocr
 63 | 
 64 |         x1 = int(min(point[0] for point in box))
 65 |         y1 = int(min(point[1] for point in box))
 66 |         x2 = int(max(point[0] for point in box))
 67 |         y2 = int(max(point[1] for point in box))
 68 | 
 69 |         # Add padding
 70 |         pad = 2
 71 |         x1 = max(0, x1 - pad)
 72 |         y1 = max(0, y1 - pad)
 73 |         x2 = min(image.shape[1], x2 + pad)
 74 |         y2 = min(image.shape[0], y2 + pad)
 75 | 
 76 |         region = image[y1:y2, x1:x2]
 77 |         if region.size > 0:
 78 |             reader = easyocr.Reader(["en"])
 79 |             results = reader.readtext(region)
 80 |             if results and len(results) > 0:
 81 |                 # EasyOCR returns a list of tuples (bbox, text, confidence)
 82 |                 first_result = results[0]
 83 |                 if isinstance(first_result, (list, tuple)) and len(first_result) >= 3:
 84 |                     text = str(first_result[1])
 85 |                     confidence = float(first_result[2])
 86 |                     if confidence > 0.5:
 87 |                         return text, [x1, y1, x2, y2], confidence
 88 |     except Exception:
 89 |         pass  # OCR failure on a single box is non-fatal; skip it
 90 |     return None
 91 | 
 92 | 
 93 | def check_ocr_box(image_path: Union[str, Path]) -> Tuple[List[str], List[List[float]]]:
 94 |     """Check OCR box using EasyOCR."""
 95 |     # Read image once
 96 |     if isinstance(image_path, str):
 97 |         image_path = Path(image_path)
 98 | 
 99 |     # Read image into memory
100 |     image_cv = cv2.imread(str(image_path))
101 |     if image_cv is None:
102 |         logger.error(f"Failed to read image: {image_path}")
103 |         return [], []
104 | 
105 |     # Get image dimensions
106 |     img_height, img_width = image_cv.shape[:2]
107 |     confidence_threshold = 0.5
108 | 
109 |     # Use EasyOCR
110 |     import ssl
111 | 
112 |     import easyocr
113 | 
114 |     # Create unverified SSL context for development
115 |     ssl._create_default_https_context = ssl._create_unverified_context
116 |     try:
117 |         reader = easyocr.Reader(["en"])
118 |         with timeout(5):  # 5 second timeout for EasyOCR
119 |             results = reader.readtext(image_cv, paragraph=False, text_threshold=0.5)
120 |     except TimeoutException:
121 |         logger.warning("EasyOCR timed out, returning no results")
122 |         return [], []
123 |     except Exception as e:
124 |         logger.warning(f"EasyOCR failed: {str(e)}")
125 |         return [], []
126 |     finally:
127 |         # Restore default SSL context
128 |         ssl._create_default_https_context = ssl.create_default_context
129 | 
130 |     texts = []
131 |     boxes = []
132 | 
133 |     for box, text, conf in results:
134 |         # Convert box format to [x1, y1, x2, y2]
135 |         x1 = min(point[0] for point in box)
136 |         y1 = min(point[1] for point in box)
137 |         x2 = max(point[0] for point in box)
138 |         y2 = max(point[1] for point in box)
139 | 
140 |         if float(conf) > confidence_threshold:  # Only keep higher-confidence detections
141 |             texts.append(text)
142 |             boxes.append([x1, y1, x2, y2])
143 | 
144 |     return texts, boxes
145 | 
146 | 
147 | class OmniParser:
148 |     """Enhanced UI parser using computer vision and OCR for detecting interactive elements."""
149 | 
150 |     def __init__(
151 |         self,
152 |         model_path: Optional[Union[str, Path]] = None,
153 |         cache_dir: Optional[Union[str, Path]] = None,
154 |         force_device: Optional[str] = None,
155 |     ):
156 |         """Initialize the OmniParser.
157 | 
158 |         Args:
159 |             model_path: Optional path to the YOLO model
160 |             cache_dir: Optional directory to cache model files
161 |             force_device: Force specific device (cpu/cuda/mps)
162 |         """
163 |         self.detector = DetectionProcessor(
164 |             model_path=Path(model_path) if model_path else None,
165 |             cache_dir=Path(cache_dir) if cache_dir else None,
166 |             force_device=force_device,
167 |         )
168 |         self.ocr = OCRProcessor()
169 |         self.visualizer = BoxAnnotator()
170 | 
171 |     def process_image(
172 |         self,
173 |         image: Image.Image,
174 |         box_threshold: float = 0.3,
175 |         iou_threshold: float = 0.1,
176 |         use_ocr: bool = True,
177 |     ) -> Tuple[Image.Image, List[UIElement]]:
178 |         """Process an image to detect UI elements and optionally text.
179 | 
180 |         Args:
181 |             image: Input PIL Image
182 |             box_threshold: Confidence threshold for detection
183 |             iou_threshold: IOU threshold for NMS
184 |             use_ocr: Whether to enable OCR processing
185 | 
186 |         Returns:
187 |             Tuple of (annotated image, list of detections)
188 |         """
189 |         try:
190 |             logger.info("Starting UI element detection...")
191 | 
192 |             # Detect icons
193 |             icon_detections = self.detector.detect_icons(
194 |                 image=image, box_threshold=box_threshold, iou_threshold=iou_threshold
195 |             )
196 |             logger.info(f"Found {len(icon_detections)} interactive elements")
197 | 
198 |             # Convert icon detections to typed objects
199 |             elements: List[UIElement] = cast(
200 |                 List[UIElement],
201 |                 [
202 |                     IconElement(
203 |                         id=i + 1,
204 |                         bbox=BoundingBox(
205 |                             x1=det["bbox"][0],
206 |                             y1=det["bbox"][1],
207 |                             x2=det["bbox"][2],
208 |                             y2=det["bbox"][3],
209 |                         ),
210 |                         confidence=det["confidence"],
211 |                         scale=det.get("scale"),
212 |                     )
213 |                     for i, det in enumerate(icon_detections)
214 |                 ],
215 |             )
216 | 
217 |             # Run OCR if enabled
218 |             if use_ocr:
219 |                 logger.info("Running OCR detection...")
220 |                 text_detections = self.ocr.detect_text(image=image, confidence_threshold=0.5)
221 |                 if text_detections is None:
222 |                     text_detections = []
223 |                 logger.info(f"Found {len(text_detections)} text regions")
224 | 
225 |                 # Convert text detections to typed objects
226 |                 text_elements = cast(
227 |                     List[UIElement],
228 |                     [
229 |                         TextElement(
230 |                             id=len(elements) + i + 1,
231 |                             bbox=BoundingBox(
232 |                                 x1=det["bbox"][0],
233 |                                 y1=det["bbox"][1],
234 |                                 x2=det["bbox"][2],
235 |                                 y2=det["bbox"][3],
236 |                             ),
237 |                             content=det["content"],
238 |                             confidence=det["confidence"],
239 |                         )
240 |                         for i, det in enumerate(text_detections)
241 |                     ],
242 |                 )
243 | 
244 |                 if elements and text_elements:
245 |                     # Drop icon (non-OCR) elements whose box contains the center point of any OCR text element
246 |                     filtered_elements = []
247 |                     for elem in elements:  # elements at this point contains only non-OCR elements
248 |                         should_keep = True
249 |                         for text_elem in text_elements:
250 |                             # Calculate center point of the text element
251 |                             center_x = (text_elem.bbox.x1 + text_elem.bbox.x2) / 2
252 |                             center_y = (text_elem.bbox.y1 + text_elem.bbox.y2) / 2
253 | 
254 |                             # Check if this center point is inside the non-OCR element
255 |                             if (
256 |                                 center_x >= elem.bbox.x1
257 |                                 and center_x <= elem.bbox.x2
258 |                                 and center_y >= elem.bbox.y1
259 |                                 and center_y <= elem.bbox.y2
260 |                             ):
261 |                                 should_keep = False
262 |                                 break
263 | 
264 |                         if should_keep:
265 |                             filtered_elements.append(elem)
266 |                     elements = filtered_elements
267 | 
268 |                     # Merge detections using NMS
269 |                     all_elements = elements + text_elements
270 |                     boxes = torch.tensor([elem.bbox.coordinates for elem in all_elements])
271 |                     scores = torch.tensor([elem.confidence for elem in all_elements])
272 |                     keep_indices = torchvision.ops.nms(boxes, scores, iou_threshold)
273 |                     elements = [all_elements[i] for i in keep_indices]
274 |                 else:
275 |                     # One of the lists is empty, so NMS is unnecessary; just append the text elements
276 |                     elements.extend(text_elements)
277 | 
278 |             # Calculate drawing parameters based on image size
279 |             box_overlay_ratio = max(image.size) / 3200
280 |             draw_config = {
281 |                 "font_size": int(12 * box_overlay_ratio),
282 |                 "box_thickness": max(int(2 * box_overlay_ratio), 1),
283 |                 "text_padding": max(int(3 * box_overlay_ratio), 1),
284 |             }
285 | 
286 |             # Convert elements back to dict format for visualization
287 |             detection_dicts = [
288 |                 {
289 |                     "type": elem.type,
290 |                     "bbox": elem.bbox.coordinates,
291 |                     "confidence": elem.confidence,
292 |                     "content": elem.content if isinstance(elem, TextElement) else None,
293 |                 }
294 |                 for elem in elements
295 |             ]
296 | 
297 |             # Create visualization
298 |             logger.info("Creating visualization...")
299 |             annotated_image = self.visualizer.draw_boxes(
300 |                 image=image.copy(), detections=detection_dicts, draw_config=draw_config
301 |             )
302 |             logger.info("Visualization complete")
303 | 
304 |             return annotated_image, elements
305 | 
306 |         except Exception as e:
307 |             logger.error(f"Error in process_image: {str(e)}")
308 |             import traceback
309 | 
310 |             logger.error(traceback.format_exc())
311 |             raise
312 | 
313 |     def parse(
314 |         self,
315 |         screenshot_data: Union[bytes, str],
316 |         box_threshold: float = 0.3,
317 |         iou_threshold: float = 0.1,
318 |         use_ocr: bool = True,
319 |     ) -> ParseResult:
320 |         """Parse a UI screenshot to detect interactive elements and text.
321 | 
322 |         Args:
323 |             screenshot_data: Raw bytes or base64 string of the screenshot
324 |             box_threshold: Confidence threshold for detection
325 |             iou_threshold: IOU threshold for NMS
326 |             use_ocr: Whether to enable OCR processing
327 | 
328 |         Returns:
329 |             ParseResult object containing elements, annotated image, and metadata
330 |         """
331 |         try:
332 |             start_time = time.time()
333 | 
334 |             # Convert input to PIL Image
335 |             if isinstance(screenshot_data, str):
336 |                 screenshot_data = base64.b64decode(screenshot_data)
337 |             image = Image.open(io.BytesIO(screenshot_data)).convert("RGB")
338 | 
339 |             # Process image
340 |             annotated_image, elements = self.process_image(
341 |                 image=image,
342 |                 box_threshold=box_threshold,
343 |                 iou_threshold=iou_threshold,
344 |                 use_ocr=use_ocr,
345 |             )
346 | 
347 |             # Convert annotated image to base64
348 |             buffered = io.BytesIO()
349 |             annotated_image.save(buffered, format="PNG")
350 |             annotated_image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
351 | 
352 |             # Generate screen info text
353 |             screen_info = []
354 |             parsed_content_list = []
355 | 
356 |             # Set element IDs and generate human-readable descriptions
357 |             for i, elem in enumerate(elements):
358 |                 # Set the ID (1-indexed)
359 |                 elem.id = i + 1
360 | 
361 |                 if isinstance(elem, IconElement):
362 |                     screen_info.append(
363 |                         f"Box #{i+1}: Icon (confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates})"
364 |                     )
365 |                     parsed_content_list.append(
366 |                         {
367 |                             "id": i + 1,
368 |                             "type": "icon",
369 |                             "bbox": elem.bbox.coordinates,
370 |                             "confidence": elem.confidence,
371 |                             "content": None,
372 |                         }
373 |                     )
374 |                 elif isinstance(elem, TextElement):
375 |                     screen_info.append(
376 |                         f"Box #{i+1}: Text '{elem.content}' (confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates})"
377 |                     )
378 |                     parsed_content_list.append(
379 |                         {
380 |                             "id": i + 1,
381 |                             "type": "text",
382 |                             "bbox": elem.bbox.coordinates,
383 |                             "confidence": elem.confidence,
384 |                             "content": elem.content,
385 |                         }
386 |                     )
387 | 
388 |             # Calculate metadata
389 |             latency = time.time() - start_time
390 |             width, height = image.size
391 | 
392 |             # Create ParseResult object with enhanced properties
393 |             result = ParseResult(
394 |                 elements=elements,
395 |                 annotated_image_base64=annotated_image_base64,
396 |                 screen_info=screen_info,
397 |                 parsed_content_list=parsed_content_list,
398 |                 metadata=ParserMetadata(
399 |                     image_size=(width, height),
400 |                     num_icons=len([e for e in elements if isinstance(e, IconElement)]),
401 |                     num_text=len([e for e in elements if isinstance(e, TextElement)]),
402 |                     device=self.detector.device,
403 |                     ocr_enabled=use_ocr,
404 |                     latency=latency,
405 |                 ),
406 |             )
407 | 
408 |             # Return the ParseResult object directly
409 |             return result
410 | 
411 |         except Exception as e:
412 |             logger.error(f"Error in parse: {str(e)}")
413 |             import traceback
414 | 
415 |             logger.error(traceback.format_exc())
416 |             raise
417 | 
418 | 
419 | def main():
420 |     """Command line interface for UI element detection."""
421 |     parser = argparse.ArgumentParser(description="Detect UI elements and text in images")
422 |     parser.add_argument("image_path", help="Path to the input image")
423 |     parser.add_argument("--model-path", help="Path to YOLO model")
424 |     parser.add_argument(
425 |         "--box-threshold", type=float, default=0.3, help="Box confidence threshold (default: 0.3)"
426 |     )
427 |     parser.add_argument(
428 |         "--iou-threshold", type=float, default=0.1, help="IOU threshold (default: 0.1)"
429 |     )
430 |     parser.add_argument(
431 |         "--ocr", action=argparse.BooleanOptionalAction, default=True, help="Enable OCR processing (default: True; disable with --no-ocr)"
432 |     )
433 |     parser.add_argument("--output", help="Output path for annotated image")
434 |     args = parser.parse_args()
435 | 
436 |     # Setup logging
437 |     logging.basicConfig(level=logging.INFO)
438 | 
439 |     try:
440 |         # Initialize parser
441 |         omni_parser = OmniParser(model_path=args.model_path)
442 | 
443 |         # Load and process image
444 |         logger.info(f"Loading image from: {args.image_path}")
445 |         image = Image.open(args.image_path).convert("RGB")
446 |         logger.info(f"Image loaded successfully, size: {image.size}")
447 | 
448 |         # Process image
449 |         annotated_image, elements = omni_parser.process_image(
450 |             image=image,
451 |             box_threshold=args.box_threshold,
452 |             iou_threshold=args.iou_threshold,
453 |             use_ocr=args.ocr,
454 |         )
455 | 
456 |         # Save output image
457 |         output_path = args.output or str(
458 |             Path(args.image_path).parent
459 |             / f"{Path(args.image_path).stem}_analyzed{Path(args.image_path).suffix}"
460 |         )
461 |         logger.info(f"Saving annotated image to: {output_path}")
462 | 
463 |         Path(output_path).parent.mkdir(parents=True, exist_ok=True)
464 |         annotated_image.save(output_path)
465 |         logger.info(f"Image saved successfully to {output_path}")
466 | 
467 |         # Print detections
468 |         logger.info("\nDetections:")
469 |         for i, elem in enumerate(elements):
470 |             if isinstance(elem, IconElement):
471 |                 logger.info(
472 |                     f"Interactive element {i}: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
473 |                 )
474 |             elif isinstance(elem, TextElement):
475 |                 logger.info(f"Text {i}: '{elem.content}', bbox={elem.bbox.coordinates}")
476 | 
477 |     except Exception as e:
478 |         logger.error(f"Error processing image: {str(e)}")
479 |         import traceback
480 | 
481 |         logger.error(traceback.format_exc())
482 |         return 1
483 | 
484 |     return 0
485 | 
486 | 
487 | if __name__ == "__main__":
488 |     import sys
489 | 
490 |     sys.exit(main())
491 | 
```
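
For quick orientation, here is a minimal sketch of driving the parser above programmatically. The `from som import OmniParser` import path is an assumption; everything else mirrors the `parse()` signature and `ParseResult`/`ParserMetadata` fields defined in this file.

```python
# Minimal usage sketch (import path assumed; OmniParser is defined above).
import base64
import io

from PIL import Image
from som import OmniParser  # assumption: this module is packaged as `som`

parser = OmniParser()  # model weights are fetched/cached on first use

# parse() accepts raw bytes or a base64 string of the screenshot
with open("screenshot.png", "rb") as f:
    result = parser.parse(f.read(), box_threshold=0.3, iou_threshold=0.1, use_ocr=True)

print(f"{result.metadata.num_icons} icons, {result.metadata.num_text} text regions "
      f"in {result.metadata.latency:.2f}s on {result.metadata.device}")
for line in result.screen_info:
    print(line)

# The annotated overlay comes back as a base64-encoded PNG
png = base64.b64decode(result.annotated_image_base64)
Image.open(io.BytesIO(png)).save("screenshot_analyzed.png")
```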

--------------------------------------------------------------------------------
/libs/python/agent/agent/cli.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | CLI chat interface for agent - Computer Use Agent
  3 | 
  4 | Usage:
  5 |     python -m agent.cli <model_string>
  6 | 
  7 | Examples:
  8 |     python -m agent.cli openai/computer-use-preview
  9 |     python -m agent.cli anthropic/claude-sonnet-4-5-20250929
 10 |     python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
 11 | """
 12 | 
 13 | try:
 14 |     import argparse
 15 |     import asyncio
 16 |     import base64
 17 |     import json
 18 |     import os
 19 |     import platform
 20 |     import sys
 21 |     import time
 22 |     from pathlib import Path
 23 |     from typing import Any, Dict, List
 24 | 
 25 |     import dotenv
 26 | 
 27 |     try:
 28 |         from PIL import Image, ImageDraw
 29 | 
 30 |         PIL_AVAILABLE = True
 31 |     except Exception:
 32 |         PIL_AVAILABLE = False
 33 |     from yaspin import yaspin
 34 | except ImportError:
 35 |     if __name__ == "__main__":
 36 |         raise ImportError(
 37 |             "CLI dependencies not found. " 'Please install with: pip install "cua-agent[cli]"'
 38 |         )
 39 | 
 40 | # Load environment variables
 41 | dotenv.load_dotenv()
 42 | 
 43 | 
 44 | # Color codes for terminal output
 45 | class Colors:
 46 |     RESET = "\033[0m"
 47 |     BOLD = "\033[1m"
 48 |     DIM = "\033[2m"
 49 | 
 50 |     # Text colors
 51 |     RED = "\033[31m"
 52 |     GREEN = "\033[32m"
 53 |     YELLOW = "\033[33m"
 54 |     BLUE = "\033[34m"
 55 |     MAGENTA = "\033[35m"
 56 |     CYAN = "\033[36m"
 57 |     WHITE = "\033[37m"
 58 |     GRAY = "\033[90m"
 59 | 
 60 |     # Background colors
 61 |     BG_RED = "\033[41m"
 62 |     BG_GREEN = "\033[42m"
 63 |     BG_YELLOW = "\033[43m"
 64 |     BG_BLUE = "\033[44m"
 65 | 
 66 | 
 67 | def print_colored(
 68 |     text: str,
 69 |     color: str = "",
 70 |     bold: bool = False,
 71 |     dim: bool = False,
 72 |     end: str = "\n",
 73 |     right: str = "",
 74 | ):
 75 |     """Print colored text to terminal with optional right-aligned text."""
 76 |     prefix = ""
 77 |     if bold:
 78 |         prefix += Colors.BOLD
 79 |     if dim:
 80 |         prefix += Colors.DIM
 81 |     if color:
 82 |         prefix += color
 83 | 
 84 |     if right:
 85 |         # Get terminal width (default to 80 if unable to determine)
 86 |         try:
 87 |             import shutil
 88 | 
 89 |             terminal_width = shutil.get_terminal_size().columns
 90 |         except Exception:
 91 |             terminal_width = 80
 92 | 
 93 |         # Add right margin
 94 |         terminal_width -= 1
 95 | 
 96 |         # Calculate padding needed
 97 |         # Account for ANSI escape codes not taking visual space
 98 |         visible_left_len = len(text)
 99 |         visible_right_len = len(right)
100 |         padding = terminal_width - visible_left_len - visible_right_len
101 | 
102 |         if padding > 0:
103 |             output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
104 |         else:
105 |             # If not enough space, just put a single space between
106 |             output = f"{prefix}{text} {right}{Colors.RESET}"
107 |     else:
108 |         output = f"{prefix}{text}{Colors.RESET}"
109 | 
110 |     print(output, end=end)
111 | 
112 | 
113 | def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
114 |     """Print computer action with nice formatting."""
115 |     # Format action details
116 |     args_str = ""
117 |     if action_type == "click" and "x" in details and "y" in details:
118 |         args_str = f"_{details.get('button', 'left')}({details['x']}, {details['y']})"
119 |     elif action_type == "type" and "text" in details:
120 |         text = details["text"]
121 |         if len(text) > 50:
122 |             text = text[:47] + "..."
123 |         args_str = f'("{text}")'
124 |     elif action_type == "key" and "text" in details:
125 |         args_str = f"('{details['text']}')"
126 |     elif action_type == "scroll" and "x" in details and "y" in details:
127 |         args_str = f"({details['x']}, {details['y']})"
128 | 
129 |     if total_cost > 0:
130 |         print_colored(f"🛠️  {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
131 |     else:
132 |         print_colored(f"🛠️  {action_type}{args_str}", dim=True)
133 | 
134 | 
135 | def print_welcome(model: str, agent_loop: str, container_name: str):
136 |     """Print welcome message."""
137 |     print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
138 |     print_colored("Type 'exit' to quit.", dim=True)
139 | 
140 | 
141 | async def ainput(prompt: str = ""):
142 |     return await asyncio.to_thread(input, prompt)
143 | 
144 | 
145 | async def chat_loop(
146 |     agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True
147 | ):
148 |     """Main chat loop with the agent."""
149 |     print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
150 | 
151 |     history = []
152 | 
153 |     if initial_prompt:
154 |         history.append({"role": "user", "content": initial_prompt})
155 | 
156 |     total_cost = 0
157 | 
158 |     while True:
159 |         if len(history) == 0 or history[-1].get("role") != "user":
160 |             # Get user input with prompt
161 |             print_colored("> ", end="")
162 |             user_input = await ainput()
163 | 
164 |             if user_input.lower() in ["exit", "quit", "q"]:
165 |                 print_colored("\n👋 Goodbye!")
166 |                 break
167 | 
168 |             if not user_input:
169 |                 continue
170 | 
171 |             # Add user message to history
172 |             history.append({"role": "user", "content": user_input})
173 | 
174 |         # Stream responses from the agent with spinner
175 |         with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
176 |             spinner.hide()
177 | 
178 |             async for result in agent.run(history):
179 |                 # Add agent responses to history
180 |                 history.extend(result.get("output", []))
181 | 
182 |                 if show_usage:
183 |                     total_cost += result.get("usage", {}).get("response_cost", 0)
184 | 
185 |                 # Process and display the output
186 |                 for item in result.get("output", []):
187 |                     if item.get("type") == "message" and item.get("role") == "assistant":
188 |                         # Display agent text response
189 |                         content = item.get("content", [])
190 |                         for content_part in content:
191 |                             if content_part.get("text"):
192 |                                 text = content_part.get("text", "").strip()
193 |                                 if text:
194 |                                     spinner.hide()
195 |                                     print_colored(text)
196 | 
197 |                     elif item.get("type") == "computer_call":
198 |                         # Display computer action
199 |                         action = item.get("action", {})
200 |                         action_type = action.get("type", "")
201 |                         if action_type:
202 |                             spinner.hide()
203 |                             print_action(action_type, action, total_cost)
204 |                             spinner.text = f"Performing {action_type}..."
205 |                             spinner.show()
206 | 
207 |                     elif item.get("type") == "function_call":
208 |                         # Display function call
209 |                         function_name = item.get("name", "")
210 |                         spinner.hide()
211 |                         print_colored(f"🔧 Calling function: {function_name}", dim=True)
212 |                         spinner.text = f"Calling {function_name}..."
213 |                         spinner.show()
214 | 
215 |                     elif item.get("type") == "function_call_output":
216 |                         # Display function output (dimmed)
217 |                         output = item.get("output", "")
218 |                         if output and len(output.strip()) > 0:
219 |                             spinner.hide()
220 |                             print_colored(f"📤 {output}", dim=True)
221 | 
222 |             spinner.hide()
223 |             if show_usage and total_cost > 0:
224 |                 print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
225 | 
226 | 
227 | async def main():
228 |     """Main CLI function."""
229 |     parser = argparse.ArgumentParser(
230 |         description="CUA Agent CLI - Interactive computer use assistant",
231 |         formatter_class=argparse.RawDescriptionHelpFormatter,
232 |         epilog="""
233 | Examples:
234 |   python -m agent.cli openai/computer-use-preview
235 |   python -m agent.cli anthropic/claude-sonnet-4-5-20250929
236 |   python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
237 |   python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
238 |         """,
239 |     )
240 | 
241 |     parser.add_argument(
242 |         "model",
243 |         help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-sonnet-4-5-20250929')",
244 |     )
245 | 
246 |     parser.add_argument(
247 |         "--provider",
248 |         choices=["cloud", "lume", "winsandbox", "docker"],
249 |         default="cloud",
250 |         help="Computer provider to use: cloud (default), lume, winsandbox, or docker",
251 |     )
252 | 
253 |     parser.add_argument(
254 |         "--images",
255 |         type=int,
256 |         default=3,
257 |         help="Number of recent images to keep in context (default: 3)",
258 |     )
259 | 
260 |     parser.add_argument("--trajectory", action="store_true", help="Save trajectory for debugging")
261 | 
262 |     parser.add_argument("--budget", type=float, help="Maximum budget for the session (in dollars)")
263 | 
264 |     parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
265 | 
266 |     parser.add_argument(
267 |         "-p",
268 |         "--prompt",
269 |         type=str,
270 |         help="Initial prompt to send to the agent. Leave blank for interactive mode.",
271 |     )
272 | 
273 |     parser.add_argument(
274 |         "--prompt-file",
275 |         type=Path,
276 |         help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt.",
277 |     )
278 | 
279 |     parser.add_argument(
280 |         "--predict-click",
281 |         dest="predict_click",
282 |         type=str,
283 |         help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it.",
284 |     )
285 | 
286 |     parser.add_argument("-c", "--cache", action="store_true", help="Tell the API to enable caching")
287 | 
288 |     parser.add_argument(
289 |         "-u", "--usage", action="store_true", help="Show total cost of the agent runs"
290 |     )
291 | 
292 |     parser.add_argument(
293 |         "-r",
294 |         "--max-retries",
295 |         type=int,
296 |         default=3,
297 |         help="Maximum number of retries for the LLM API calls",
298 |     )
299 | 
300 |     # Provider override credentials
301 |     parser.add_argument(
302 |         "--api-key",
303 |         dest="api_key",
304 |         type=str,
305 |         help="API key override for the model provider (passed to ComputerAgent)",
306 |     )
307 |     parser.add_argument(
308 |         "--api-base",
309 |         dest="api_base",
310 |         type=str,
311 |         help="API base URL override for the model provider (passed to ComputerAgent)",
312 |     )
313 | 
314 |     args = parser.parse_args()
315 | 
316 |     # Check for required environment variables
317 |     container_name = os.getenv("CUA_CONTAINER_NAME")
318 |     cua_api_key = os.getenv("CUA_API_KEY")
319 | 
320 |     # Prompt for missing environment variables (container name always required)
321 |     if not container_name:
322 |         if args.provider == "cloud":
323 |             print_colored("CUA_CONTAINER_NAME not set.", dim=True)
324 |             print_colored("You can get a CUA container at https://cua.ai/", dim=True)
325 |             container_name = input("Enter your CUA container name: ").strip()
326 |             if not container_name:
327 |                 print_colored("❌ Container name is required.")
328 |                 sys.exit(1)
329 |         else:
330 |             container_name = "cli-sandbox"
331 | 
332 |     # Only require API key for cloud provider
333 |     if args.provider == "cloud" and not cua_api_key:
334 |         print_colored("CUA_API_KEY not set.", dim=True)
335 |         cua_api_key = input("Enter your CUA API key: ").strip()
336 |         if not cua_api_key:
337 |             print_colored("❌ API key is required for cloud provider.")
338 |             sys.exit(1)
339 | 
340 |     # Check for provider-specific API keys based on model
341 |     provider_api_keys = {
342 |         "openai/": "OPENAI_API_KEY",
343 |         "anthropic/": "ANTHROPIC_API_KEY",
344 |     }
345 | 
346 |     # Find matching provider and check for API key
347 |     for prefix, env_var in provider_api_keys.items():
348 |         if prefix in args.model:
349 |             if not os.getenv(env_var):
350 |                 print_colored(f"{env_var} not set.", dim=True)
351 |                 api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
352 |                 if not api_key:
353 |                     print_colored(f"❌ {env_var.replace('_', ' ').title()} is required.")
354 |                     sys.exit(1)
355 |                 # Set the environment variable for the session
356 |                 os.environ[env_var] = api_key
357 |             break
358 | 
359 |     # Import here to avoid import errors if dependencies are missing
360 |     try:
361 |         from agent import ComputerAgent
362 |         from computer import Computer
363 |     except ImportError as e:
364 |         print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
365 |         print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
366 |         sys.exit(1)
367 | 
368 |     # Resolve provider -> os_type, provider_type, api key requirement
369 |     provider_map = {
370 |         "cloud": ("linux", "cloud", True),
371 |         "lume": ("macos", "lume", False),
372 |         "winsandbox": ("windows", "winsandbox", False),
373 |         "docker": ("linux", "docker", False),
374 |     }
375 |     os_type, provider_type, needs_api_key = provider_map[args.provider]
376 | 
377 |     computer_kwargs = {
378 |         "os_type": os_type,
379 |         "provider_type": provider_type,
380 |         "name": container_name,
381 |     }
382 |     if needs_api_key:
383 |         computer_kwargs["api_key"] = cua_api_key  # type: ignore
384 | 
385 |     # Create computer instance
386 |     async with Computer(**computer_kwargs) as computer:  # type: ignore
387 | 
388 |         # Create agent
389 |         agent_kwargs = {
390 |             "model": args.model,
391 |             "tools": [computer],
392 |             "trust_remote_code": True,  # needed for some local models (e.g., InternVL, OpenCUA)
393 |             "verbosity": 20 if args.verbose else 30,  # logging.INFO vs logging.WARNING
394 |             "max_retries": args.max_retries,
395 |         }
396 | 
397 |         # Thread API credentials to agent if provided
398 |         if args.api_key:
399 |             agent_kwargs["api_key"] = args.api_key
400 |         if args.api_base:
401 |             agent_kwargs["api_base"] = args.api_base
402 | 
403 |         if args.images > 0:
404 |             agent_kwargs["only_n_most_recent_images"] = args.images
405 | 
406 |         if args.trajectory:
407 |             agent_kwargs["trajectory_dir"] = "trajectories"
408 | 
409 |         if args.budget:
410 |             agent_kwargs["max_trajectory_budget"] = {
411 |                 "max_budget": args.budget,
412 |                 "raise_error": True,
413 |                 "reset_after_each_run": False,
414 |             }
415 | 
416 |         if args.cache:
417 |             agent_kwargs["use_prompt_caching"] = True
418 | 
419 |         agent = ComputerAgent(**agent_kwargs)
420 | 
421 |         # If predict-click mode is requested, run once and exit
422 |         if args.predict_click:
423 |             if not PIL_AVAILABLE:
424 |                 print_colored(
425 |                     "❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow",
426 |                     Colors.RED,
427 |                     bold=True,
428 |                 )
429 |                 sys.exit(1)
430 | 
431 |             instruction = args.predict_click
432 |             print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN)
433 | 
434 |             # Take a fresh screenshot FIRST
435 |             try:
436 |                 img_bytes = await computer.interface.screenshot()
437 |             except Exception as e:
438 |                 print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True)
439 |                 sys.exit(1)
440 | 
441 |             # Encode screenshot to base64 for predict_click
442 |             try:
443 |                 image_b64 = base64.b64encode(img_bytes).decode("utf-8")
444 |             except Exception as e:
445 |                 print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True)
446 |                 sys.exit(1)
447 | 
448 |             try:
449 |                 coords = await agent.predict_click(instruction, image_b64=image_b64)
450 |             except Exception as e:
451 |                 print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True)
452 |                 sys.exit(1)
453 | 
454 |             if not coords:
455 |                 print_colored("⚠️  No coordinates returned.", Colors.YELLOW)
456 |                 sys.exit(2)
457 | 
458 |             x, y = coords
459 |             print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN)
460 | 
461 |             try:
462 |                 from io import BytesIO
463 | 
464 |                 with Image.open(BytesIO(img_bytes)) as img:
465 |                     img = img.convert("RGB")
466 |                     draw = ImageDraw.Draw(img)
467 |                     # Draw crosshair
468 |                     size = 12
469 |                     color = (255, 0, 0)
470 |                     draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
471 |                     draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
472 |                     # Optional small circle
473 |                     r = 6
474 |                     draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
475 | 
476 |                     out_path = Path.cwd() / f"predict_click_{int(time.time())}.png"
477 |                     img.save(out_path)
478 |                     print_colored(f"🖼️  Saved to {out_path}")
479 | 
480 |                     # Open the image with default viewer
481 |                     try:
482 |                         system = platform.system().lower()
483 |                         if system == "windows":
484 |                             os.startfile(str(out_path))  # type: ignore[attr-defined]
485 |                         elif system == "darwin":
486 |                             os.system(f'open "{out_path}"')
487 |                         else:
488 |                             os.system(f'xdg-open "{out_path}"')
489 |                     except Exception:
490 |                         pass
491 |             except Exception as e:
492 |                 print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True)
493 |                 sys.exit(1)
494 | 
495 |             # Done
496 |             sys.exit(0)
497 | 
498 |         # Resolve initial prompt from --prompt-file or --prompt
499 |         initial_prompt = args.prompt or ""
500 |         if args.prompt_file:
501 |             try:
502 |                 initial_prompt = args.prompt_file.read_text(encoding="utf-8")
503 |             except Exception as e:
504 |                 print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
505 |                 sys.exit(1)
506 | 
507 |         # Start chat loop (default interactive mode)
508 |         await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
509 | 
510 | 
511 | if __name__ == "__main__":
512 |     try:
513 |         asyncio.run(main())
514 |     except (KeyboardInterrupt, EOFError):
515 |         print_colored("\n\n👋 Goodbye!")
516 | 
```
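
For reference, a minimal sketch of what this CLI wires together when run non-interactively, using the same `ComputerAgent`/`Computer` APIs it imports. The container name and model string are placeholders; the kwargs mirror the `provider_map` and `agent_kwargs` assembled in `main()` above.

```python
# Sketch of the CLI's core loop without the terminal UI (placeholders noted).
import asyncio

from agent import ComputerAgent
from computer import Computer

async def run_once() -> None:
    async with Computer(
        os_type="linux",
        provider_type="docker",  # same mapping the CLI uses for --provider docker
        name="cli-sandbox",      # placeholder container name
    ) as computer:
        agent = ComputerAgent(
            model="anthropic/claude-sonnet-4-5-20250929",
            tools=[computer],
            only_n_most_recent_images=3,
        )
        history = [{"role": "user", "content": "Open a text editor"}]
        async for result in agent.run(history):
            # Feed agent output back into the conversation, as chat_loop does
            history.extend(result.get("output", []))

asyncio.run(run_once())
```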

--------------------------------------------------------------------------------
/libs/python/computer/computer/helpers.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | Helper functions and decorators for the Computer module.
  3 | """
  4 | 
  5 | import ast
  6 | import asyncio
  7 | import builtins
  8 | import importlib.util
  9 | import inspect
 10 | import logging
 11 | import os
 12 | import sys
 13 | from functools import wraps
 14 | from inspect import getsource
 15 | from textwrap import dedent
 16 | from types import FunctionType, ModuleType
 17 | from typing import Any, Awaitable, Callable, Dict, List, Set, TypedDict, TypeVar
 18 | 
 19 | try:
 20 |     # Python 3.12+ has ParamSpec in typing
 21 |     from typing import ParamSpec
 22 | except ImportError:  # pragma: no cover
 23 |     # Fallback for environments without ParamSpec in typing
 24 |     from typing_extensions import ParamSpec  # type: ignore
 25 | 
 26 | P = ParamSpec("P")
 27 | R = TypeVar("R")
 28 | 
 29 | 
 30 | class DependencyInfo(TypedDict):
 31 |     import_statements: List[str]
 32 |     definitions: List[tuple[str, Any]]
 33 | 
 34 | 
 35 | # Global reference to the default computer instance
 36 | _default_computer = None
 37 | 
 38 | # Global cache for function dependency analysis
 39 | _function_dependency_map: Dict[FunctionType, DependencyInfo] = {}
 40 | 
 41 | logger = logging.getLogger(__name__)
 42 | 
 43 | 
 44 | def set_default_computer(computer: Any) -> None:
 45 |     """
 46 |     Set the default computer instance to be used by the remote decorator.
 47 | 
 48 |     Args:
 49 |         computer: The computer instance to use as default
 50 |     """
 51 |     global _default_computer
 52 |     _default_computer = computer
 53 | 
 54 | 
 55 | def sandboxed(
 56 |     venv_name: str = "default",
 57 |     computer: str = "default",
 58 |     max_retries: int = 3,
 59 | ) -> Callable[[Callable[P, R]], Callable[P, Awaitable[R]]]:
 60 |     """
 61 |     Decorator that wraps a function to be executed remotely via computer.venv_exec
 62 | 
 63 |     The function is automatically analyzed for dependencies (imports, helper functions,
 64 |     constants, etc.) and reconstructed with all necessary code in the remote sandbox.
 65 | 
 66 |     Args:
 67 |         venv_name: Name of the virtual environment to execute in
 68 |         computer: The computer instance to use, or "default" to use the globally set default
 69 |         max_retries: Maximum number of retries for the remote execution
 70 |     """
 71 | 
 72 |     def decorator(func: Callable[P, R]) -> Callable[P, Awaitable[R]]:
 73 |         @wraps(func)
 74 |         async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
 75 |             # Determine which computer instance to use
 76 |             comp = computer if computer != "default" else _default_computer
 77 | 
 78 |             if comp is None:
 79 |                 raise RuntimeError(
 80 |                     "No computer instance available. Either specify a computer instance or call set_default_computer() first."
 81 |                 )
 82 | 
 83 |             for i in range(max_retries):
 84 |                 try:
 85 |                     return await comp.venv_exec(venv_name, func, *args, **kwargs)
 86 |                 except Exception as e:
 87 |                     logger.error(f"Attempt {i+1} failed: {e}")
 88 |                     await asyncio.sleep(1)
 89 |                     if i == max_retries - 1:
 90 |                         raise e
 91 | 
 92 |             # Should be unreachable because we either returned or raised
 93 |             raise RuntimeError("sandboxed wrapper reached unreachable code path")
 94 | 
 95 |         return wrapper
 96 | 
 97 |     return decorator
 98 | 
 99 | 
100 | def _extract_import_statement(name: str, module: ModuleType) -> str:
101 |     """Extract the original import statement for a module."""
102 |     module_name = module.__name__
103 | 
104 |     if name == module_name.split(".")[0]:
105 |         return f"import {module_name}"
106 |     else:
107 |         return f"import {module_name} as {name}"
108 | 
109 | 
110 | def _is_third_party_module(module_name: str) -> bool:
111 |     """Check if a module is a third-party module."""
112 |     stdlib_modules = set(sys.stdlib_module_names) if hasattr(sys, "stdlib_module_names") else set()
113 | 
114 |     if module_name in stdlib_modules:
115 |         return False
116 | 
117 |     try:
118 |         spec = importlib.util.find_spec(module_name)
119 |         if spec is None:
120 |             return False
121 | 
122 |         if spec.origin and ("site-packages" in spec.origin or "dist-packages" in spec.origin):
123 |             return True
124 | 
125 |         return False
126 |     except (ImportError, ModuleNotFoundError, ValueError):
127 |         return False
128 | 
129 | 
130 | def _is_project_import(module_name: str) -> bool:
131 |     """Check if a module is a project-level import."""
132 |     if module_name.startswith("__relative_import_level_"):
133 |         return True
134 | 
135 |     if module_name in sys.modules:
136 |         module = sys.modules[module_name]
137 |         if hasattr(module, "__file__") and module.__file__:
138 |             if "site-packages" not in module.__file__ and "dist-packages" not in module.__file__:
139 |                 cwd = os.getcwd()
140 |                 if module.__file__.startswith(cwd):
141 |                     return True
142 | 
143 |     return False
144 | 
145 | 
146 | def _categorize_module(module_name: str) -> str:
147 |     """Categorize a module as stdlib, third-party, or project."""
148 |     if module_name.startswith("__relative_import_level_"):
149 |         return "project"
150 |     elif module_name in (
151 |         set(sys.stdlib_module_names) if hasattr(sys, "stdlib_module_names") else set()
152 |     ):
153 |         return "stdlib"
154 |     elif _is_third_party_module(module_name):
155 |         return "third_party"
156 |     elif _is_project_import(module_name):
157 |         return "project"
158 |     else:
159 |         return "unknown"
160 | 
161 | 
162 | class _DependencyVisitor(ast.NodeVisitor):
163 |     """AST visitor to extract imports and name references from a function."""
164 | 
165 |     def __init__(self, function_name: str) -> None:
166 |         self.function_name = function_name
167 |         self.internal_imports: Set[str] = set()
168 |         self.internal_import_statements: List[str] = []
169 |         self.name_references: Set[str] = set()
170 |         self.local_names: Set[str] = set()
171 |         self.inside_function = False
172 | 
173 |     def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
174 |         if node.name == self.function_name and not self.inside_function:
175 |             self.inside_function = True
176 | 
177 |             for arg in node.args.args + node.args.posonlyargs + node.args.kwonlyargs:
178 |                 self.local_names.add(arg.arg)
179 |             if node.args.vararg:
180 |                 self.local_names.add(node.args.vararg.arg)
181 |             if node.args.kwarg:
182 |                 self.local_names.add(node.args.kwarg.arg)
183 | 
184 |             for child in node.body:
185 |                 self.visit(child)
186 | 
187 |             self.inside_function = False
188 |         else:
189 |             if self.inside_function:
190 |                 self.local_names.add(node.name)
191 |                 for child in node.body:
192 |                     self.visit(child)
193 | 
194 |     def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
195 |         self.visit_FunctionDef(node)  # type: ignore
196 | 
197 |     def visit_Import(self, node: ast.Import) -> None:
198 |         if self.inside_function:
199 |             for alias in node.names:
200 |                 module_name = alias.name.split(".")[0]
201 |                 self.internal_imports.add(module_name)
202 |                 imported_as = alias.asname if alias.asname else alias.name.split(".")[0]
203 |                 self.local_names.add(imported_as)
204 |             self.internal_import_statements.append(ast.unparse(node))
205 |         self.generic_visit(node)
206 | 
207 |     def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
208 |         if self.inside_function:
209 |             if node.level == 0 and node.module:
210 |                 module_name = node.module.split(".")[0]
211 |                 self.internal_imports.add(module_name)
212 |             elif node.level > 0:
213 |                 self.internal_imports.add(f"__relative_import_level_{node.level}__")
214 | 
215 |             for alias in node.names:
216 |                 imported_as = alias.asname if alias.asname else alias.name
217 |                 self.local_names.add(imported_as)
218 |             self.internal_import_statements.append(ast.unparse(node))
219 | 
220 |         self.generic_visit(node)
221 | 
222 |     def visit_Name(self, node: ast.Name) -> None:
223 |         if self.inside_function:
224 |             if isinstance(node.ctx, ast.Load):
225 |                 self.name_references.add(node.id)
226 |             elif isinstance(node.ctx, ast.Store):
227 |                 self.local_names.add(node.id)
228 |         self.generic_visit(node)
229 | 
230 |     def visit_ClassDef(self, node: ast.ClassDef) -> None:
231 |         if self.inside_function:
232 |             self.local_names.add(node.name)
233 |         self.generic_visit(node)
234 | 
235 |     def visit_For(self, node: ast.For) -> None:
236 |         if self.inside_function and isinstance(node.target, ast.Name):
237 |             self.local_names.add(node.target.id)
238 |         self.generic_visit(node)
239 | 
240 |     def visit_comprehension(self, node: ast.comprehension) -> None:
241 |         if self.inside_function and isinstance(node.target, ast.Name):
242 |             self.local_names.add(node.target.id)
243 |         self.generic_visit(node)
244 | 
245 |     def visit_ExceptHandler(self, node: ast.ExceptHandler) -> None:
246 |         if self.inside_function and node.name:
247 |             self.local_names.add(node.name)
248 |         self.generic_visit(node)
249 | 
250 |     def visit_With(self, node: ast.With) -> None:
251 |         if self.inside_function:
252 |             for item in node.items:
253 |                 if item.optional_vars and isinstance(item.optional_vars, ast.Name):
254 |                     self.local_names.add(item.optional_vars.id)
255 |         self.generic_visit(node)
256 | 
257 | 
258 | def _traverse_and_collect_dependencies(func: FunctionType) -> DependencyInfo:
259 |     """
260 |     Traverse a function and collect its dependencies.
261 | 
262 |     Returns a dict with:
263 |         - import_statements: List of import statements needed
264 |         - definitions: List of (name, obj) tuples for helper functions/classes/constants
265 |     """
266 |     source = dedent(getsource(func))
267 |     tree = ast.parse(source)
268 | 
269 |     visitor = _DependencyVisitor(func.__name__)
270 |     visitor.visit(tree)
271 | 
272 |     builtin_names = set(dir(builtins))
273 |     external_refs = (visitor.name_references - visitor.local_names) - builtin_names
274 | 
275 |     import_statements = []
276 |     definitions = []
277 |     visited = set()
278 | 
279 |     # Include all internal import statements
280 |     import_statements.extend(visitor.internal_import_statements)
281 | 
282 |     # Analyze external references recursively
283 |     def analyze_object(obj: Any, name: str, depth: int = 0) -> None:
284 |         if depth > 20:
285 |             return
286 | 
287 |         obj_id = id(obj)
288 |         if obj_id in visited:
289 |             return
290 |         visited.add(obj_id)
291 | 
292 |         # Handle modules
293 |         if inspect.ismodule(obj):
294 |             import_stmt = _extract_import_statement(name, obj)
295 |             import_statements.append(import_stmt)
296 |             return
297 | 
298 |         # Handle functions and classes
299 |         if (
300 |             inspect.isfunction(obj)
301 |             or inspect.isclass(obj)
302 |             or inspect.isbuiltin(obj)
303 |             or inspect.ismethod(obj)
304 |         ):
305 |             obj_module = getattr(obj, "__module__", None)
306 |             if obj_module:
307 |                 base_module = obj_module.split(".")[0]
308 |                 module_category = _categorize_module(base_module)
309 | 
310 |                 # If from stdlib/third-party, just add import
311 |                 if module_category in ("stdlib", "third_party"):
312 |                     obj_name = getattr(obj, "__name__", name)
313 | 
314 |                     # Check if object is accessible by 'name' (in globals or closures)
315 |                     is_accessible = False
316 |                     if name in func.__globals__ and func.__globals__[name] is obj:
317 |                         is_accessible = True
318 |                     elif func.__closure__ and hasattr(func, "__code__"):
319 |                         freevars = func.__code__.co_freevars
320 |                         for i, var_name in enumerate(freevars):
321 |                             if var_name == name and i < len(func.__closure__):
322 |                                 try:
323 |                                     if func.__closure__[i].cell_contents is obj:
324 |                                         is_accessible = True
325 |                                         break
326 |                                 except (ValueError, AttributeError):
327 |                                     pass
328 | 
329 |                     if is_accessible and name == obj_name:
330 |                         # Direct import: from requests import get, from math import sqrt
331 |                         import_statements.append(f"from {base_module} import {name}")
332 |                     else:
333 |                         # Module import: import requests
334 |                         import_statements.append(f"import {base_module}")
335 |                     return
336 | 
337 |             try:
338 |                 obj_tree = ast.parse(dedent(getsource(obj)))
339 |                 obj_visitor = _DependencyVisitor(obj.__name__)
340 |                 obj_visitor.visit(obj_tree)
341 | 
342 |                 obj_external_refs = obj_visitor.name_references - obj_visitor.local_names
343 |                 obj_external_refs = obj_external_refs - builtin_names
344 | 
345 |                 # Add internal imports from this object
346 |                 import_statements.extend(obj_visitor.internal_import_statements)
347 | 
348 |                 # Recursively analyze its dependencies
349 |                 obj_globals = getattr(obj, "__globals__", None)
350 |                 obj_closure = getattr(obj, "__closure__", None)
351 |                 obj_code = getattr(obj, "__code__", None)
352 |                 if obj_globals:
353 |                     for ref_name in obj_external_refs:
354 |                         ref_obj = None
355 | 
356 |                         # Check globals first
357 |                         if ref_name in obj_globals:
358 |                             ref_obj = obj_globals[ref_name]
359 |                         # Check closure variables using co_freevars
360 |                         elif obj_closure and obj_code:
361 |                             freevars = obj_code.co_freevars
362 |                             for i, var_name in enumerate(freevars):
363 |                                 if var_name == ref_name and i < len(obj_closure):
364 |                                     try:
365 |                                         ref_obj = obj_closure[i].cell_contents
366 |                                         break
367 |                                     except (ValueError, AttributeError):
368 |                                         pass
369 | 
370 |                         if ref_obj is not None:
371 |                             analyze_object(ref_obj, ref_name, depth + 1)
372 | 
373 |                 # Add this object to definitions
374 |                 if not inspect.ismodule(obj):
375 |                     ref_module = getattr(obj, "__module__", None)
376 |                     if ref_module:
377 |                         ref_base_module = ref_module.split(".")[0]
378 |                         ref_category = _categorize_module(ref_base_module)
379 |                         if ref_category not in ("stdlib", "third_party"):
380 |                             definitions.append((name, obj))
381 |                     else:
382 |                         definitions.append((name, obj))
383 | 
384 |             except (OSError, TypeError):
385 |                 pass
386 |             return
387 | 
388 |         if isinstance(obj, (int, float, str, bool, list, dict, tuple, set, frozenset, type(None))):
389 |             definitions.append((name, obj))
390 | 
391 |     # Analyze all external references
392 |     for name in external_refs:
393 |         obj = None
394 | 
395 |         # First check globals
396 |         if name in func.__globals__:
397 |             obj = func.__globals__[name]
398 |         # Then check closure variables (sibling functions in enclosing scope)
399 |         elif func.__closure__ and func.__code__.co_freevars:
400 |             # Match closure variable names with cell contents
401 |             freevars = func.__code__.co_freevars
402 |             for i, var_name in enumerate(freevars):
403 |                 if var_name == name and i < len(func.__closure__):
404 |                     try:
405 |                         obj = func.__closure__[i].cell_contents
406 |                         break
407 |                     except (ValueError, AttributeError):
408 |                         # Cell is empty or doesn't have contents
409 |                         pass
410 | 
411 |         if obj is not None:
412 |             analyze_object(obj, name)
413 | 
414 |     # Remove duplicate import statements
415 |     unique_imports = []
416 |     seen = set()
417 |     for stmt in import_statements:
418 |         if stmt not in seen:
419 |             seen.add(stmt)
420 |             unique_imports.append(stmt)
421 | 
422 |     # Remove duplicate definitions
423 |     unique_definitions = []
424 |     seen_names = set()
425 |     for name, obj in definitions:
426 |         if name not in seen_names:
427 |             seen_names.add(name)
428 |             unique_definitions.append((name, obj))
429 | 
430 |     return {
431 |         "import_statements": unique_imports,
432 |         "definitions": unique_definitions,
433 |     }
434 | 
435 | 
436 | def generate_source_code(func: FunctionType) -> str:
437 |     """
438 |     Generate complete source code for a function with all dependencies.
439 | 
440 |     Args:
441 |         func: The function to generate source code for
442 | 
443 |     Returns:
444 |         Complete Python source code as a string
445 |     """
446 | 
447 |     if func in _function_dependency_map:
448 |         info = _function_dependency_map[func]
449 |     else:
450 |         info = _traverse_and_collect_dependencies(func)
451 |         _function_dependency_map[func] = info
452 | 
453 |     # Build source code
454 |     parts = []
455 | 
456 |     # 1. Add imports
457 |     if info["import_statements"]:
458 |         parts.append("\n".join(info["import_statements"]))
459 | 
460 |     # 2. Add definitions
461 |     for name, obj in info["definitions"]:
462 |         try:
463 |             if inspect.isfunction(obj):
464 |                 source = dedent(getsource(obj))
465 |                 tree = ast.parse(source)
466 |                 if tree.body and isinstance(tree.body[0], (ast.FunctionDef, ast.AsyncFunctionDef)):
467 |                     tree.body[0].decorator_list = []
468 |                     source = ast.unparse(tree)
469 |                 parts.append(source)
470 |             elif inspect.isclass(obj):
471 |                 source = dedent(getsource(obj))
472 |                 tree = ast.parse(source)
473 |                 if tree.body and isinstance(tree.body[0], ast.ClassDef):
474 |                     tree.body[0].decorator_list = []
475 |                     source = ast.unparse(tree)
476 |                 parts.append(source)
477 |             else:
478 |                 parts.append(f"{name} = {repr(obj)}")
479 |         except (OSError, TypeError):
480 |             pass
481 | 
482 |     # 3. Add main function (without decorators)
483 |     func_source = dedent(getsource(func))
484 |     tree = ast.parse(func_source)
485 |     if tree.body and isinstance(tree.body[0], (ast.FunctionDef, ast.AsyncFunctionDef)):
486 |         tree.body[0].decorator_list = []
487 |         func_source = ast.unparse(tree)
488 |     parts.append(func_source)
489 | 
490 |     return "\n\n".join(parts)
491 | 
```
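
A minimal sketch of the `@sandboxed` decorator in use, assuming a reachable container and that the named virtual environment exists (or is provisioned) on the sandbox side; the container name is a placeholder and the `Computer` construction mirrors the CLI above.

```python
# Sketch: run a locally defined function remotely via @sandboxed (assumptions noted above).
import asyncio

from computer import Computer
from computer.helpers import sandboxed, set_default_computer

@sandboxed(venv_name="demo")
def add(a: int, b: int) -> int:
    # Executes inside the sandbox; the dependency analysis above ships
    # any imports/helpers/constants this function references.
    return a + b

async def main() -> None:
    async with Computer(os_type="linux", provider_type="docker", name="cli-sandbox") as comp:
        set_default_computer(comp)
        print(await add(2, 3))  # dispatched through computer.venv_exec

asyncio.run(main())
```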

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/moondream3.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | Moondream3+ composed-grounded agent loop implementation.
  3 | Grounding is handled by a local Moondream3 preview model via Transformers.
  4 | Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>".
  5 | 
  6 | Differences from composed_grounded:
  7 | - Provides a singleton Moondream3 client outside the class.
  8 | - predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates.
  9 | - If the last image was a screenshot (or we take one), run model.detect(image, "all ui elements") to get bboxes, then
 10 |   run model.caption on each cropped bbox to label it. Overlay labels on the screenshot and emit via _on_screenshot.
 11 | - Add a user message listing all detected form UI names so the thinker can reference them.
 12 | - If the thinking model doesn't support vision, filter out image content before calling litellm.
 13 | """
 14 | 
 15 | from __future__ import annotations
 16 | 
 17 | import base64
 18 | import io
 19 | import uuid
 20 | from typing import Any, Dict, List, Optional, Tuple
 21 | 
 22 | import litellm
 23 | from PIL import Image, ImageDraw, ImageFont
 24 | 
 25 | from ..decorators import register_agent
 26 | from ..loops.base import AsyncAgentConfig
 27 | from ..responses import (
 28 |     convert_completion_messages_to_responses_items,
 29 |     convert_computer_calls_desc2xy,
 30 |     convert_computer_calls_xy2desc,
 31 |     convert_responses_items_to_completion_messages,
 32 |     get_all_element_descriptions,
 33 | )
 34 | from ..types import AgentCapability
 35 | 
 36 | _MOONDREAM_SINGLETON = None
 37 | 
 38 | 
 39 | def get_moondream_model() -> Any:
 40 |     """Get a singleton instance of the Moondream3 preview model."""
 41 |     global _MOONDREAM_SINGLETON
 42 |     if _MOONDREAM_SINGLETON is None:
 43 |         try:
 44 |             import torch
 45 |             from transformers import AutoModelForCausalLM
 46 | 
 47 |             _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
 48 |                 "moondream/moondream3-preview",
 49 |                 trust_remote_code=True,
 50 |                 torch_dtype=torch.bfloat16,
 51 |                 device_map="cuda",
 52 |             )
 53 |         except ImportError as e:
 54 |             raise RuntimeError(
 55 |                 "moondream3 requires torch and transformers. Install with: pip install cua-agent[moondream3]"
 56 |             ) from e
 57 |     return _MOONDREAM_SINGLETON
 58 | 
 59 | 
 60 | def _decode_image_b64(image_b64: str) -> Image.Image:
 61 |     data = base64.b64decode(image_b64)
 62 |     return Image.open(io.BytesIO(data)).convert("RGB")
 63 | 
 64 | 
 65 | def _image_to_b64(img: Image.Image) -> str:
 66 |     buf = io.BytesIO()
 67 |     img.save(buf, format="PNG")
 68 |     return base64.b64encode(buf.getvalue()).decode("utf-8")
 69 | 
 70 | 
 71 | def _supports_vision(model: str) -> bool:
 72 |     """Heuristic vision-support check for the thinking model; unknown models are treated as text-only."""
 73 |     m = model.lower()
 74 |     vision_markers = [
 75 |         "gpt-4o",
 76 |         "gpt-4.1",
 77 |         "o1",
 78 |         "o3",
 79 |         "claude-3",
 80 |         "claude-3.5",
 81 |         "sonnet",
 82 |         "haiku",
 83 |         "opus",
 84 |         "gemini-1.5",
 85 |         "llava",
 86 |     ]
 87 |     return any(v in m for v in vision_markers)
 88 | 
 89 | 
 90 | def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 91 |     filtered: List[Dict[str, Any]] = []
 92 |     for msg in messages:
 93 |         msg_copy = {**msg}
 94 |         content = msg_copy.get("content")
 95 |         if isinstance(content, list):
 96 |             msg_copy["content"] = [c for c in content if c.get("type") != "image_url"]
 97 |         filtered.append(msg_copy)
 98 |     return filtered
 99 | 
100 | 
101 | def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
102 |     """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
103 | 
104 |     Args:
105 |         base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally.
106 |         model_md: Moondream model instance with .detect() and .query() methods.
107 | 
108 |     Returns:
109 |         A tuple of (annotated_image_base64_png, detected_names)
110 |     """
101 |     # Ensure RGBA for semi-transparent fills; keep an unannotated copy for caption crops
112 |     if base_img.mode != "RGBA":
113 |         base_img = base_img.convert("RGBA")
114 |     pristine, W, H = base_img.copy(), base_img.width, base_img.height
115 | 
116 |     # Detect objects
117 |     try:
118 |         detect_result = model_md.detect(base_img, "all ui elements")
119 |         objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else []
120 |     except Exception:
121 |         objects = []
122 | 
123 |     draw = ImageDraw.Draw(base_img)
124 |     try:
125 |         font = ImageFont.load_default()
126 |     except Exception:
127 |         font = None
128 | 
129 |     detected_names: List[str] = []
130 | 
131 |     for i, obj in enumerate(objects):
132 |         try:
133 |             # Clamp normalized coords and crop
134 |             x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0))))
135 |             y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
136 |             x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
137 |             y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
138 |             left, top, right, bottom = (
139 |                 int(x_min * W),
140 |                 int(y_min * H),
141 |                 int(x_max * W),
142 |                 int(y_max * H),
143 |             )
144 |             left, top = max(0, left), max(0, top)
145 |             right, bottom = min(W - 1, right), min(H - 1, bottom)
146 |             crop = pristine.crop((left, top, right, bottom))  # crop the clean copy so earlier labels don't leak into captions
147 | 
148 |             # Prompted short caption
149 |             try:
150 |                 result = model_md.query(crop, "Caption this UI element in a few words.")
151 |                 caption_text = (result or {}).get("answer", "")
152 |             except Exception:
153 |                 caption_text = ""
154 | 
155 |             name = (caption_text or "").strip() or f"element_{i+1}"
156 |             detected_names.append(name)
157 | 
158 |             # Draw bbox
159 |             draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2)
160 | 
161 |             # Label background with padding and rounded corners
162 |             label = f"{i+1}. {name}"
163 |             padding = 3
164 |             if font:
165 |                 text_bbox = draw.textbbox((0, 0), label, font=font)
166 |             else:
167 |                 text_bbox = draw.textbbox((0, 0), label)
168 |             text_w = text_bbox[2] - text_bbox[0]
169 |             text_h = text_bbox[3] - text_bbox[1]
170 | 
171 |             tx = left + 3
172 |             ty = top - (text_h + 2 * padding + 4)
173 |             if ty < 0:
174 |                 ty = top + 3
175 | 
176 |             bg_left = tx - padding
177 |             bg_top = ty - padding
178 |             bg_right = tx + text_w + padding
179 |             bg_bottom = ty + text_h + padding
180 |             try:
181 |                 draw.rounded_rectangle(
182 |                     [bg_left, bg_top, bg_right, bg_bottom],
183 |                     radius=4,
184 |                     fill=(0, 0, 0, 160),
185 |                     outline=(255, 215, 0, 200),
186 |                     width=1,
187 |                 )
188 |             except Exception:
189 |                 draw.rectangle(
190 |                     [bg_left, bg_top, bg_right, bg_bottom],
191 |                     fill=(0, 0, 0, 160),
192 |                     outline=(255, 215, 0, 200),
193 |                     width=1,
194 |                 )
195 | 
196 |             text_fill = (255, 255, 255, 255)
197 |             if font:
198 |                 draw.text((tx, ty), label, fill=text_fill, font=font)
199 |             else:
200 |                 draw.text((tx, ty), label, fill=text_fill)
201 |         except Exception:
202 |             continue
203 | 
204 |     # Encode PNG base64
205 |     annotated = base_img
206 |     if annotated.mode not in ("RGBA", "RGB"):
207 |         annotated = annotated.convert("RGBA")
208 |     annotated_b64 = _image_to_b64(annotated)
209 |     return annotated_b64, detected_names
210 | 
211 | 
212 | GROUNDED_COMPUTER_TOOL_SCHEMA = {
213 |     "type": "function",
214 |     "function": {
215 |         "name": "computer",
216 |         "description": (
217 |             "Control a computer by taking screenshots and interacting with UI elements. "
218 |             "The screenshot action will include a list of detected form UI element names when available. "
219 |             "Use element descriptions to locate and interact with UI elements on the screen."
220 |         ),
221 |         "parameters": {
222 |             "type": "object",
223 |             "properties": {
224 |                 "action": {
225 |                     "type": "string",
226 |                     "enum": [
227 |                         "screenshot",
228 |                         "click",
229 |                         "double_click",
230 |                         "drag",
231 |                         "type",
232 |                         "keypress",
233 |                         "scroll",
234 |                         "move",
235 |                         "wait",
236 |                         "get_current_url",
237 |                         "get_dimensions",
238 |                         "get_environment",
239 |                     ],
240 |                     "description": "The action to perform (required for all actions)",
241 |                 },
242 |                 "element_description": {
243 |                     "type": "string",
244 |                     "description": "Description of the element to interact with (required for click/double_click/move/scroll)",
245 |                 },
246 |                 "start_element_description": {
247 |                     "type": "string",
248 |                     "description": "Description of the element to start dragging from (required for drag)",
249 |                 },
250 |                 "end_element_description": {
251 |                     "type": "string",
252 |                     "description": "Description of the element to drag to (required for drag)",
253 |                 },
254 |                 "text": {
255 |                     "type": "string",
256 |                     "description": "The text to type (required for type)",
257 |                 },
258 |                 "keys": {
259 |                     "type": "array",
260 |                     "items": {"type": "string"},
261 |                     "description": "Key(s) to press (required for keypress)",
262 |                 },
263 |                 "button": {
264 |                     "type": "string",
265 |                     "enum": ["left", "right", "wheel", "back", "forward"],
266 |                     "description": "The mouse button to use for click/double_click",
267 |                 },
268 |                 "scroll_x": {
269 |                     "type": "integer",
270 |                     "description": "Horizontal scroll amount (required for scroll)",
271 |                 },
272 |                 "scroll_y": {
273 |                     "type": "integer",
274 |                     "description": "Vertical scroll amount (required for scroll)",
275 |                 },
276 |             },
277 |             "required": ["action"],
278 |         },
279 |     },
280 | }
281 | 
282 | 
283 | @register_agent(r"moondream3\+.*", priority=2)
284 | class Moondream3PlusConfig(AsyncAgentConfig):
285 |     def __init__(self):
286 |         self.desc2xy: Dict[str, Tuple[float, float]] = {}
287 | 
288 |     async def predict_step(
289 |         self,
290 |         messages: List[Dict[str, Any]],
291 |         model: str,
292 |         tools: Optional[List[Dict[str, Any]]] = None,
293 |         max_retries: Optional[int] = None,
294 |         stream: bool = False,
295 |         computer_handler=None,
296 |         use_prompt_caching: Optional[bool] = False,
297 |         _on_api_start=None,
298 |         _on_api_end=None,
299 |         _on_usage=None,
300 |         _on_screenshot=None,
301 |         **kwargs,
302 |     ) -> Dict[str, Any]:
303 |         # Parse composed model: moondream3+<thinking_model>
304 |         if "+" not in model:
305 |             raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}")
306 |         _, thinking_model = model.split("+", 1)
307 | 
308 |         pre_output_items: List[Dict[str, Any]] = []
309 | 
310 |         # Acquire last screenshot; if missing, take one
311 |         last_image_b64: Optional[str] = None
312 |         for message in reversed(messages):
313 |             if (
314 |                 isinstance(message, dict)
315 |                 and message.get("type") == "computer_call_output"
316 |                 and isinstance(message.get("output"), dict)
317 |                 and message["output"].get("type") == "input_image"
318 |             ):
319 |                 image_url = message["output"].get("image_url", "")
320 |                 if image_url.startswith("data:image/png;base64,"):
321 |                     last_image_b64 = image_url.split(",", 1)[1]
322 |                     break
323 | 
324 |         if last_image_b64 is None and computer_handler is not None:
325 |             # Take a screenshot
326 |             screenshot_b64 = await computer_handler.screenshot()  # type: ignore
327 |             if screenshot_b64:
328 |                 call_id = uuid.uuid4().hex
329 |                 pre_output_items += [
330 |                     {
331 |                         "type": "message",
332 |                         "role": "assistant",
333 |                         "content": [
334 |                             {
335 |                                 "type": "output_text",
336 |                                 "text": "Taking a screenshot to analyze the current screen.",
337 |                             }
338 |                         ],
339 |                     },
340 |                     {
341 |                         "type": "computer_call",
342 |                         "call_id": call_id,
343 |                         "status": "completed",
344 |                         "action": {"type": "screenshot"},
345 |                     },
346 |                     {
347 |                         "type": "computer_call_output",
348 |                         "call_id": call_id,
349 |                         "output": {
350 |                             "type": "input_image",
351 |                             "image_url": f"data:image/png;base64,{screenshot_b64}",
352 |                         },
353 |                     },
354 |                 ]
355 |                 last_image_b64 = screenshot_b64
356 |                 if _on_screenshot:
357 |                     await _on_screenshot(screenshot_b64)
358 | 
359 |         # If we have a last screenshot, run Moondream detection and labeling
360 |         detected_names: List[str] = []
361 |         if last_image_b64 is not None:
362 |             base_img = _decode_image_b64(last_image_b64)
363 |             model_md = get_moondream_model()
364 |             annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md)
365 |             if _on_screenshot:
366 |                 await _on_screenshot(annotated_b64, "annotated_form_ui")
367 | 
368 |             # Also push a user message listing all detected names
369 |             if detected_names:
370 |                 names_text = "\n".join(f"- {n}" for n in detected_names)
371 |                 pre_output_items.append(
372 |                     {
373 |                         "type": "message",
374 |                         "role": "user",
375 |                         "content": [
376 |                             {"type": "input_text", "text": "Detected form UI elements on screen:"},
377 |                             {"type": "input_text", "text": names_text},
378 |                             {
379 |                                 "type": "input_text",
380 |                                 "text": "Please continue with the next action needed to perform your task.",
381 |                             },
382 |                         ],
383 |                     }
384 |                 )
385 | 
386 |         tool_schemas = []
387 |         for schema in tools or []:
388 |             if schema.get("type") == "computer":
389 |                 tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
390 |             else:
391 |                 tool_schemas.append(schema)
392 | 
393 |         # Step 1: Convert computer calls from xy to descriptions
394 |         input_messages = messages + pre_output_items
395 |         messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
396 | 
397 |         # Step 2: Convert responses items to completion messages
398 |         completion_messages = convert_responses_items_to_completion_messages(
399 |             messages_with_descriptions,
400 |             allow_images_in_tool_results=False,
401 |         )
402 | 
403 |         # Optionally filter images if model lacks vision
404 |         if not _supports_vision(thinking_model):
405 |             completion_messages = _filter_images_from_completion_messages(completion_messages)
406 | 
407 |         # Step 3: Call thinking model with litellm.acompletion
408 |         api_kwargs = {
409 |             "model": thinking_model,
410 |             "messages": completion_messages,
411 |             "tools": tool_schemas,
412 |             "max_retries": max_retries,
413 |             "stream": stream,
414 |             **kwargs,
415 |         }
416 |         if use_prompt_caching:
417 |             api_kwargs["use_prompt_caching"] = use_prompt_caching
418 | 
419 |         if _on_api_start:
420 |             await _on_api_start(api_kwargs)
421 | 
422 |         response = await litellm.acompletion(**api_kwargs)
423 | 
424 |         if _on_api_end:
425 |             await _on_api_end(api_kwargs, response)
426 | 
427 |         usage = {
428 |             **response.usage.model_dump(),  # type: ignore
429 |             "response_cost": response._hidden_params.get("response_cost", 0.0),
430 |         }
431 |         if _on_usage:
432 |             await _on_usage(usage)
433 | 
434 |         # Step 4: Convert completion messages back to responses items format
435 |         response_dict = response.model_dump()  # type: ignore
436 |         choice_messages = [choice["message"] for choice in response_dict["choices"]]
437 |         thinking_output_items: List[Dict[str, Any]] = []
438 |         for choice_message in choice_messages:
439 |             thinking_output_items.extend(
440 |                 convert_completion_messages_to_responses_items([choice_message])
441 |             )
442 | 
443 |         # Step 5: Use Moondream to get coordinates for each description
444 |         element_descriptions = get_all_element_descriptions(thinking_output_items)
445 |         if element_descriptions and last_image_b64:
446 |             for desc in element_descriptions:
447 |                 for _ in range(3):  # try 3 times
448 |                     coords = await self.predict_click(
449 |                         model=model,
450 |                         image_b64=last_image_b64,
451 |                         instruction=desc,
452 |                     )
453 |                     if coords:
454 |                         self.desc2xy[desc] = coords
455 |                         break
456 | 
457 |         # Step 6: Convert computer calls from descriptions back to xy coordinates
458 |         final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
459 | 
460 |         # Step 7: Return output and usage
461 |         return {"output": pre_output_items + final_output_items, "usage": usage}
462 | 
463 |     async def predict_click(
464 |         self,
465 |         model: str,
466 |         image_b64: str,
467 |         instruction: str,
468 |         **kwargs,
469 |     ) -> Optional[Tuple[float, float]]:
470 |         """Predict click coordinates using Moondream3's point API.
471 | 
472 |         Returns pixel coordinates (x, y) as floats.
473 |         """
474 |         img = _decode_image_b64(image_b64)
475 |         W, H = img.width, img.height
476 |         model_md = get_moondream_model()
477 |         try:
478 |             result = model_md.point(img, instruction, settings={"max_objects": 1})
479 |         except Exception:
480 |             return None
481 | 
482 |         try:
483 |             pt = (result or {}).get("points", [])[0]
484 |             x_norm = float(pt.get("x", 0.0))
485 |             y_norm = float(pt.get("y", 0.0))
486 |             x_px = max(0.0, min(float(W - 1), x_norm * W))
487 |             y_px = max(0.0, min(float(H - 1), y_norm * H))
488 |             return (x_px, y_px)
489 |         except Exception:
490 |             return None
491 | 
492 |     def get_capabilities(self) -> List[AgentCapability]:
493 |         return ["click", "step"]
494 | 
```
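For reference, a hypothetical driver for the grounding half of this loop. The screenshot file name and the composed model string are illustrative, the import path is assumed from this repo's layout, and a CUDA machine with `torch`/`transformers` installed is required by `get_moondream_model`:

```python
# Hypothetical usage sketch for Moondream3PlusConfig.predict_click.
import asyncio
import base64

from agent.loops.moondream3 import Moondream3PlusConfig  # import path assumed

async def main() -> None:
    config = Moondream3PlusConfig()
    with open("screenshot.png", "rb") as f:  # illustrative file name
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    coords = await config.predict_click(
        model="moondream3+gpt-4o",  # grounding half + thinking half
        image_b64=image_b64,
        instruction="the Submit button at the bottom of the form",
    )
    print(coords)  # pixel (x, y) floats, or None if Moondream finds no match

asyncio.run(main())
```

Note that `predict_click` only exercises the Moondream half; the thinking model named after the `+` is untouched until `predict_step`.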
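The key protocol detail in `predict_step` is that the thinking model never sees coordinates: computer calls are rewritten to element descriptions before the LLM call (steps 1-2), and the tool calls that come back carry descriptions that Moondream grounds to pixels (steps 5-6). An illustrative round trip, with made-up values:

```python
# Illustrative (made-up) tool-call arguments the thinking model might emit
# against GROUNDED_COMPUTER_TOOL_SCHEMA:
tool_call_arguments = {
    "action": "click",
    "element_description": "blue 'Submit' button below the email field",
    "button": "left",
}
# After grounding, the loop caches something like:
#   desc2xy["blue 'Submit' button below the email field"] = (412.0, 537.5)
# and convert_computer_calls_desc2xy rewrites the call back into x/y form
# before it is returned to the executor.
```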