This is page 22 of 29. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── scripts
│ │ ├── get_pyproject_version.py
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── README.md
│ │ └── test_get_pyproject_version.py
│ └── workflows
│ ├── bump-version.yml
│ ├── ci-lume.yml
│ ├── docker-publish-cua-linux.yml
│ ├── docker-publish-cua-windows.yml
│ ├── docker-publish-kasm.yml
│ ├── docker-publish-xfce.yml
│ ├── docker-reusable-publish.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── npm-publish-cli.yml
│ ├── npm-publish-computer.yml
│ ├── npm-publish-core.yml
│ ├── publish-lume.yml
│ ├── pypi-publish-agent.yml
│ ├── pypi-publish-computer-server.yml
│ ├── pypi-publish-computer.yml
│ ├── pypi-publish-core.yml
│ ├── pypi-publish-mcp-server.yml
│ ├── pypi-publish-som.yml
│ ├── pypi-reusable-publish.yml
│ ├── python-tests.yml
│ ├── test-cua-models.yml
│ └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│ ├── docs.code-workspace
│ ├── extensions.json
│ ├── launch.json
│ ├── libs-ts.code-workspace
│ ├── lume.code-workspace
│ ├── lumier.code-workspace
│ ├── py.code-workspace
│ └── settings.json
├── blog
│ ├── app-use.md
│ ├── assets
│ │ ├── composite-agents.png
│ │ ├── docker-ubuntu-support.png
│ │ ├── hack-booth.png
│ │ ├── hack-closing-ceremony.jpg
│ │ ├── hack-cua-ollama-hud.jpeg
│ │ ├── hack-leaderboard.png
│ │ ├── hack-the-north.png
│ │ ├── hack-winners.jpeg
│ │ ├── hack-workshop.jpeg
│ │ ├── hud-agent-evals.png
│ │ └── trajectory-viewer.jpeg
│ ├── bringing-computer-use-to-the-web.md
│ ├── build-your-own-operator-on-macos-1.md
│ ├── build-your-own-operator-on-macos-2.md
│ ├── cloud-windows-ga-macos-preview.md
│ ├── composite-agents.md
│ ├── computer-use-agents-for-growth-hacking.md
│ ├── cua-hackathon.md
│ ├── cua-playground-preview.md
│ ├── cua-vlm-router.md
│ ├── hack-the-north.md
│ ├── hud-agent-evals.md
│ ├── human-in-the-loop.md
│ ├── introducing-cua-cli.md
│ ├── introducing-cua-cloud-containers.md
│ ├── lume-to-containerization.md
│ ├── neurips-2025-cua-papers.md
│ ├── sandboxed-python-execution.md
│ ├── training-computer-use-models-trajectories-1.md
│ ├── trajectory-viewer.md
│ ├── ubuntu-docker-support.md
│ └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│ ├── .env.example
│ ├── .gitignore
│ ├── content
│ │ └── docs
│ │ ├── agent-sdk
│ │ │ ├── agent-loops.mdx
│ │ │ ├── benchmarks
│ │ │ │ ├── index.mdx
│ │ │ │ ├── interactive.mdx
│ │ │ │ ├── introduction.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── osworld-verified.mdx
│ │ │ │ ├── screenspot-pro.mdx
│ │ │ │ └── screenspot-v2.mdx
│ │ │ ├── callbacks
│ │ │ │ ├── agent-lifecycle.mdx
│ │ │ │ ├── cost-saving.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── pii-anonymization.mdx
│ │ │ │ └── trajectories.mdx
│ │ │ ├── chat-history.mdx
│ │ │ ├── custom-tools.mdx
│ │ │ ├── customizing-computeragent.mdx
│ │ │ ├── integrations
│ │ │ │ ├── hud.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── observability.mdx
│ │ │ ├── mcp-server
│ │ │ │ ├── client-integrations.mdx
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── llm-integrations.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── tools.mdx
│ │ │ │ └── usage.mdx
│ │ │ ├── message-format.mdx
│ │ │ ├── meta.json
│ │ │ ├── migration-guide.mdx
│ │ │ ├── prompt-caching.mdx
│ │ │ ├── supported-agents
│ │ │ │ ├── composed-agents.mdx
│ │ │ │ ├── computer-use-agents.mdx
│ │ │ │ ├── grounding-models.mdx
│ │ │ │ ├── human-in-the-loop.mdx
│ │ │ │ └── meta.json
│ │ │ ├── supported-model-providers
│ │ │ │ ├── cua-vlm-router.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ └── local-models.mdx
│ │ │ ├── telemetry.mdx
│ │ │ └── usage-tracking.mdx
│ │ ├── cli-playbook
│ │ │ ├── commands.mdx
│ │ │ ├── index.mdx
│ │ │ └── meta.json
│ │ ├── computer-sdk
│ │ │ ├── cloud-vm-management.mdx
│ │ │ ├── commands.mdx
│ │ │ ├── computer-server
│ │ │ │ ├── Commands.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── REST-API.mdx
│ │ │ │ └── WebSocket-API.mdx
│ │ │ ├── computer-ui.mdx
│ │ │ ├── computers.mdx
│ │ │ ├── custom-computer-handlers.mdx
│ │ │ ├── meta.json
│ │ │ ├── sandboxed-python.mdx
│ │ │ └── tracing-api.mdx
│ │ ├── example-usecases
│ │ │ ├── form-filling.mdx
│ │ │ ├── gemini-complex-ui-navigation.mdx
│ │ │ ├── meta.json
│ │ │ ├── post-event-contact-export.mdx
│ │ │ └── windows-app-behind-vpn.mdx
│ │ ├── get-started
│ │ │ ├── meta.json
│ │ │ └── quickstart.mdx
│ │ ├── index.mdx
│ │ ├── macos-vm-cli-playbook
│ │ │ ├── lume
│ │ │ │ ├── cli-reference.mdx
│ │ │ │ ├── faq.md
│ │ │ │ ├── http-api.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── prebuilt-images.mdx
│ │ │ ├── lumier
│ │ │ │ ├── building-lumier.mdx
│ │ │ │ ├── docker-compose.mdx
│ │ │ │ ├── docker.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ └── meta.json
│ │ │ └── meta.json
│ │ └── meta.json
│ ├── next.config.mjs
│ ├── package-lock.json
│ ├── package.json
│ ├── pnpm-lock.yaml
│ ├── postcss.config.mjs
│ ├── public
│ │ └── img
│ │ ├── agent_gradio_ui.png
│ │ ├── agent.png
│ │ ├── bg-dark.jpg
│ │ ├── bg-light.jpg
│ │ ├── cli.png
│ │ ├── computer.png
│ │ ├── grounding-with-gemini3.gif
│ │ ├── hero.png
│ │ ├── laminar_trace_example.png
│ │ ├── som_box_threshold.png
│ │ └── som_iou_threshold.png
│ ├── README.md
│ ├── source.config.ts
│ ├── src
│ │ ├── app
│ │ │ ├── (home)
│ │ │ │ ├── [[...slug]]
│ │ │ │ │ └── page.tsx
│ │ │ │ └── layout.tsx
│ │ │ ├── api
│ │ │ │ ├── posthog
│ │ │ │ │ └── [...path]
│ │ │ │ │ └── route.ts
│ │ │ │ └── search
│ │ │ │ └── route.ts
│ │ │ ├── favicon.ico
│ │ │ ├── global.css
│ │ │ ├── layout.config.tsx
│ │ │ ├── layout.tsx
│ │ │ ├── llms.mdx
│ │ │ │ └── [[...slug]]
│ │ │ │ └── route.ts
│ │ │ ├── llms.txt
│ │ │ │ └── route.ts
│ │ │ ├── robots.ts
│ │ │ └── sitemap.ts
│ │ ├── assets
│ │ │ ├── discord-black.svg
│ │ │ ├── discord-white.svg
│ │ │ ├── logo-black.svg
│ │ │ └── logo-white.svg
│ │ ├── components
│ │ │ ├── analytics-tracker.tsx
│ │ │ ├── cookie-consent.tsx
│ │ │ ├── doc-actions-menu.tsx
│ │ │ ├── editable-code-block.tsx
│ │ │ ├── footer.tsx
│ │ │ ├── hero.tsx
│ │ │ ├── iou.tsx
│ │ │ ├── mermaid.tsx
│ │ │ └── page-feedback.tsx
│ │ ├── lib
│ │ │ ├── llms.ts
│ │ │ └── source.ts
│ │ ├── mdx-components.tsx
│ │ └── providers
│ │ └── posthog-provider.tsx
│ └── tsconfig.json
├── examples
│ ├── agent_examples.py
│ ├── agent_ui_examples.py
│ ├── browser_agent_example.py
│ ├── browser_tool_example.py
│ ├── cloud_api_examples.py
│ ├── computer_examples_windows.py
│ ├── computer_examples.py
│ ├── computer_ui_examples.py
│ ├── computer-example-ts
│ │ ├── .env.example
│ │ ├── .gitignore
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── README.md
│ │ ├── src
│ │ │ ├── helpers.ts
│ │ │ └── index.ts
│ │ └── tsconfig.json
│ ├── docker_examples.py
│ ├── evals
│ │ ├── hud_eval_examples.py
│ │ └── wikipedia_most_linked.txt
│ ├── pylume_examples.py
│ ├── sandboxed_functions_examples.py
│ ├── som_examples.py
│ ├── tracing_examples.py
│ ├── utils.py
│ └── winsandbox_example.py
├── img
│ ├── agent_gradio_ui.png
│ ├── agent.png
│ ├── cli.png
│ ├── computer.png
│ ├── logo_black.png
│ └── logo_white.png
├── libs
│ ├── kasm
│ │ ├── Dockerfile
│ │ ├── LICENSE
│ │ ├── README.md
│ │ └── src
│ │ └── ubuntu
│ │ └── install
│ │ └── firefox
│ │ ├── custom_startup.sh
│ │ ├── firefox.desktop
│ │ └── install_firefox.sh
│ ├── lume
│ │ ├── .cursorignore
│ │ ├── CONTRIBUTING.md
│ │ ├── Development.md
│ │ ├── img
│ │ │ └── cli.png
│ │ ├── Package.resolved
│ │ ├── Package.swift
│ │ ├── README.md
│ │ ├── resources
│ │ │ └── lume.entitlements
│ │ ├── scripts
│ │ │ ├── build
│ │ │ │ ├── build-debug.sh
│ │ │ │ ├── build-release-notarized.sh
│ │ │ │ └── build-release.sh
│ │ │ └── install.sh
│ │ ├── src
│ │ │ ├── Commands
│ │ │ │ ├── Clone.swift
│ │ │ │ ├── Config.swift
│ │ │ │ ├── Create.swift
│ │ │ │ ├── Delete.swift
│ │ │ │ ├── Get.swift
│ │ │ │ ├── Images.swift
│ │ │ │ ├── IPSW.swift
│ │ │ │ ├── List.swift
│ │ │ │ ├── Logs.swift
│ │ │ │ ├── Options
│ │ │ │ │ └── FormatOption.swift
│ │ │ │ ├── Prune.swift
│ │ │ │ ├── Pull.swift
│ │ │ │ ├── Push.swift
│ │ │ │ ├── Run.swift
│ │ │ │ ├── Serve.swift
│ │ │ │ ├── Set.swift
│ │ │ │ └── Stop.swift
│ │ │ ├── ContainerRegistry
│ │ │ │ ├── ImageContainerRegistry.swift
│ │ │ │ ├── ImageList.swift
│ │ │ │ └── ImagesPrinter.swift
│ │ │ ├── Errors
│ │ │ │ └── Errors.swift
│ │ │ ├── FileSystem
│ │ │ │ ├── Home.swift
│ │ │ │ ├── Settings.swift
│ │ │ │ ├── VMConfig.swift
│ │ │ │ ├── VMDirectory.swift
│ │ │ │ └── VMLocation.swift
│ │ │ ├── LumeController.swift
│ │ │ ├── Main.swift
│ │ │ ├── Server
│ │ │ │ ├── Handlers.swift
│ │ │ │ ├── HTTP.swift
│ │ │ │ ├── Requests.swift
│ │ │ │ ├── Responses.swift
│ │ │ │ └── Server.swift
│ │ │ ├── Utils
│ │ │ │ ├── CommandRegistry.swift
│ │ │ │ ├── CommandUtils.swift
│ │ │ │ ├── Logger.swift
│ │ │ │ ├── NetworkUtils.swift
│ │ │ │ ├── Path.swift
│ │ │ │ ├── ProcessRunner.swift
│ │ │ │ ├── ProgressLogger.swift
│ │ │ │ ├── String.swift
│ │ │ │ └── Utils.swift
│ │ │ ├── Virtualization
│ │ │ │ ├── DarwinImageLoader.swift
│ │ │ │ ├── DHCPLeaseParser.swift
│ │ │ │ ├── ImageLoaderFactory.swift
│ │ │ │ └── VMVirtualizationService.swift
│ │ │ ├── VM
│ │ │ │ ├── DarwinVM.swift
│ │ │ │ ├── LinuxVM.swift
│ │ │ │ ├── VM.swift
│ │ │ │ ├── VMDetails.swift
│ │ │ │ ├── VMDetailsPrinter.swift
│ │ │ │ ├── VMDisplayResolution.swift
│ │ │ │ └── VMFactory.swift
│ │ │ └── VNC
│ │ │ ├── PassphraseGenerator.swift
│ │ │ └── VNCService.swift
│ │ └── tests
│ │ ├── Mocks
│ │ │ ├── MockVM.swift
│ │ │ ├── MockVMVirtualizationService.swift
│ │ │ └── MockVNCService.swift
│ │ ├── VM
│ │ │ └── VMDetailsPrinterTests.swift
│ │ ├── VMTests.swift
│ │ ├── VMVirtualizationServiceTests.swift
│ │ └── VNCServiceTests.swift
│ ├── lumier
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── bin
│ │ │ └── entry.sh
│ │ ├── config
│ │ │ └── constants.sh
│ │ ├── hooks
│ │ │ └── on-logon.sh
│ │ └── lib
│ │ ├── utils.sh
│ │ └── vm.sh
│ ├── python
│ │ ├── agent
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── agent
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── adapters
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── azure_ml_adapter.py
│ │ │ │ │ ├── cua_adapter.py
│ │ │ │ │ ├── huggingfacelocal_adapter.py
│ │ │ │ │ ├── human_adapter.py
│ │ │ │ │ ├── mlxvlm_adapter.py
│ │ │ │ │ └── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ └── qwen2_5_vl.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── callbacks
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── budget_manager.py
│ │ │ │ │ ├── image_retention.py
│ │ │ │ │ ├── logging.py
│ │ │ │ │ ├── operator_validator.py
│ │ │ │ │ ├── pii_anonymization.py
│ │ │ │ │ ├── prompt_instructions.py
│ │ │ │ │ ├── telemetry.py
│ │ │ │ │ └── trajectory_saver.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── computers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cua.py
│ │ │ │ │ └── custom.py
│ │ │ │ ├── decorators.py
│ │ │ │ ├── human_tool
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── server.py
│ │ │ │ │ └── ui.py
│ │ │ │ ├── integrations
│ │ │ │ │ └── hud
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── agent.py
│ │ │ │ │ └── proxy.py
│ │ │ │ ├── loops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── anthropic.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── composed_grounded.py
│ │ │ │ │ ├── fara.py
│ │ │ │ │ ├── gelato.py
│ │ │ │ │ ├── gemini.py
│ │ │ │ │ ├── generic_vlm.py
│ │ │ │ │ ├── glm45v.py
│ │ │ │ │ ├── gta1.py
│ │ │ │ │ ├── holo.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── model_types.csv
│ │ │ │ │ ├── moondream3.py
│ │ │ │ │ ├── omniparser.py
│ │ │ │ │ ├── openai.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ ├── uiins.py
│ │ │ │ │ ├── uitars.py
│ │ │ │ │ └── uitars2.py
│ │ │ │ ├── playground
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── server.py
│ │ │ │ ├── proxy
│ │ │ │ │ ├── examples.py
│ │ │ │ │ └── handlers.py
│ │ │ │ ├── responses.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── browser_tool.py
│ │ │ │ ├── types.py
│ │ │ │ └── ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ └── gradio
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ └── ui_components.py
│ │ │ ├── benchmarks
│ │ │ │ ├── .gitignore
│ │ │ │ ├── contrib.md
│ │ │ │ ├── interactive.py
│ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── gta1.py
│ │ │ │ ├── README.md
│ │ │ │ ├── ss-pro.py
│ │ │ │ ├── ss-v2.py
│ │ │ │ └── utils.py
│ │ │ ├── example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer_agent.py
│ │ ├── bench-ui
│ │ │ ├── bench_ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ └── child.py
│ │ │ ├── examples
│ │ │ │ ├── folder_example.py
│ │ │ │ ├── gui
│ │ │ │ │ ├── index.html
│ │ │ │ │ ├── logo.svg
│ │ │ │ │ └── styles.css
│ │ │ │ ├── output_overlay.png
│ │ │ │ └── simple_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ └── test_port_detection.py
│ │ ├── computer
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── computer.py
│ │ │ │ ├── diorama_computer.py
│ │ │ │ ├── helpers.py
│ │ │ │ ├── interface
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── models.py
│ │ │ │ ├── providers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cloud
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── docker
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── lume
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── lume_api.py
│ │ │ │ │ ├── lumier
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── types.py
│ │ │ │ │ └── winsandbox
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── provider.py
│ │ │ │ │ └── setup_script.ps1
│ │ │ │ ├── tracing_wrapper.py
│ │ │ │ ├── tracing.py
│ │ │ │ ├── ui
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ └── gradio
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── app.py
│ │ │ │ └── utils.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ ├── test_computer.py
│ │ │ └── test_helpers.py
│ │ ├── computer-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── browser.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── diorama
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── diorama_computer.py
│ │ │ │ │ ├── diorama.py
│ │ │ │ │ ├── draw.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── safezone.py
│ │ │ │ ├── handlers
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── main.py
│ │ │ │ ├── server.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wallpaper.py
│ │ │ │ └── watchdog.py
│ │ │ ├── examples
│ │ │ │ ├── __init__.py
│ │ │ │ └── usage_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ ├── run_server.py
│ │ │ ├── test_connection.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_server.py
│ │ ├── core
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── core
│ │ │ │ ├── __init__.py
│ │ │ │ └── telemetry
│ │ │ │ ├── __init__.py
│ │ │ │ └── posthog.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_telemetry.py
│ │ ├── mcp-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── build-extension.py
│ │ │ ├── CONCURRENT_SESSIONS.md
│ │ │ ├── desktop-extension
│ │ │ │ ├── cua-extension.mcpb
│ │ │ │ ├── desktop_extension.png
│ │ │ │ ├── manifest.json
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_server.sh
│ │ │ │ └── setup.py
│ │ │ ├── mcp_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── server.py
│ │ │ │ └── session_manager.py
│ │ │ ├── pdm.lock
│ │ │ ├── pyproject.toml
│ │ │ ├── QUICK_TEST_COMMANDS.sh
│ │ │ ├── quick_test_local_option.py
│ │ │ ├── README.md
│ │ │ ├── scripts
│ │ │ │ ├── install_mcp_server.sh
│ │ │ │ └── start_mcp_server.sh
│ │ │ ├── test_mcp_server_local_option.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_mcp_server.py
│ │ ├── pylume
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_pylume.py
│ │ └── som
│ │ ├── .bumpversion.cfg
│ │ ├── LICENSE
│ │ ├── poetry.toml
│ │ ├── pyproject.toml
│ │ ├── README.md
│ │ ├── som
│ │ │ ├── __init__.py
│ │ │ ├── detect.py
│ │ │ ├── detection.py
│ │ │ ├── models.py
│ │ │ ├── ocr.py
│ │ │ ├── util
│ │ │ │ └── utils.py
│ │ │ └── visualization.py
│ │ └── tests
│ │ ├── conftest.py
│ │ └── test_omniparser.py
│ ├── qemu-docker
│ │ ├── linux
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── src
│ │ │ ├── entry.sh
│ │ │ └── vm
│ │ │ ├── image
│ │ │ │ └── README.md
│ │ │ └── setup
│ │ │ ├── install.sh
│ │ │ ├── setup-cua-server.sh
│ │ │ └── setup.sh
│ │ ├── README.md
│ │ └── windows
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── entry.sh
│ │ └── vm
│ │ ├── image
│ │ │ └── README.md
│ │ └── setup
│ │ ├── install.bat
│ │ ├── on-logon.ps1
│ │ ├── setup-cua-server.ps1
│ │ ├── setup-utils.psm1
│ │ └── setup.ps1
│ ├── typescript
│ │ ├── .gitignore
│ │ ├── .nvmrc
│ │ ├── agent
│ │ │ ├── examples
│ │ │ │ ├── playground-example.html
│ │ │ │ └── README.md
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── client.ts
│ │ │ │ ├── index.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ └── client.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── computer
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── computer
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── providers
│ │ │ │ │ │ ├── base.ts
│ │ │ │ │ │ ├── cloud.ts
│ │ │ │ │ │ └── index.ts
│ │ │ │ │ └── types.ts
│ │ │ │ ├── index.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── base.ts
│ │ │ │ │ ├── factory.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── linux.ts
│ │ │ │ │ ├── macos.ts
│ │ │ │ │ └── windows.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ ├── computer
│ │ │ │ │ └── cloud.test.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── factory.test.ts
│ │ │ │ │ ├── index.test.ts
│ │ │ │ │ ├── linux.test.ts
│ │ │ │ │ ├── macos.test.ts
│ │ │ │ │ └── windows.test.ts
│ │ │ │ └── setup.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── core
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── index.ts
│ │ │ │ └── telemetry
│ │ │ │ ├── clients
│ │ │ │ │ ├── index.ts
│ │ │ │ │ └── posthog.ts
│ │ │ │ └── index.ts
│ │ │ ├── tests
│ │ │ │ └── telemetry.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── cua-cli
│ │ │ ├── .gitignore
│ │ │ ├── .prettierrc
│ │ │ ├── bun.lock
│ │ │ ├── CLAUDE.md
│ │ │ ├── index.ts
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── auth.ts
│ │ │ │ ├── cli.ts
│ │ │ │ ├── commands
│ │ │ │ │ ├── auth.ts
│ │ │ │ │ └── sandbox.ts
│ │ │ │ ├── config.ts
│ │ │ │ ├── http.ts
│ │ │ │ ├── storage.ts
│ │ │ │ └── util.ts
│ │ │ └── tsconfig.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── pnpm-workspace.yaml
│ │ └── README.md
│ └── xfce
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Development.md
│ ├── Dockerfile
│ ├── Dockerfile.dev
│ ├── README.md
│ └── src
│ ├── scripts
│ │ ├── resize-display.sh
│ │ ├── start-computer-server.sh
│ │ ├── start-novnc.sh
│ │ ├── start-vnc.sh
│ │ └── xstartup.sh
│ ├── supervisor
│ │ └── supervisord.conf
│ └── xfce-config
│ ├── helpers.rc
│ ├── xfce4-power-manager.xml
│ └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│ ├── agent_nb.ipynb
│ ├── blog
│ │ ├── build-your-own-operator-on-macos-1.ipynb
│ │ └── build-your-own-operator-on-macos-2.ipynb
│ ├── composite_agents_docker_nb.ipynb
│ ├── computer_nb.ipynb
│ ├── computer_server_nb.ipynb
│ ├── customizing_computeragent.ipynb
│ ├── eval_osworld.ipynb
│ ├── ollama_nb.ipynb
│ ├── README.md
│ ├── sota_hackathon_cloud.ipynb
│ └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── install-cli.ps1
│ ├── install-cli.sh
│ ├── playground-docker.sh
│ ├── playground.sh
│ ├── run-docker-dev.sh
│ └── typescript-typecheck.js
├── TESTING.md
├── tests
│ ├── agent_loop_testing
│ │ ├── agent_test.py
│ │ └── README.md
│ ├── pytest.ini
│ ├── shell_cmd.py
│ ├── test_files.py
│ ├── test_mcp_server_session_management.py
│ ├── test_mcp_server_streaming.py
│ ├── test_shell_bash.py
│ ├── test_telemetry.py
│ ├── test_tracing.py
│ ├── test_venv.py
│ └── test_watchdog.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/libs/python/agent/agent/human_tool/ui.py:
--------------------------------------------------------------------------------
```python
1 | import base64
2 | import io
3 | import json
4 | import time
5 | from datetime import datetime
6 | from typing import Any, Dict, List, Optional
7 |
8 | import gradio as gr
9 | import requests
10 | from PIL import Image
11 |
12 | from .server import completion_queue
13 |
14 |
15 | class HumanCompletionUI:
16 | def __init__(self, server_url: str = "http://localhost:8002"):
17 | self.server_url = server_url
18 | self.current_call_id: Optional[str] = None
19 | self.refresh_interval = 2.0 # seconds
20 | self.last_image = None # Store the last image for display
21 | # Track current interactive action controls
22 | self.current_action_type: str = "click"
23 | self.current_button: str = "left"
24 | self.current_scroll_x: int = 0
25 | self.current_scroll_y: int = -120
26 |
    def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Format messages for display in gr.Chatbot with type='messages'.

        Converts OpenAI-style chat messages into gr.Chatbot message dicts.
        Roles are deliberately INVERTED for this human-in-the-loop UI: what
        the model sent ("user") renders as "assistant", and everything else
        renders as "user" (net mapping: user->assistant, assistant->user,
        system->user, any other role->assistant).

        Args:
            messages: Chat messages; each may carry "role", "content"
                (a string or a list of text/image_url parts), and
                "tool_calls" (OpenAI function-call format).

        Returns:
            A list of {"role", "content"[, "metadata"]} dicts; each tool
            call becomes its own message with a "🛠️ Used <name>" title.
        """
        formatted = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            tool_calls = msg.get("tool_calls", [])

            # Handle different content formats
            if isinstance(content, list):
                # Multi-modal content - can include text and images
                formatted_content = []
                for item in content:
                    if item.get("type") == "text":
                        text = item.get("text", "")
                        if text.strip():  # Only add non-empty text
                            formatted_content.append(text)
                    elif item.get("type") == "image_url":
                        image_url = item.get("image_url", {}).get("url", "")
                        if image_url:
                            # Check if it's a base64 image or URL
                            if image_url.startswith("data:image"):
                                # For base64 images, decode and create gr.Image
                                try:
                                    header, data = image_url.split(",", 1)
                                    image_data = base64.b64decode(data)
                                    image = Image.open(io.BytesIO(image_data))
                                    formatted_content.append(gr.Image(value=image))
                                except Exception as e:
                                    # Corrupt data URL: show the error inline instead of dropping it.
                                    print(f"Error loading image: {e}")
                                    formatted_content.append(f"[Image loading error: {e}]")
                            else:
                                # For URL images, create gr.Image with URL
                                formatted_content.append(gr.Image(value=image_url))

                # Collapse to a single item when possible; a bare list is kept
                # only when there are multiple parts.
                if len(formatted_content) == 1:
                    content = formatted_content[0]
                elif len(formatted_content) > 1:
                    content = formatted_content
                else:
                    content = "[Empty content]"

            # Normalize unknown roles first so Gradio only ever sees
            # "user"/"assistant" ("system" -> assistant, anything else -> user).
            if role not in ["user", "assistant"]:
                role = "assistant" if role == "system" else "user"

            # Invert roles for better display in human UI context
            # (what the AI says becomes "user", what human should respond becomes "assistant")
            if role == "user":
                role = "assistant"
            else:
                role = "user"

            # Add the main message if it has content
            if content and str(content).strip():
                formatted.append({"role": role, "content": content})

            # Handle tool calls - create separate messages for each tool call,
            # carrying the (already inverted) role of the parent message.
            if tool_calls:
                for tool_call in tool_calls:
                    function_name = tool_call.get("function", {}).get("name", "unknown")
                    arguments_str = tool_call.get("function", {}).get("arguments", "{}")

                    try:
                        # Parse arguments to format them nicely
                        arguments = json.loads(arguments_str)
                        formatted_args = json.dumps(arguments, indent=2)
                    except json.JSONDecodeError:
                        # If parsing fails, use the raw string
                        formatted_args = arguments_str

                    # Render the call arguments as a fenced JSON code block.
                    tool_call_content = f"```json\n{formatted_args}\n```"

                    formatted.append(
                        {
                            "role": role,
                            "content": tool_call_content,
                            "metadata": {"title": f"🛠️ Used {function_name}"},
                        }
                    )

        return formatted
111 |
112 | def get_pending_calls(self) -> List[Dict[str, Any]]:
113 | """Get pending calls from the server."""
114 | try:
115 | response = requests.get(f"{self.server_url}/pending", timeout=5)
116 | if response.status_code == 200:
117 | return response.json().get("pending_calls", [])
118 | except Exception as e:
119 | print(f"Error fetching pending calls: {e}")
120 | return []
121 |
122 | def complete_call_with_response(self, call_id: str, response: str) -> bool:
123 | """Complete a call with a text response."""
124 | try:
125 | response_data = {"response": response}
126 | response_obj = requests.post(
127 | f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
128 | )
129 | response_obj.raise_for_status()
130 | return True
131 | except requests.RequestException as e:
132 | print(f"Error completing call: {e}")
133 | return False
134 |
135 | def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
136 | """Complete a call with tool calls."""
137 | try:
138 | response_data = {"tool_calls": tool_calls}
139 | response_obj = requests.post(
140 | f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
141 | )
142 | response_obj.raise_for_status()
143 | return True
144 | except requests.RequestException as e:
145 | print(f"Error completing call: {e}")
146 | return False
147 |
148 | def complete_call(
149 | self,
150 | call_id: str,
151 | response: Optional[str] = None,
152 | tool_calls: Optional[List[Dict[str, Any]]] = None,
153 | ) -> bool:
154 | """Complete a call with either a response or tool calls."""
155 | try:
156 | response_data = {}
157 | if response:
158 | response_data["response"] = response
159 | if tool_calls:
160 | response_data["tool_calls"] = tool_calls
161 |
162 | response_obj = requests.post(
163 | f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
164 | )
165 | response_obj.raise_for_status()
166 | return True
167 | except requests.RequestException as e:
168 | print(f"Error completing call: {e}")
169 | return False
170 |
171 | def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
172 | """Extract the last image from the messages for display above conversation."""
173 | last_image = None
174 |
175 | for msg in reversed(messages): # Start from the last message
176 | content = msg.get("content", "")
177 |
178 | if isinstance(content, list):
179 | for item in reversed(content): # Get the last image in the message
180 | if item.get("type") == "image_url":
181 | image_url = item.get("image_url", {}).get("url", "")
182 | if image_url:
183 | if image_url.startswith("data:image"):
184 | # For base64 images, create a gr.Image component
185 | try:
186 | header, data = image_url.split(",", 1)
187 | image_data = base64.b64decode(data)
188 | image = Image.open(io.BytesIO(image_data))
189 | return image
190 | except Exception as e:
191 | print(f"Error loading image: {e}")
192 | continue
193 | else:
194 | # For URL images, return the URL
195 | return image_url
196 |
197 | return last_image
198 |
199 | def refresh_pending_calls(self):
200 | """Refresh the list of pending calls."""
201 | pending_calls = self.get_pending_calls()
202 |
203 | if not pending_calls:
204 | return (
205 | gr.update(choices=["latest"], value="latest"), # dropdown
206 | gr.update(value=None), # image (no image)
207 | gr.update(value=[]), # chatbot (empty messages)
208 | gr.update(interactive=False), # submit button
209 | gr.update(visible=False), # click_actions_group hidden
210 | gr.update(visible=False), # actions_group hidden
211 | )
212 |
213 | # Sort pending calls by created_at to get oldest first
214 | sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
215 |
216 | # Create choices for dropdown
217 | choices = [("latest", "latest")] # Add "latest" option first
218 |
219 | for call in sorted_calls:
220 | call_id = call["id"]
221 | model = call.get("model", "unknown")
222 | created_at = call.get("created_at", "")
223 | # Format timestamp
224 | try:
225 | dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
226 | time_str = dt.strftime("%H:%M:%S")
227 | except:
228 | time_str = created_at
229 |
230 | choice_label = f"{call_id[:8]}... ({model}) - {time_str}"
231 | choices.append((choice_label, call_id))
232 |
233 | # Default to "latest" which shows the oldest pending conversation
234 | selected_call_id = "latest"
235 | if selected_call_id == "latest" and sorted_calls:
236 | # Use the oldest call (first in sorted list)
237 | selected_call = sorted_calls[0]
238 | conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
239 | self.current_call_id = selected_call["id"]
240 | # Get the last image from messages
241 | self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
242 | else:
243 | conversation = []
244 | self.current_call_id = None
245 | self.last_image = None
246 |
247 | return (
248 | gr.update(choices=choices, value="latest"),
249 | gr.update(value=self.last_image),
250 | gr.update(value=conversation),
251 | gr.update(interactive=bool(choices)),
252 | gr.update(visible=True), # click_actions_group visible when there is a call
253 | gr.update(visible=True), # actions_group visible when there is a call
254 | )
255 |
256 | def on_call_selected(self, selected_choice):
257 | """Handle when a call is selected from the dropdown."""
258 | if not selected_choice:
259 | return (
260 | gr.update(value=None), # no image
261 | gr.update(value=[]), # empty chatbot
262 | gr.update(interactive=False),
263 | gr.update(visible=False), # click_actions_group hidden
264 | gr.update(visible=False), # actions_group hidden
265 | )
266 |
267 | pending_calls = self.get_pending_calls()
268 | if not pending_calls:
269 | return (
270 | gr.update(value=None), # no image
271 | gr.update(value=[]), # empty chatbot
272 | gr.update(interactive=False),
273 | gr.update(visible=False), # click_actions_group hidden
274 | gr.update(visible=False), # actions_group hidden
275 | )
276 |
277 | # Handle "latest" option
278 | if selected_choice == "latest":
279 | # Sort calls by created_at to get oldest first
280 | sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
281 | selected_call = sorted_calls[0] # Get the oldest call
282 | call_id = selected_call["id"]
283 | else:
284 | # Extract call_id from the choice for specific calls
285 | call_id = None
286 | for call in pending_calls:
287 | call_id_short = call["id"][:8]
288 | if call_id_short in selected_choice:
289 | call_id = call["id"]
290 | break
291 |
292 | if not call_id:
293 | return (
294 | gr.update(value=None), # no image
295 | gr.update(value=[]), # empty chatbot
296 | gr.update(interactive=False),
297 | )
298 |
299 | # Find the selected call
300 | selected_call = next((c for c in pending_calls if c["id"] == call_id), None)
301 |
302 | if not selected_call:
303 | return (
304 | gr.update(value=None), # no image
305 | gr.update(value=[]), # empty chatbot
306 | gr.update(interactive=False),
307 | gr.update(visible=False), # click_actions_group hidden
308 | gr.update(visible=False), # actions_group hidden
309 | )
310 |
311 | conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
312 | self.current_call_id = call_id
313 | # Get the last image from messages
314 | self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
315 |
316 | return (
317 | gr.update(value=self.last_image),
318 | gr.update(value=conversation),
319 | gr.update(interactive=True),
320 | gr.update(visible=True), # click_actions_group visible
321 | gr.update(visible=True), # actions_group visible
322 | )
323 |
324 | def submit_response(self, response_text: str):
325 | """Submit a text response to the current call."""
326 | if not self.current_call_id:
327 | return (
328 | gr.update(value=response_text), # keep response text
329 | gr.update(value="❌ No call selected"), # status
330 | )
331 |
332 | if not response_text.strip():
333 | return (
334 | gr.update(value=response_text), # keep response text
335 | gr.update(value="❌ Response cannot be empty"), # status
336 | )
337 |
338 | success = self.complete_call_with_response(self.current_call_id, response_text)
339 |
340 | if success:
341 | status_msg = "✅ Response submitted successfully!"
342 | return (
343 | gr.update(value=""), # clear response text
344 | gr.update(value=status_msg), # status
345 | )
346 | else:
347 | return (
348 | gr.update(value=response_text), # keep response text
349 | gr.update(value="❌ Failed to submit response"), # status
350 | )
351 |
352 | def submit_action(self, action_type: str, **kwargs) -> str:
353 | """Submit a computer action as a tool call."""
354 | if not self.current_call_id:
355 | return "❌ No call selected"
356 |
357 | import uuid
358 |
359 | # Create tool call structure
360 | action_data = {"type": action_type, **kwargs}
361 | tool_call = {
362 | "id": f"call_{uuid.uuid4().hex[:24]}",
363 | "type": "function",
364 | "function": {"name": "computer", "arguments": json.dumps(action_data)},
365 | }
366 |
367 | success = self.complete_call_with_tool_calls(self.current_call_id, [tool_call])
368 |
369 | if success:
370 | return f"✅ {action_type.capitalize()} action submitted as tool call"
371 | else:
372 | return f"❌ Failed to submit {action_type} action"
373 |
374 | def submit_click_action(
375 | self, x: int, y: int, action_type: str = "click", button: str = "left"
376 | ) -> str:
377 | """Submit a coordinate-based action."""
378 | if action_type == "click":
379 | return self.submit_action(action_type, x=x, y=y, button=button)
380 | else:
381 | return self.submit_action(action_type, x=x, y=y)
382 |
383 | def submit_type_action(self, text: str) -> str:
384 | """Submit a type action."""
385 | return self.submit_action("type", text=text)
386 |
387 | def submit_hotkey_action(self, keys: str) -> str:
388 | """Submit a hotkey action."""
389 | return self.submit_action("keypress", keys=keys)
390 |
391 | def submit_wait_action(self) -> str:
392 | """Submit a wait action with no kwargs."""
393 | return self.submit_action("wait")
394 |
395 | def submit_description_click(
396 | self, description: str, action_type: str = "click", button: str = "left"
397 | ) -> str:
398 | """Submit a description-based action."""
399 | if action_type == "click":
400 | return self.submit_action(action_type, element_description=description, button=button)
401 | else:
402 | return self.submit_action(action_type, element_description=description)
403 |
404 | def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
405 | """Wait for pending calls to appear or until max_seconds elapsed.
406 |
407 | This method loops and checks for pending calls at regular intervals,
408 | returning as soon as a pending call is found or the maximum wait time is reached.
409 |
410 | Args:
411 | max_seconds: Maximum number of seconds to wait
412 | check_interval: How often to check for pending calls (in seconds)
413 | """
414 | import time
415 |
416 | start_time = time.time()
417 |
418 | while time.time() - start_time < max_seconds:
419 | # Check if there are any pending calls
420 | pending_calls = self.get_pending_calls()
421 | if pending_calls:
422 | # Found pending calls, return immediately
423 | return self.refresh_pending_calls()
424 |
425 | # Wait before checking again
426 | time.sleep(check_interval)
427 |
428 | # Max wait time reached, return current state
429 | return self.refresh_pending_calls()
430 |
431 |
def create_ui():
    """Create the Gradio interface.

    Builds a two-column layout: the left column shows the latest screenshot
    (clickable, to send coordinate actions) and the conversation; the right
    column holds the pending-call selector, a free-text response box, and
    tabbed action forms. All event handlers delegate to a shared
    HumanCompletionUI instance. Returns the gr.Blocks app.
    """
    ui_handler = HumanCompletionUI()

    with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
        gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
        gr.Markdown("Review AI conversation requests and provide human responses.")

        with gr.Row():
            with gr.Column(scale=2):
                with gr.Group():
                    screenshot_image = gr.Image(
                        label="Interactive Screenshot", interactive=False, height=600
                    )

                    # Action type selection for image clicks (wrapped for visibility control)
                    with gr.Group(visible=False) as click_actions_group:
                        with gr.Row():
                            action_type_radio = gr.Dropdown(
                                label="Interactive Action",
                                choices=[
                                    "click",
                                    "double_click",
                                    "move",
                                    "left_mouse_up",
                                    "left_mouse_down",
                                    "scroll",
                                ],
                                value="click",
                                scale=2,
                            )
                            action_button_radio = gr.Dropdown(
                                label="Button",
                                choices=["left", "right", "wheel", "back", "forward"],
                                value="left",
                                visible=True,
                                scale=1,
                            )
                            # Scroll deltas only apply when "scroll" is chosen;
                            # toggle_action_controls flips their visibility.
                            scroll_x_input = gr.Number(
                                label="scroll_x", value=0, visible=False, scale=1
                            )
                            scroll_y_input = gr.Number(
                                label="scroll_y", value=-120, visible=False, scale=1
                            )

                conversation_chatbot = gr.Chatbot(
                    label="Conversation", type="messages", height=500, show_copy_button=True
                )

            with gr.Column(scale=1):
                with gr.Group():
                    call_dropdown = gr.Dropdown(
                        label="Select a pending conversation request",
                        choices=["latest"],
                        interactive=True,
                        value="latest",
                    )
                    refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
                    status_display = gr.Textbox(
                        label="Status", interactive=False, value="Ready to receive requests..."
                    )

                with gr.Group():
                    response_text = gr.Textbox(
                        label="Message", lines=3, placeholder="Enter your message here..."
                    )
                    submit_btn = gr.Button(
                        "📤 Submit Message", variant="primary", interactive=False
                    )

                # Action Accordions (wrapped for visibility control)
                with gr.Group(visible=False) as actions_group:
                    with gr.Tabs():
                        with gr.Tab("🖱️ Click Actions"):
                            with gr.Group():
                                description_text = gr.Textbox(
                                    label="Element Description",
                                    placeholder="e.g., 'Privacy and security option in left sidebar'",
                                )
                                with gr.Row():
                                    description_action_type = gr.Dropdown(
                                        label="Action",
                                        choices=[
                                            "click",
                                            "double_click",
                                            "move",
                                            "left_mouse_up",
                                            "left_mouse_down",
                                        ],
                                        value="click",
                                    )
                                    description_button = gr.Dropdown(
                                        label="Button",
                                        choices=["left", "right", "wheel", "back", "forward"],
                                        value="left",
                                    )
                                description_submit_btn = gr.Button("Submit Click Action")

                        with gr.Tab("📝 Type Action"):
                            with gr.Group():
                                type_text = gr.Textbox(
                                    label="Text to Type", placeholder="Enter text to type..."
                                )
                                type_submit_btn = gr.Button("Submit Type")

                        with gr.Tab("⌨️ Keypress Action"):
                            with gr.Group():
                                keypress_text = gr.Textbox(
                                    label="Keys", placeholder="e.g., ctrl+c, alt+tab"
                                )
                                keypress_submit_btn = gr.Button("Submit Keypress")

                        with gr.Tab("🧰 Misc Actions"):
                            with gr.Group():
                                misc_action_dropdown = gr.Dropdown(
                                    label="Action", choices=["wait"], value="wait"
                                )
                                misc_submit_btn = gr.Button("Submit Action")

        # Event handlers
        # NOTE: refresh_pending_calls returns 6 updates (dropdown + 5 UI
        # elements); on_call_selected returns 5 (no dropdown). The outputs
        # lists below must stay in sync with those return tuples.
        refresh_btn.click(
            fn=ui_handler.refresh_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        call_dropdown.change(
            fn=ui_handler.on_call_selected,
            inputs=[call_dropdown],
            outputs=[
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        def handle_image_click(evt: gr.SelectData):
            """Translate a click on the screenshot into a computer action.

            Uses the action type / button / scroll deltas cached on ui_handler
            by the control-change handlers below. Returns a status string.
            """
            if evt.index is not None:
                x, y = evt.index
                action_type = ui_handler.current_action_type or "click"
                button = ui_handler.current_button or "left"
                if action_type == "scroll":
                    sx_i = int(ui_handler.current_scroll_x or 0)
                    sy_i = int(ui_handler.current_scroll_y or 0)
                    # Submit a scroll action with x,y position and scroll deltas
                    result = ui_handler.submit_action(
                        "scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i
                    )
                else:
                    result = ui_handler.submit_click_action(x, y, action_type, button)
                ui_handler.wait_for_pending_calls()
                return result
            return "No coordinates selected"

        screenshot_image.select(fn=handle_image_click, outputs=[status_display]).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        # Response submission
        submit_btn.click(
            fn=ui_handler.submit_response,
            inputs=[response_text],
            outputs=[response_text, status_display],
        ).then(
            fn=ui_handler.refresh_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        # Toggle visibility of controls based on action type
        def toggle_action_controls(action_type):
            """Show/hide the button and scroll inputs for the chosen action."""
            # Button visible only for click
            button_vis = gr.update(visible=(action_type == "click"))
            # Scroll inputs visible only for scroll
            scroll_x_vis = gr.update(visible=(action_type == "scroll"))
            scroll_y_vis = gr.update(visible=(action_type == "scroll"))
            # Update state
            ui_handler.current_action_type = action_type or "click"
            return button_vis, scroll_x_vis, scroll_y_vis

        action_type_radio.change(
            fn=toggle_action_controls,
            inputs=[action_type_radio],
            outputs=[action_button_radio, scroll_x_input, scroll_y_input],
        )

        # Keep other control values in ui_handler state
        def on_button_change(val):
            ui_handler.current_button = val or "left"

        action_button_radio.change(fn=on_button_change, inputs=[action_button_radio])

        def on_scroll_x_change(val):
            # gr.Number may yield None or a non-int; fall back to 0.
            try:
                ui_handler.current_scroll_x = int(val) if val is not None else 0
            except Exception:
                ui_handler.current_scroll_x = 0

        scroll_x_input.change(fn=on_scroll_x_change, inputs=[scroll_x_input])

        def on_scroll_y_change(val):
            # Same defensive coercion as on_scroll_x_change.
            try:
                ui_handler.current_scroll_y = int(val) if val is not None else 0
            except Exception:
                ui_handler.current_scroll_y = 0

        scroll_y_input.change(fn=on_scroll_y_change, inputs=[scroll_y_input])

        type_submit_btn.click(
            fn=ui_handler.submit_type_action, inputs=[type_text], outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        keypress_submit_btn.click(
            fn=ui_handler.submit_hotkey_action, inputs=[keypress_text], outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        def handle_description_submit(description, action_type, button):
            """Submit a description-targeted action; reject empty descriptions."""
            if description:
                result = ui_handler.submit_description_click(description, action_type, button)
                ui_handler.wait_for_pending_calls()
                return result
            return "Please enter a description"

        description_submit_btn.click(
            fn=handle_description_submit,
            inputs=[description_text, description_action_type, description_button],
            outputs=[status_display],
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        # Misc action handler
        def handle_misc_submit(selected_action):
            """Dispatch the misc-actions tab; currently only "wait" exists."""
            if selected_action == "wait":
                result = ui_handler.submit_wait_action()
                ui_handler.wait_for_pending_calls()
                return result
            return f"Unsupported misc action: {selected_action}"

        misc_submit_btn.click(
            fn=handle_misc_submit, inputs=[misc_action_dropdown], outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        # Load initial data
        demo.load(
            fn=ui_handler.refresh_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

    return demo
749 |
750 |
if __name__ == "__main__":
    # Build the UI, enable request queuing (so long-running handlers don't
    # block concurrent users), and serve on all interfaces at port 7860.
    demo = create_ui()
    demo.queue()
    demo.launch(server_name="0.0.0.0", server_port=7860)
755 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/uitars.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | UITARS agent loop implementation using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B
3 | Paper: https://arxiv.org/abs/2501.12326
4 | Code: https://github.com/bytedance/UI-TARS
5 | """
6 |
7 | import ast
8 | import asyncio
9 | import base64
10 | import json
11 | import math
12 | import re
13 | from ctypes import cast
14 | from io import BytesIO
15 | from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
16 |
17 | import litellm
18 | from litellm.responses.litellm_completion_transformation.transformation import (
19 | LiteLLMCompletionResponsesConfig,
20 | )
21 | from litellm.responses.utils import Usage
22 | from litellm.types.utils import ModelResponse
23 | from openai.types.responses.response_computer_tool_call_param import (
24 | ActionType,
25 | ResponseComputerToolCallParam,
26 | )
27 | from openai.types.responses.response_input_param import ComputerCallOutput
28 | from openai.types.responses.response_output_message_param import (
29 | ResponseOutputMessageParam,
30 | )
31 | from openai.types.responses.response_reasoning_item_param import (
32 | ResponseReasoningItemParam,
33 | Summary,
34 | )
35 | from PIL import Image
36 |
37 | from ..decorators import register_agent
38 | from ..responses import (
39 | make_click_item,
40 | make_double_click_item,
41 | make_drag_item,
42 | make_input_image_item,
43 | make_keypress_item,
44 | make_output_text_item,
45 | make_reasoning_item,
46 | make_scroll_item,
47 | make_type_item,
48 | make_wait_item,
49 | )
50 | from ..types import AgentCapability, AgentResponse, Messages, Tools
51 |
# Constants from reference code
# Image dimensions fed to the model must be divisible by IMAGE_FACTOR, with a
# total pixel count in [MIN_PIXELS, MAX_PIXELS] (see smart_resize below).
IMAGE_FACTOR = 28
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
# Maximum allowed long-side/short-side aspect ratio before smart_resize raises.
MAX_RATIO = 200

# Sentinel action words the model may emit (names taken from the UI-TARS
# reference implementation).
FINISH_WORD = "finished"
WAIT_WORD = "wait"
ENV_FAIL_WORD = "error_env"
CALL_USER = "call_user"

# Action space prompt for UITARS — the DSL of callable actions the model is
# instructed to emit; parsed back by parse_uitars_response/parse_action.
UITARS_ACTION_SPACE = """
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
"""

# Main system prompt: expects a "Thought: ... / Action: ..." formatted reply.
UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```
Thought: ...
Action: ...
```

## Action Space
{action_space}

## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
"""

# Grounding-only prompt: restricts the action space to a single click so the
# model acts as a pure element locator (no Thought section requested).
GROUNDING_UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format

Action: ...


## Action Space
click(point='<|box_start|>(x1,y1)<|box_end|>')

## User Instruction
{instruction}"""
107 |
108 |
def round_by_factor(number: float, factor: int) -> int:
    """Return the multiple of ``factor`` nearest to ``number``."""
    quotient = round(number / factor)
    return quotient * factor
112 |
113 |
def ceil_by_factor(number: float, factor: int) -> int:
    """Return the smallest multiple of ``factor`` that is >= ``number``."""
    return factor * math.ceil(number / factor)
117 |
118 |
def floor_by_factor(number: float, factor: int) -> int:
    """Return the largest multiple of ``factor`` that is <= ``number``."""
    return factor * math.floor(number / factor)
122 |
123 |
def smart_resize(
    height: int,
    width: int,
    factor: int = IMAGE_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
) -> tuple[int, int]:
    """
    Rescales the image so that the following conditions are met:
    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.

    Raises ValueError when the aspect ratio exceeds MAX_RATIO.
    """
    if max(height, width) / min(height, width) > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
        )

    # Start from the nearest factor-aligned dimensions (at least one factor).
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))

    area = h_bar * w_bar
    if area > max_pixels:
        # Too large: shrink uniformly, rounding down to stay within budget.
        shrink = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / shrink, factor)
        w_bar = floor_by_factor(width / shrink, factor)
    elif area < min_pixels:
        # Too small: grow uniformly, rounding up to reach the minimum.
        grow = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * grow, factor)
        w_bar = ceil_by_factor(width * grow, factor)

    return h_bar, w_bar
152 |
153 |
def escape_single_quotes(text):
    """Backslash-escape single quotes that are not already escaped."""
    # Negative lookbehind: leave quotes preceded by a backslash untouched.
    return re.sub(r"(?<!\\)'", r"\\'", text)
158 |
159 |
def parse_action(action_str):
    """Parse an action string like ``click(start_box='(1,2)')`` into a dict.

    Returns ``{"function": name, "args": {kwarg: value}}`` on success, or
    ``None`` (after printing a diagnostic) when the string is not a single
    function-call expression.
    """
    try:
        node = ast.parse(action_str, mode="eval")
        if not isinstance(node, ast.Expression):
            raise ValueError("Not an expression")

        call = node.body
        if not isinstance(call, ast.Call):
            raise ValueError("Not a function call")

        # Get function name
        if isinstance(call.func, ast.Name):
            func_name = call.func.id
        elif isinstance(call.func, ast.Attribute):
            func_name = call.func.attr
        else:
            func_name = None

        # Get keyword arguments. Since Python 3.8 ast.Constant covers every
        # literal node; the legacy ast.Str fallback was removed here because
        # ast.Str no longer exists in Python 3.12+ and referencing it raised
        # AttributeError whenever a non-Constant value reached the elif.
        kwargs = {}
        for kw in call.keywords:
            if isinstance(kw.value, ast.Constant):
                value = kw.value.value
            else:
                # Non-literal argument (e.g. a Name): no static value available.
                value = None
            kwargs[kw.arg] = value

        return {"function": func_name, "args": kwargs}

    except Exception as e:
        print(f"Failed to parse action '{action_str}': {e}")
        return None
196 |
197 |
def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
    """Parse UITARS model response into structured actions.

    Expects the "Thought: ... Action: ..." format produced by
    UITARS_PROMPT_TEMPLATE. Returns a single-element list containing a dict
    with keys: thought (str | None), action_type (str), action_inputs (dict of
    stringified parameters), and text (the raw response).

    NOTE(review): image_width/image_height are accepted but not used here —
    coordinates are normalized to the 0-1 range and scaled to pixels later in
    convert_to_computer_actions.

    Raises ValueError when no "Action:" section is present or the action
    string cannot be parsed.
    """
    text = text.strip()

    # Extract thought
    thought = None
    if text.startswith("Thought:"):
        thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
        if thought_match:
            thought = thought_match.group(1).strip()

    # Extract action
    if "Action:" not in text:
        raise ValueError("No Action found in response")

    # Everything after the last "Action:" marker is the action expression.
    action_str = text.split("Action:")[-1].strip()

    # Handle special case for type actions: unwrap the content, re-escape any
    # unescaped single quotes, then re-wrap, so the string parses as a valid
    # Python call even when the typed text itself contains quotes.
    if "type(content" in action_str:

        def escape_quotes(match):
            # re.sub replacement: substitute the whole match with just the
            # captured content (strips the type(content='...') wrapper).
            return match.group(1)

        pattern = r"type\(content='(.*?)'\)"
        content = re.sub(pattern, escape_quotes, action_str)
        action_str = escape_single_quotes(content)
        action_str = "type(content='" + action_str + "')"

    # Parse the action (newlines are escaped so ast.parse sees one expression).
    parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
    if parsed_action is None:
        raise ValueError(f"Action can't parse: {action_str}")

    action_type = parsed_action["function"]
    params = parsed_action["args"]

    # Process parameters
    action_inputs = {}
    for param_name, param in params.items():
        if param == "":
            continue
        param = str(param).lstrip()
        action_inputs[param_name.strip()] = param

        # Handle coordinate parameters
        if "start_box" in param_name or "end_box" in param_name:
            # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
            # First, remove special tokens
            clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
            # Then remove parentheses and split
            numbers = clean_param.replace("(", "").replace(")", "").split(",")

            try:
                # Model coordinates are assumed to be on a 0-1000 scale.
                float_numbers = [
                    float(num.strip()) / 1000 for num in numbers
                ]  # Normalize to 0-1 range

                if len(float_numbers) == 2:
                    # Single point, duplicate for box format
                    float_numbers = [
                        float_numbers[0],
                        float_numbers[1],
                        float_numbers[0],
                        float_numbers[1],
                    ]

                # Stored as a stringified list; consumers re-parse it.
                action_inputs[param_name.strip()] = str(float_numbers)
            except ValueError as e:
                # If parsing fails, keep the original parameter value
                print(f"Warning: Could not parse coordinates '{param}': {e}")
                action_inputs[param_name.strip()] = param

    return [
        {
            "thought": thought,
            "action_type": action_type,
            "action_inputs": action_inputs,
            "text": text,
        }
    ]
278 |
279 |
def convert_to_computer_actions(
    parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int
) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
    """Convert parsed UITARS responses to computer actions.

    Box coordinates in the parsed responses are 0-1 normalized (see
    parse_uitars_response); they are scaled back to pixel space here using
    image_width/image_height.
    """

    def _box_center(box_str: str):
        """Return the pixel-space (x, y) center of a stringified
        [x1, y1, x2, y2] box.

        Uses ast.literal_eval instead of eval(): the string derives from model
        output and must never be executed as arbitrary code.
        """
        coords = ast.literal_eval(box_str)
        x = int((coords[0] + coords[2]) / 2 * image_width)
        y = int((coords[1] + coords[3]) / 2 * image_height)
        return x, y

    computer_actions = []

    for response in parsed_responses:
        action_type = response.get("action_type")
        action_inputs = response.get("action_inputs", {})

        if action_type == "finished":
            # Terminal action: emit the final message and stop converting.
            finished_text = action_inputs.get("content", "Task completed successfully.")
            computer_actions.append(make_output_text_item(finished_text))
            break

        elif action_type == "wait":
            computer_actions.append(make_wait_item())

        elif action_type == "call_user":
            computer_actions.append(
                make_output_text_item("I need assistance from the user to proceed with this task.")
            )

        elif action_type in ["click", "left_single"]:
            start_box = action_inputs.get("start_box")
            if start_box:
                x, y = _box_center(start_box)
                computer_actions.append(make_click_item(x, y, "left"))

        elif action_type == "double_click":
            start_box = action_inputs.get("start_box")
            if start_box:
                x, y = _box_center(start_box)
                computer_actions.append(make_double_click_item(x, y))

        elif action_type == "right_click":
            start_box = action_inputs.get("start_box")
            if start_box:
                x, y = _box_center(start_box)
                computer_actions.append(make_click_item(x, y, "right"))

        elif action_type == "type":
            content = action_inputs.get("content", "")
            computer_actions.append(make_type_item(content))

        elif action_type == "hotkey":
            # UITARS emits space-separated combos, e.g. "ctrl c".
            key = action_inputs.get("key", "")
            keys = key.split()
            computer_actions.append(make_keypress_item(keys))

        elif action_type == "press":
            key = action_inputs.get("key", "")
            computer_actions.append(make_keypress_item([key]))

        elif action_type == "scroll":
            start_box = action_inputs.get("start_box")
            direction = action_inputs.get("direction", "down")

            if start_box:
                x, y = _box_center(start_box)
            else:
                # No anchor provided: scroll from the screen center.
                x, y = image_width // 2, image_height // 2

            scroll_y = 5 if "up" in direction.lower() else -5
            computer_actions.append(make_scroll_item(x, y, 0, scroll_y))

        elif action_type == "drag":
            start_box = action_inputs.get("start_box")
            end_box = action_inputs.get("end_box")

            if start_box and end_box:
                start_x, start_y = _box_center(start_box)
                end_x, end_y = _box_center(end_box)
                path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                computer_actions.append(make_drag_item(path))

    return computer_actions
374 |
375 |
def pil_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG string (no data-URL prefix)."""
    buf = BytesIO()
    image.save(buf, format="PNG")
    encoded = base64.b64encode(buf.getvalue())
    return encoded.decode("utf-8")
381 |
382 |
def process_image_for_uitars(
    image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS
) -> tuple[Image.Image, int, int]:
    """Decode and resize a (possibly data-URL) base64 image for UITARS.

    Returns (processed RGB image, original width, original height).
    """
    # Strip a data-URL prefix if present before base64-decoding.
    if image_data.startswith("data:image"):
        image_data = image_data.split(",")[1]

    image = Image.open(BytesIO(base64.b64decode(image_data)))
    original_width, original_height = image.size

    # Shrink uniformly if above the pixel budget (aspect ratio preserved).
    if image.width * image.height > max_pixels:
        scale = math.sqrt(max_pixels / (image.width * image.height))
        image = image.resize((int(image.width * scale), int(image.height * scale)))

    # Grow uniformly if below the minimum pixel budget.
    if image.width * image.height < min_pixels:
        scale = math.sqrt(min_pixels / (image.width * image.height))
        image = image.resize(
            (math.ceil(image.width * scale), math.ceil(image.height * scale))
        )

    # The model expects RGB input.
    if image.mode != "RGB":
        image = image.convert("RGB")

    return image, original_width, original_height
413 |
414 |
def sanitize_message(msg: Any) -> Any:
    """Return a copy of the message with ``image_url`` omitted from content parts.

    Lists are sanitized element-wise; non-dict, non-list values pass through
    unchanged. The input is never mutated.
    """
    if isinstance(msg, list):
        return [sanitize_message(item) for item in msg]
    if not isinstance(msg, dict):
        return msg

    sanitized = {}
    for key, value in msg.items():
        if key == "content" and isinstance(value, list):
            parts = []
            for part in value:
                if isinstance(part, dict):
                    # Drop only the image_url field; keep everything else.
                    parts.append({k: v for k, v in part.items() if k != "image_url"})
                else:
                    parts.append(part)
            sanitized[key] = parts
        else:
            sanitized[key] = value
    return sanitized
436 |
437 |
def convert_uitars_messages_to_litellm(messages: "Messages") -> List[Dict[str, Any]]:
    """
    Convert UITARS internal message format back to LiteLLM format.

    This function processes reasoning, computer_call, and computer_call_output
    messages and converts them to the appropriate LiteLLM assistant message
    format. Reasoning text and action strings accumulate until a
    computer_call_output (screenshot) is reached, at which point they are
    flushed as one assistant message followed by a user image message.

    Args:
        messages: List of UITARS internal messages

    Returns:
        List of LiteLLM formatted messages
    """
    litellm_messages: List[Dict[str, Any]] = []
    current_assistant_content: List[str] = []

    def _flush_assistant() -> None:
        """Emit buffered Thought/Action lines as one assistant text message."""
        if current_assistant_content:
            litellm_messages.append(
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": "\n".join(current_assistant_content)}
                    ],
                }
            )
            current_assistant_content.clear()

    for message in messages:
        if isinstance(message, dict):
            message_type = message.get("type")

            if message_type == "reasoning":
                # Extract reasoning text from summary
                summary = message.get("summary", [])
                if summary and isinstance(summary, list):
                    for summary_item in summary:
                        if (
                            isinstance(summary_item, dict)
                            and summary_item.get("type") == "summary_text"
                        ):
                            reasoning_text = summary_item.get("text", "")
                            if reasoning_text:
                                current_assistant_content.append(f"Thought: {reasoning_text}")

            elif message_type == "computer_call":
                # Convert computer action to UITARS action format
                action = message.get("action", {})
                action_type = action.get("type")

                if action_type == "click":
                    x, y = action.get("x", 0), action.get("y", 0)
                    button = action.get("button", "left")
                    if button == "left":
                        action_text = f"Action: click(start_box='({x},{y})')"
                    elif button == "right":
                        action_text = f"Action: right_single(start_box='({x},{y})')"
                    else:
                        # Other buttons have no UITARS equivalent; fall back to click.
                        action_text = f"Action: click(start_box='({x},{y})')"

                elif action_type == "double_click":
                    x, y = action.get("x", 0), action.get("y", 0)
                    action_text = f"Action: left_double(start_box='({x},{y})')"

                elif action_type == "drag":
                    start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
                    end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
                    action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"

                elif action_type == "key":
                    key = action.get("key", "")
                    action_text = f"Action: hotkey(key='{key}')"

                elif action_type == "type":
                    text = action.get("text", "")
                    # Escape single quotes so the action string stays parseable.
                    escaped_text = escape_single_quotes(text)
                    action_text = f"Action: type(content='{escaped_text}')"

                elif action_type == "scroll":
                    x, y = action.get("x", 0), action.get("y", 0)
                    direction = action.get("direction", "down")
                    action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"

                elif action_type == "wait":
                    action_text = "Action: wait()"

                else:
                    # Fallback for unknown action types
                    action_text = f"Action: {action_type}({action})"

                current_assistant_content.append(action_text)

                # A computer_call closes the current assistant turn.
                _flush_assistant()

            elif message_type == "computer_call_output":
                # Add screenshot from computer call output
                output = message.get("output", {})
                if isinstance(output, dict) and output.get("type") == "input_image":
                    image_url = output.get("image_url", "")
                    if image_url:
                        litellm_messages.append(
                            {
                                "role": "user",
                                "content": [{"type": "image_url", "image_url": {"url": image_url}}],
                            }
                        )

            elif message.get("role") == "user":
                # User messages are intentionally not forwarded here; the
                # instruction is injected via the prompt template instead.
                pass

    # Flush any trailing assistant content (e.g. reasoning with no following
    # computer_call). BUG FIX: this previously appended the raw list of strings
    # as "content", inconsistent with the [{"type": "text", ...}] shape used by
    # the in-loop flush; the helper guarantees a uniform shape.
    _flush_assistant()

    return litellm_messages
564 |
565 |
@register_agent(models=r"(?i).*ui-?tars.*", priority=-1)
class UITARSConfig:
    """
    UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.

    Supports UITARS vision-language models for computer control.
    """

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Predict the next step based on input messages.

        Args:
            messages: Input messages following Responses format
            model: Model name to use
            tools: Optional list of tool schemas
            max_retries: Maximum number of retries
            stream: Whether to stream responses
            computer_handler: Computer handler instance
            use_prompt_caching: Accepted for interface parity; not used here
            _on_api_start: Callback for API start
            _on_api_end: Callback for API end
            _on_usage: Callback for usage tracking
            _on_screenshot: Callback for screenshot events
            **kwargs: Additional arguments forwarded to liteLLM

        Returns:
            Dictionary with "output" (output items) and "usage" array
        """
        tools = tools or []

        # Output items for this step (screenshot, reasoning, computer actions)
        response_items = []

        # Find computer tool for screen dimensions
        computer_tool = None
        for tool_schema in tools:
            if tool_schema["type"] == "computer":
                computer_tool = tool_schema["computer"]
                break

        # Get screen dimensions; keep the defaults if the tool can't report them
        screen_width, screen_height = 1024, 768
        if computer_tool:
            try:
                screen_width, screen_height = await computer_tool.get_dimensions()
            except Exception:
                # Best-effort query: fall back to the defaults above.
                pass

        # Process messages to extract instruction and image
        instruction = ""
        image_data = None

        # Convert messages to list if string
        if isinstance(messages, str):
            messages = [{"role": "user", "content": messages}]

        # Walk the history backwards so the latest screenshot and the most
        # recent user instruction win.
        for message in reversed(messages):
            if isinstance(message, dict):
                content = message.get("content", "")

                # Handle different content formats
                if isinstance(content, str):
                    if not instruction and message.get("role") == "user":
                        instruction = content
                elif isinstance(content, list):
                    for item in content:
                        if isinstance(item, dict):
                            if item.get("type") == "text" and not instruction:
                                instruction = item.get("text", "")
                            elif item.get("type") == "image_url" and not image_data:
                                image_url = item.get("image_url", {})
                                if isinstance(image_url, dict):
                                    image_data = image_url.get("url", "")
                                else:
                                    image_data = image_url

                # Also check for computer_call_output with screenshots
                if message.get("type") == "computer_call_output" and not image_data:
                    output = message.get("output", {})
                    if isinstance(output, dict) and output.get("type") == "input_image":
                        image_data = output.get("image_url", "")

            if instruction and image_data:
                break

        if not instruction:
            instruction = (
                "Help me complete this task by analyzing the screen and taking appropriate actions."
            )

        # Create prompt
        user_prompt = UITARS_PROMPT_TEMPLATE.format(
            instruction=instruction, action_space=UITARS_ACTION_SPACE, language="English"
        )

        # Convert conversation history to LiteLLM format
        history_messages = convert_uitars_messages_to_litellm(messages)

        # Prepare messages for liteLLM
        litellm_messages = [{"role": "system", "content": "You are a helpful assistant."}]

        # Add current user instruction with screenshot
        current_user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
            ],
        }
        litellm_messages.append(current_user_message)

        # Process image for UITARS
        if not image_data:
            # Take screenshot if none found in messages
            if computer_handler:
                image_data = await computer_handler.screenshot()
                # Bug fix: the callback defaults to None and must be guarded
                # (it was previously awaited unconditionally).
                if _on_screenshot:
                    await _on_screenshot(image_data, "screenshot_before")

                # Add screenshot to output items so it can be retained in history
                response_items.append(make_input_image_item(image_data))
            else:
                raise ValueError("No screenshot found in messages and no computer_handler provided")
        processed_image, original_width, original_height = process_image_for_uitars(image_data)
        encoded_image = pil_to_base64(processed_image)

        # Add conversation history; with no history, send the current
        # screenshot as the first user turn instead.
        if history_messages:
            litellm_messages.extend(history_messages)
        else:
            litellm_messages.append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
                        }
                    ],
                }
            )

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": litellm_messages,
            "max_tokens": kwargs.get("max_tokens", 500),
            "temperature": kwargs.get("temperature", 0.0),
            "do_sample": kwargs.get("temperature", 0.0) > 0.0,
            "num_retries": max_retries,
            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
        }

        # Call API start hook
        if _on_api_start:
            await _on_api_start(api_kwargs)

        # Call liteLLM with UITARS model
        response = await litellm.acompletion(**api_kwargs)

        # Call API end hook
        if _on_api_end:
            await _on_api_end(api_kwargs, response)

        # Extract response content
        response_content = response.choices[0].message.content.strip()  # type: ignore

        # Parse UITARS response
        parsed_responses = parse_uitars_response(response_content, original_width, original_height)

        # Convert to computer actions
        computer_actions = convert_to_computer_actions(
            parsed_responses, original_width, original_height
        )

        # Add computer actions to response items. Bug fix: guard against an
        # empty parse result instead of indexing [0] unconditionally.
        thought = parsed_responses[0].get("thought", "") if parsed_responses else ""
        if thought:
            response_items.append(make_reasoning_item(thought))
        response_items.extend(computer_actions)

        # Extract usage information
        response_usage = {
            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
                response.usage
            ).model_dump(),
            "response_cost": response._hidden_params.get("response_cost", 0.0),
        }
        if _on_usage:
            await _on_usage(response_usage)

        # Create agent response
        agent_response = {"output": response_items, "usage": response_usage}

        return agent_response

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates based on image and instruction.

        UITARS supports click prediction through its action parsing.

        Args:
            model: Model name to use
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
            **kwargs: Additional arguments forwarded to liteLLM

        Returns:
            Tuple with (x, y) coordinates or None
        """
        try:
            # Create prompt using grounding template
            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(instruction=instruction)

            # Process image for UITARS
            processed_image, original_width, original_height = process_image_for_uitars(image_b64)
            encoded_image = pil_to_base64(processed_image)

            # Prepare messages for liteLLM
            litellm_messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
                        },
                    ],
                },
            ]

            # Prepare API call kwargs (deterministic decoding for grounding)
            api_kwargs = {
                "model": model,
                "messages": litellm_messages,
                "max_tokens": 2056,
                "temperature": 0.0,
                "do_sample": False,
            }
            api_kwargs.update({k: v for k, v in (kwargs or {}).items()})

            # Call liteLLM with UITARS model
            response = await litellm.acompletion(**api_kwargs)

            # Extract response content
            response_content = response.choices[0].message.content.strip()  # type: ignore

            print(response_content)

            # Parse the response to extract click coordinates
            # Look for click action with coordinates (with special tokens)
            click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
            match = re.search(click_pattern, response_content)

            # Fallback: Look for simpler format without special tokens
            if not match:
                # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
                fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
                match = re.search(fallback_pattern, response_content)

            if match:
                x, y = int(match.group(1)), int(match.group(2))
                # Scale coordinates back to original image dimensions
                scale_x = original_width / processed_image.width
                scale_y = original_height / processed_image.height

                scaled_x = int(x * scale_x)
                scaled_y = int(y * scale_y)

                return (scaled_x, scaled_y)

            return None

        except Exception as e:
            # Log error and return None
            print(f"Error in predict_click: {e}")
            return None

    def get_capabilities(self) -> List[AgentCapability]:
        """
        Get list of capabilities supported by this agent config.

        Returns:
            List of capability strings
        """
        return ["step", "click"]
```
--------------------------------------------------------------------------------
/libs/lume/src/VM/VM.swift:
--------------------------------------------------------------------------------
```swift
1 | import Foundation
2 |
3 | // MARK: - Support Types
4 |
/// Base context for virtual machine directory and configuration
struct VMDirContext {
    // Stored properties (declaration order preserved: the implicit
    // memberwise initializer depends on it).
    let dir: VMDirectory
    var config: VMConfig
    let home: Home
    let storage: String?

    // MARK: Forwarded accessors

    /// The VM's name, forwarded from its directory.
    var name: String {
        return dir.name
    }

    /// Forwarded from `dir.initialized()`.
    var initialized: Bool {
        return dir.initialized()
    }

    /// Path of the VM's disk image.
    var diskPath: Path {
        return dir.diskPath
    }

    /// Path of the VM's NVRAM file.
    var nvramPath: Path {
        return dir.nvramPath
    }

    // MARK: Mutations

    /// Persists the current config into the VM directory.
    func saveConfig() throws {
        try dir.saveConfig(config)
    }

    /// Sets the backing disk to `size` bytes via the directory.
    func setDisk(_ size: UInt64) throws {
        try dir.setDisk(size)
    }

    /// Moves this VM directory to its final location under `name`.
    func finalize(to name: String) throws {
        let destination = try home.getVMDirectory(name)
        try FileManager.default.moveItem(at: dir.dir.url, to: destination.dir.url)
    }
}
30 |
31 | // MARK: - Base VM Class
32 |
33 | /// Base class for virtual machine implementations
34 | @MainActor
35 | class VM {
36 | // MARK: - Properties
37 |
38 | var vmDirContext: VMDirContext
39 |
40 | @MainActor
41 | private var virtualizationService: VMVirtualizationService?
42 | private let vncService: VNCService
43 | internal let virtualizationServiceFactory:
44 | (VMVirtualizationServiceContext) throws -> VMVirtualizationService
45 | private let vncServiceFactory: (VMDirectory) -> VNCService
46 |
47 | // MARK: - Initialization
48 |
    /// Creates a VM bound to a directory context.
    ///
    /// - Parameters:
    ///   - vmDirContext: Directory, configuration and storage context for this VM.
    ///   - virtualizationServiceFactory: Builds the virtualization backend;
    ///     defaults to `DarwinVirtualizationService`.
    ///   - vncServiceFactory: Builds the VNC service; defaults to
    ///     `DefaultVNCService`. Also used immediately to create `vncService`.
    init(
        vmDirContext: VMDirContext,
        virtualizationServiceFactory: @escaping (VMVirtualizationServiceContext) throws ->
            VMVirtualizationService = { try DarwinVirtualizationService(configuration: $0) },
        vncServiceFactory: @escaping (VMDirectory) -> VNCService = {
            DefaultVNCService(vmDirectory: $0)
        }
    ) {
        self.vmDirContext = vmDirContext
        self.virtualizationServiceFactory = virtualizationServiceFactory
        self.vncServiceFactory = vncServiceFactory

        // Initialize VNC service
        self.vncService = vncServiceFactory(vmDirContext.dir)
    }
64 |
65 | // MARK: - VM State Management
66 |
67 | private var isRunning: Bool {
68 | // First check if we have a MAC address
69 | guard let macAddress = vmDirContext.config.macAddress else {
70 | Logger.info(
71 | "Cannot check if VM is running: macAddress is nil",
72 | metadata: ["name": vmDirContext.name])
73 | return false
74 | }
75 |
76 | // Then check if we have an IP address
77 | guard let ipAddress = DHCPLeaseParser.getIPAddress(forMAC: macAddress) else {
78 | return false
79 | }
80 |
81 | // Then check if it's reachable
82 | return NetworkUtils.isReachable(ipAddress: ipAddress)
83 | }
84 |
85 | var details: VMDetails {
86 | let isRunning: Bool = self.isRunning
87 | let vncUrl = isRunning ? getVNCUrl() : nil
88 |
89 | // Safely get disk size with fallback
90 | let diskSizeValue: DiskSize
91 | do {
92 | diskSizeValue = try getDiskSize()
93 | } catch {
94 | Logger.error(
95 | "Failed to get disk size",
96 | metadata: ["name": vmDirContext.name, "error": "\(error)"])
97 | // Provide a fallback value to avoid crashing
98 | diskSizeValue = DiskSize(allocated: 0, total: vmDirContext.config.diskSize ?? 0)
99 | }
100 |
101 | // Safely access MAC address
102 | let macAddress = vmDirContext.config.macAddress
103 | let ipAddress: String? =
104 | isRunning && macAddress != nil ? DHCPLeaseParser.getIPAddress(forMAC: macAddress!) : nil
105 |
106 | return VMDetails(
107 | name: vmDirContext.name,
108 | os: getOSType(),
109 | cpuCount: vmDirContext.config.cpuCount ?? 0,
110 | memorySize: vmDirContext.config.memorySize ?? 0,
111 | diskSize: diskSizeValue,
112 | display: vmDirContext.config.display.string,
113 | status: isRunning ? "running" : "stopped",
114 | vncUrl: vncUrl,
115 | ipAddress: ipAddress,
116 | locationName: vmDirContext.storage ?? "default"
117 | )
118 | }
119 |
120 | // MARK: - VM Lifecycle Management
121 |
    /// Boots the VM and keeps the process alive until it is terminated.
    ///
    /// An exclusive `flock` on the VM's config file serves as the
    /// single-instance lock: failure to acquire it is treated as "already
    /// running" (after one emergency cleanup attempt via `unlockConfigFile`).
    ///
    /// - Parameters:
    ///   - noDisplay: Forwarded to VNC session setup.
    ///   - sharedDirectories: Host directories shared into the guest.
    ///   - mount: Optional image path to mount.
    ///   - vncPort: Requested VNC port; 0 lets the service choose.
    ///   - recoveryMode: Whether to boot into recovery mode.
    ///   - usbMassStoragePaths: Optional images attached as USB mass storage.
    /// - Throws: `VMError.notInitialized`, `VMError.alreadyRunning`, or any
    ///   error from configuration/VNC/virtualization setup.
    /// - Note: On success this method never returns; it sleeps in an infinite
    ///   loop while holding the lock, until the process is killed.
    func run(
        noDisplay: Bool, sharedDirectories: [SharedDirectory], mount: Path?, vncPort: Int = 0,
        recoveryMode: Bool = false, usbMassStoragePaths: [Path]? = nil
    ) async throws {
        Logger.info(
            "VM.run method called",
            metadata: [
                "name": vmDirContext.name,
                "noDisplay": "\(noDisplay)",
                "recoveryMode": "\(recoveryMode)",
            ])

        guard vmDirContext.initialized else {
            Logger.error("VM not initialized", metadata: ["name": vmDirContext.name])
            throw VMError.notInitialized(vmDirContext.name)
        }

        guard let cpuCount = vmDirContext.config.cpuCount,
            let memorySize = vmDirContext.config.memorySize
        else {
            Logger.error("VM missing cpuCount or memorySize", metadata: ["name": vmDirContext.name])
            throw VMError.notInitialized(vmDirContext.name)
        }

        // Try to acquire lock on config file
        Logger.info(
            "Attempting to acquire lock on config file",
            metadata: [
                "path": vmDirContext.dir.configPath.path,
                "name": vmDirContext.name,
            ])
        var fileHandle = try FileHandle(forWritingTo: vmDirContext.dir.configPath.url)

        if flock(fileHandle.fileDescriptor, LOCK_EX | LOCK_NB) != 0 {
            try? fileHandle.close()
            Logger.error(
                "VM already running (failed to acquire lock)", metadata: ["name": vmDirContext.name]
            )

            // Try to forcibly clear the lock before giving up
            Logger.info("Attempting emergency lock cleanup", metadata: ["name": vmDirContext.name])
            unlockConfigFile()

            // Try one more time to acquire the lock
            if let retryHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url),
                flock(retryHandle.fileDescriptor, LOCK_EX | LOCK_NB) == 0
            {
                Logger.info("Emergency lock cleanup worked", metadata: ["name": vmDirContext.name])
                // Continue with a fresh file handle
                try? retryHandle.close()
                // Get a completely new file handle to be safe
                guard let newHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url)
                else {
                    throw VMError.internalError("Failed to open file handle after lock cleanup")
                }
                // Update our main file handle
                fileHandle = newHandle
            } else {
                // If we still can't get the lock, give up
                Logger.error(
                    "Could not acquire lock even after emergency cleanup",
                    metadata: ["name": vmDirContext.name])
                throw VMError.alreadyRunning(vmDirContext.name)
            }
        }
        Logger.info("Successfully acquired lock", metadata: ["name": vmDirContext.name])

        Logger.info(
            "Running VM with configuration",
            metadata: [
                "name": vmDirContext.name,
                "cpuCount": "\(cpuCount)",
                "memorySize": "\(memorySize)",
                "diskSize": "\(vmDirContext.config.diskSize ?? 0)",
                "sharedDirectories": sharedDirectories.map { $0.string }.joined(separator: ", "),
                "recoveryMode": "\(recoveryMode)",
            ])

        // Create and configure the VM
        do {
            Logger.info(
                "Creating virtualization service context", metadata: ["name": vmDirContext.name])
            let config = try createVMVirtualizationServiceContext(
                cpuCount: cpuCount,
                memorySize: memorySize,
                display: vmDirContext.config.display.string,
                sharedDirectories: sharedDirectories,
                mount: mount,
                recoveryMode: recoveryMode,
                usbMassStoragePaths: usbMassStoragePaths
            )
            Logger.info(
                "Successfully created virtualization service context",
                metadata: ["name": vmDirContext.name])

            Logger.info(
                "Initializing virtualization service", metadata: ["name": vmDirContext.name])
            virtualizationService = try virtualizationServiceFactory(config)
            Logger.info(
                "Successfully initialized virtualization service",
                metadata: ["name": vmDirContext.name])

            Logger.info(
                "Setting up VNC",
                metadata: [
                    "name": vmDirContext.name,
                    "noDisplay": "\(noDisplay)",
                    "port": "\(vncPort)",
                ])
            let vncInfo = try await setupSession(
                noDisplay: noDisplay, port: vncPort, sharedDirectories: sharedDirectories)
            Logger.info(
                "VNC setup successful", metadata: ["name": vmDirContext.name, "vncInfo": vncInfo])

            // Start the VM
            guard let service = virtualizationService else {
                Logger.error("Virtualization service is nil", metadata: ["name": vmDirContext.name])
                throw VMError.internalError("Virtualization service not initialized")
            }
            Logger.info(
                "Starting VM via virtualization service", metadata: ["name": vmDirContext.name])
            try await service.start()
            Logger.info("VM started successfully", metadata: ["name": vmDirContext.name])

            // Keep the process (and the flock) alive for the VM's lifetime;
            // exiting this loop only happens via cancellation/termination.
            while true {
                try await Task.sleep(nanoseconds: UInt64(1e9))
            }
        } catch {
            Logger.error(
                "Failed in VM.run",
                metadata: [
                    "name": vmDirContext.name,
                    "error": error.localizedDescription,
                    "errorType": "\(type(of: error))",
                ])
            virtualizationService = nil
            vncService.stop()

            // Release lock
            Logger.info("Releasing file lock after error", metadata: ["name": vmDirContext.name])
            flock(fileHandle.fileDescriptor, LOCK_UN)
            try? fileHandle.close()

            // Additionally, perform our aggressive unlock to ensure no locks remain
            Logger.info(
                "Performing additional lock cleanup after error",
                metadata: ["name": vmDirContext.name])
            unlockConfigFile()

            throw error
        }
    }
274 |
    /// Stops the VM: first via the virtualization service, then — if that
    /// fails or no service is attached — by terminating the process holding
    /// the config-file lock.
    ///
    /// Escalation order: `service.stop()` → SIGINT with up to 10 × 1 s waits
    /// → SIGKILL. Every exit path also calls `unlockConfigFile()` so a stale
    /// flock never blocks the next `run`.
    /// - Throws: `VMError.notInitialized`, `VMError.notRunning`, or
    ///   `VMError.internalError` when the process cannot be terminated.
    @MainActor
    func stop() async throws {
        guard vmDirContext.initialized else {
            throw VMError.notInitialized(vmDirContext.name)
        }

        Logger.info("Attempting to stop VM", metadata: ["name": vmDirContext.name])

        // If we have a virtualization service, try to stop it cleanly first
        if let service = virtualizationService {
            do {
                Logger.info(
                    "Stopping VM via virtualization service", metadata: ["name": vmDirContext.name])
                try await service.stop()
                virtualizationService = nil
                vncService.stop()
                Logger.info(
                    "VM stopped successfully via virtualization service",
                    metadata: ["name": vmDirContext.name])

                // Try to ensure any existing locks are released
                Logger.info(
                    "Attempting to clear any locks on config file",
                    metadata: ["name": vmDirContext.name])
                unlockConfigFile()

                return
            } catch let error {
                Logger.error(
                    "Failed to stop VM via virtualization service",
                    metadata: [
                        "name": vmDirContext.name,
                        "error": error.localizedDescription,
                    ])
                // Fall through to process termination
            }
        }

        // Try to open config file to get file descriptor
        Logger.info(
            "Attempting to access config file lock",
            metadata: [
                "path": vmDirContext.dir.configPath.path,
                "name": vmDirContext.name,
            ])
        let fileHandle = try? FileHandle(forReadingFrom: vmDirContext.dir.configPath.url)
        guard let fileHandle = fileHandle else {
            Logger.info(
                "Failed to open config file - VM may not be running",
                metadata: ["name": vmDirContext.name])

            // Even though we couldn't open the file, try to force unlock anyway
            unlockConfigFile()

            throw VMError.notRunning(vmDirContext.name)
        }

        // Get the PID of the process holding the lock using lsof command
        // (lsof -F p emits machine-readable output: "p<pid>" per process).
        Logger.info(
            "Finding process holding lock on config file", metadata: ["name": vmDirContext.name])
        let task = Process()
        task.executableURL = URL(fileURLWithPath: "/usr/sbin/lsof")
        task.arguments = ["-F", "p", vmDirContext.dir.configPath.path]

        let outputPipe = Pipe()
        task.standardOutput = outputPipe

        try task.run()
        task.waitUntilExit()

        let outputData = try outputPipe.fileHandleForReading.readToEnd() ?? Data()
        guard let outputString = String(data: outputData, encoding: .utf8),
            let pidString = outputString.split(separator: "\n").first?.dropFirst(),  // Drop the 'p' prefix
            let pid = pid_t(pidString)
        else {
            try? fileHandle.close()
            Logger.info(
                "Failed to find process holding lock - VM may not be running",
                metadata: ["name": vmDirContext.name])

            // Even though we couldn't find the process, try to force unlock
            unlockConfigFile()

            throw VMError.notRunning(vmDirContext.name)
        }

        Logger.info(
            "Found process \(pid) holding lock on config file",
            metadata: ["name": vmDirContext.name])

        // First try graceful shutdown with SIGINT
        if kill(pid, SIGINT) == 0 {
            Logger.info("Sent SIGINT to VM process \(pid)", metadata: ["name": vmDirContext.name])
        }

        // Wait for process to stop with timeout
        var attempts = 0
        while attempts < 10 {
            Logger.info(
                "Waiting for process \(pid) to terminate (attempt \(attempts + 1)/10)",
                metadata: ["name": vmDirContext.name])
            try await Task.sleep(nanoseconds: 1_000_000_000)

            // Check if process still exists (kill with signal 0 only probes)
            if kill(pid, 0) != 0 {
                // Process is gone, do final cleanup
                Logger.info("Process \(pid) has terminated", metadata: ["name": vmDirContext.name])
                virtualizationService = nil
                vncService.stop()
                try? fileHandle.close()

                // Force unlock the config file
                unlockConfigFile()

                Logger.info(
                    "VM stopped successfully via process termination",
                    metadata: ["name": vmDirContext.name])
                return
            }
            attempts += 1
        }

        // If graceful shutdown failed, force kill the process
        Logger.info(
            "Graceful shutdown failed, forcing termination of process \(pid)",
            metadata: ["name": vmDirContext.name])
        if kill(pid, SIGKILL) == 0 {
            Logger.info("Sent SIGKILL to process \(pid)", metadata: ["name": vmDirContext.name])

            // Wait a moment for the process to be fully killed
            try await Task.sleep(nanoseconds: 2_000_000_000)

            // Do final cleanup
            virtualizationService = nil
            vncService.stop()
            try? fileHandle.close()

            // Force unlock the config file
            unlockConfigFile()

            Logger.info("VM forcefully stopped", metadata: ["name": vmDirContext.name])
            return
        }

        // If we get here, something went very wrong
        try? fileHandle.close()
        Logger.error(
            "Failed to stop VM - could not terminate process \(pid)",
            metadata: ["name": vmDirContext.name])

        // As a last resort, try to force unlock
        unlockConfigFile()

        throw VMError.internalError("Failed to stop VM process")
    }
430 |
    /// Forcibly clears any `flock` left on the VM's config file.
    ///
    /// Three escalating attempts: (1) `fcntl(F_SETLK)`/`flock(LOCK_UN)` on a
    /// fresh handle, (2) acquire-and-immediately-release a test lock, and
    /// (3) back up, delete, and restore the config file so the kernel drops
    /// all locks held on the old inode. All failures are logged and
    /// swallowed — this is a best-effort cleanup helper.
    ///
    /// NOTE(review): the delete-and-restore step recreates the file, which may
    /// drop ownership/permissions and races concurrent readers — confirm this
    /// trade-off is acceptable.
    private func unlockConfigFile() {
        Logger.info(
            "Forcibly clearing locks on config file",
            metadata: [
                "path": vmDirContext.dir.configPath.path,
                "name": vmDirContext.name,
            ])

        // First attempt: standard unlock methods
        if let fileHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url) {
            // Use F_GETLK and F_SETLK to check and clear locks
            var lockInfo = flock()
            lockInfo.l_type = Int16(F_UNLCK)
            lockInfo.l_whence = Int16(SEEK_SET)
            lockInfo.l_start = 0
            lockInfo.l_len = 0

            // Try to unlock the file using fcntl
            _ = fcntl(fileHandle.fileDescriptor, F_SETLK, &lockInfo)

            // Also try the regular flock method
            flock(fileHandle.fileDescriptor, LOCK_UN)

            try? fileHandle.close()
            Logger.info("Standard unlock attempts performed", metadata: ["name": vmDirContext.name])
        }

        // Second attempt: try to acquire and immediately release a fresh lock
        if let tempHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url) {
            if flock(tempHandle.fileDescriptor, LOCK_EX | LOCK_NB) == 0 {
                Logger.info(
                    "Successfully acquired and released lock to reset state",
                    metadata: ["name": vmDirContext.name])
                flock(tempHandle.fileDescriptor, LOCK_UN)
            } else {
                Logger.info(
                    "Could not acquire lock for resetting - may still be locked",
                    metadata: ["name": vmDirContext.name])
            }
            try? tempHandle.close()
        }

        // Third attempt (most aggressive): copy the config file, remove the original, and restore
        Logger.info(
            "Trying aggressive method: backup and restore config file",
            metadata: ["name": vmDirContext.name])
        // Only proceed if the config file exists
        let fileManager = FileManager.default
        let configPath = vmDirContext.dir.configPath.path
        let backupPath = configPath + ".backup"

        if fileManager.fileExists(atPath: configPath) {
            // Create a backup of the config file
            if let configData = try? Data(contentsOf: URL(fileURLWithPath: configPath)) {
                // Make backup
                try? configData.write(to: URL(fileURLWithPath: backupPath))

                // Remove the original file to clear all locks
                try? fileManager.removeItem(atPath: configPath)
                Logger.info(
                    "Removed original config file to clear locks",
                    metadata: ["name": vmDirContext.name])

                // Wait a moment for OS to fully release resources
                Thread.sleep(forTimeInterval: 0.1)

                // Restore from backup
                try? configData.write(to: URL(fileURLWithPath: configPath))
                Logger.info(
                    "Restored config file from backup", metadata: ["name": vmDirContext.name])
            } else {
                Logger.error(
                    "Could not read config file content for backup",
                    metadata: ["name": vmDirContext.name])
            }
        } else {
            Logger.info(
                "Config file does not exist, cannot perform aggressive unlock",
                metadata: ["name": vmDirContext.name])
        }

        // Final check: verify by trying to take (and releasing) a test lock
        if let finalHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url) {
            let lockResult = flock(finalHandle.fileDescriptor, LOCK_EX | LOCK_NB)
            if lockResult == 0 {
                Logger.info(
                    "Lock successfully cleared - verified by acquiring test lock",
                    metadata: ["name": vmDirContext.name])
                flock(finalHandle.fileDescriptor, LOCK_UN)
            } else {
                Logger.info(
                    "Lock still present after all clearing attempts",
                    metadata: ["name": vmDirContext.name, "severity": "warning"])
            }
            try? finalHandle.close()
        }
    }
529 |
530 | // MARK: - Resource Management
531 |
    /// Replaces the in-memory VM configuration and persists it to disk.
    /// - Parameter vmConfig: The new configuration to store.
    /// - Throws: Any error raised while writing the config file.
    func updateVMConfig(vmConfig: VMConfig) throws {
        vmDirContext.config = vmConfig
        try vmDirContext.saveConfig()
    }
536 |
537 | private func getDiskSize() throws -> DiskSize {
538 | let resourceValues = try vmDirContext.diskPath.url.resourceValues(forKeys: [
539 | .totalFileAllocatedSizeKey,
540 | .totalFileSizeKey,
541 | ])
542 |
543 | guard let allocated = resourceValues.totalFileAllocatedSize,
544 | let total = resourceValues.totalFileSize
545 | else {
546 | throw VMConfigError.invalidDiskSize
547 | }
548 |
549 | return DiskSize(allocated: UInt64(allocated), total: UInt64(total))
550 | }
551 |
552 | func resizeDisk(_ newSize: UInt64) throws {
553 | let currentSize = try getDiskSize()
554 |
555 | guard newSize >= currentSize.total else {
556 | throw VMError.resizeTooSmall(current: currentSize.total, requested: newSize)
557 | }
558 |
559 | try setDiskSize(newSize)
560 | }
561 |
562 | func setCpuCount(_ newCpuCount: Int) throws {
563 | guard !isRunning else {
564 | throw VMError.alreadyRunning(vmDirContext.name)
565 | }
566 | vmDirContext.config.setCpuCount(newCpuCount)
567 | try vmDirContext.saveConfig()
568 | }
569 |
570 | func setMemorySize(_ newMemorySize: UInt64) throws {
571 | guard !isRunning else {
572 | throw VMError.alreadyRunning(vmDirContext.name)
573 | }
574 | vmDirContext.config.setMemorySize(newMemorySize)
575 | try vmDirContext.saveConfig()
576 | }
577 |
    /// Resizes the backing disk image, then records the new size in config.
    /// NOTE(review): unlike the other setters this does not reject a running
    /// VM — confirm whether online resize is intentional.
    func setDiskSize(_ newDiskSize: UInt64) throws {
        try vmDirContext.setDisk(newDiskSize)
        vmDirContext.config.setDiskSize(newDiskSize)
        try vmDirContext.saveConfig()
    }
583 |
/// Sets the display resolution from a string such as "1920x1080".
/// - Throws: `VMError.alreadyRunning` while the VM runs;
///   `VMError.invalidDisplayResolution` if the string does not parse.
func setDisplay(_ newDisplay: String) throws {
    if isRunning {
        throw VMError.alreadyRunning(vmDirContext.name)
    }
    guard let resolution = VMDisplayResolution(string: newDisplay) else {
        throw VMError.invalidDisplayResolution(newDisplay)
    }
    vmDirContext.config.setDisplay(resolution)
    try vmDirContext.saveConfig()
}
594 |
/// Stores the opaque virtualization hardware-model blob in the config.
/// - Throws: `VMError.alreadyRunning` if the VM is running.
func setHardwareModel(_ newHardwareModel: Data) throws {
    if isRunning {
        throw VMError.alreadyRunning(vmDirContext.name)
    }
    vmDirContext.config.setHardwareModel(newHardwareModel)
    try vmDirContext.saveConfig()
}
602 |
/// Stores the opaque machine-identifier blob in the config.
/// - Throws: `VMError.alreadyRunning` if the VM is running.
func setMachineIdentifier(_ newMachineIdentifier: Data) throws {
    if isRunning {
        throw VMError.alreadyRunning(vmDirContext.name)
    }
    vmDirContext.config.setMachineIdentifier(newMachineIdentifier)
    try vmDirContext.saveConfig()
}
610 |
/// Stores the VM's MAC address string in the config.
/// - Throws: `VMError.alreadyRunning` if the VM is running.
func setMacAddress(_ newMacAddress: String) throws {
    if isRunning {
        throw VMError.alreadyRunning(vmDirContext.name)
    }
    vmDirContext.config.setMacAddress(newMacAddress)
    try vmDirContext.saveConfig()
}
618 |
619 | // MARK: - VNC Management
620 |
/// Returns the current VNC URL, or nil when the VNC service is not running.
func getVNCUrl() -> String? {
    vncService.url
}
624 |
/// Starts the VNC service against the live virtual machine and returns its URL.
/// - Parameter port: port to bind; 0 lets the service pick one.
/// - Throws: `VMError.internalError` if the virtualization service is missing,
///   `VMError.vncNotConfigured` if no URL is available after start.
private func startVNCService(port: Int = 0) async throws -> String {
    guard let service = virtualizationService else {
        throw VMError.internalError("Virtualization service not initialized")
    }
    try await vncService.start(port: port, virtualMachine: service.getVirtualMachine())
    if let url = vncService.url {
        return url
    }
    throw VMError.vncNotConfigured
}
639 |
/// Persists the VNC session (URL plus any shared directories) into the VM
/// directory. Failures are logged, never thrown — session persistence is
/// best-effort.
private func saveSessionData(url: String, sharedDirectories: [SharedDirectory]) {
    let session = VNCSession(
        url: url, sharedDirectories: sharedDirectories.isEmpty ? nil : sharedDirectories)
    do {
        try vmDirContext.dir.saveSession(session)
        Logger.info(
            "Saved VNC session with shared directories",
            metadata: [
                "count": "\(sharedDirectories.count)",
                "dirs": "\(sharedDirectories.map { $0.hostPath }.joined(separator: ", "))",
                "sessionsPath": "\(vmDirContext.dir.sessionsPath.path)",
            ])
    } catch {
        Logger.error("Failed to save VNC session", metadata: ["error": "\(error)"])
    }
}
657 |
/// Starts VNC, persists the session data, and (unless `noDisplay`) opens the
/// local VNC client. Returns the VNC URL.
private func setupSession(
    noDisplay: Bool, port: Int = 0, sharedDirectories: [SharedDirectory] = []
) async throws -> String {
    let url = try await startVNCService(port: port)

    saveSessionData(url: url, sharedDirectories: sharedDirectories)

    guard !noDisplay else { return url }

    Logger.info("Starting VNC session", metadata: ["name": vmDirContext.name])
    try await vncService.openClient(url: url)
    return url
}
676 |
677 | // MARK: - Platform-specific Methods
678 |
/// Returns the guest OS type identifier for this VM.
/// Abstract: subclasses must override; the base implementation traps.
func getOSType() -> String {
    fatalError("Must be implemented by subclass")
}
682 |
/// Builds the context object handed to the virtualization service factory.
/// Runs a disk-state diagnostic first so storage-attachment failures are
/// traceable in the logs.
/// - Throws: `VMError.internalError` if the config has no MAC address
///   (previously this force-unwrapped and crashed the process); rethrows
///   errors from `validateDiskState()`.
func createVMVirtualizationServiceContext(
    cpuCount: Int,
    memorySize: UInt64,
    display: String,
    sharedDirectories: [SharedDirectory] = [],
    mount: Path? = nil,
    recoveryMode: Bool = false,
    usbMassStoragePaths: [Path]? = nil
) throws -> VMVirtualizationServiceContext {
    // This is a diagnostic log to track actual file paths on disk for debugging
    try validateDiskState()

    // Fail with a typed error instead of crashing when the VM was never
    // assigned a MAC address (e.g. a corrupted or hand-edited config).
    guard let macAddress = vmDirContext.config.macAddress else {
        throw VMError.internalError("VM config is missing a MAC address")
    }

    return VMVirtualizationServiceContext(
        cpuCount: cpuCount,
        memorySize: memorySize,
        display: display,
        sharedDirectories: sharedDirectories,
        mount: mount,
        hardwareModel: vmDirContext.config.hardwareModel,
        machineIdentifier: vmDirContext.config.machineIdentifier,
        macAddress: macAddress,
        diskPath: vmDirContext.diskPath,
        nvramPath: vmDirContext.nvramPath,
        recoveryMode: recoveryMode,
        usbMassStoragePaths: usbMassStoragePaths
    )
}
710 |
/// Validates the disk image's on-disk state and logs diagnostics to help
/// debug storage-attachment issues. Never throws for a bad state — it only
/// logs errors — but rethrows nothing today (`throws` kept for API stability).
private func validateDiskState() throws {
    let diskPath = vmDirContext.diskPath.path
    let diskExists = FileManager.default.fileExists(atPath: diskPath)
    var diskSize: UInt64 = 0
    var diskPermissions = ""

    if diskExists {
        if let attrs = try? FileManager.default.attributesOfItem(atPath: diskPath) {
            diskSize = attrs[.size] as? UInt64 ?? 0
            let posixPerms = attrs[.posixPermissions] as? Int ?? 0
            diskPermissions = String(format: "%o", posixPerms)
        }
    }

    // Check the containing directory's permissions. The previous
    // `try? ... as? Int ?? 0` chain folded a *missing* permissions key into
    // "0"; now both "attributes unreadable" and "key absent" report "unknown".
    let diskDir = (diskPath as NSString).deletingLastPathComponent
    let dirPermsString: String
    if let dirAttrs = try? FileManager.default.attributesOfItem(atPath: diskDir),
        let dirPosix = dirAttrs[.posixPermissions] as? Int
    {
        dirPermsString = String(format: "%o", dirPosix)
    } else {
        dirPermsString = "unknown"
    }

    // Log detailed diagnostics
    Logger.info(
        "Validating VM disk state",
        metadata: [
            "diskPath": diskPath,
            "diskExists": "\(diskExists)",
            "diskSize":
                "\(ByteCountFormatter.string(fromByteCount: Int64(diskSize), countStyle: .file))",
            "diskPermissions": diskPermissions,
            "dirPermissions": dirPermsString,
            "locationName": vmDirContext.storage ?? "default",
        ])

    if !diskExists {
        Logger.error("VM disk image does not exist", metadata: ["diskPath": diskPath])
    } else if diskSize == 0 {
        Logger.error("VM disk image exists but has zero size", metadata: ["diskPath": diskPath])
    }
}
753 |
/// Performs first-time VM installation from an IPSW image with the given
/// hardware parameters.
/// Abstract: subclasses must override; the base implementation traps.
func setup(
    ipswPath: String,
    cpuCount: Int,
    memorySize: UInt64,
    diskSize: UInt64,
    display: String
) async throws {
    fatalError("Must be implemented by subclass")
}
763 |
764 | // MARK: - Finalization
765 |
/// Post-installation step: relocates the VM directory into the target home,
/// giving it its final name.
func finalize(to name: String, home: Home, storage: String? = nil) throws {
    let destination = try home.getVMDirectory(name, storage: storage)
    try FileManager.default.moveItem(at: vmDirContext.dir.dir.url, to: destination.dir.url)
}
771 |
/// Runs the VM with additional USB mass-storage devices attached.
///
/// Acquires an exclusive advisory lock on the config file so a second `run`
/// of the same VM fails fast with `alreadyRunning`. On the success path the
/// lock (and file handle) are deliberately held for the life of the process —
/// the method never returns normally; it parks in an infinite sleep loop
/// until the process is terminated. The lock is released only on the error
/// path.
func runWithUSBStorage(
    noDisplay: Bool, sharedDirectories: [SharedDirectory], mount: Path?, vncPort: Int = 0,
    recoveryMode: Bool = false, usbImagePaths: [Path]
) async throws {
    guard vmDirContext.initialized else {
        throw VMError.notInitialized(vmDirContext.name)
    }

    // cpuCount/memorySize are required; their absence means setup never ran.
    guard let cpuCount = vmDirContext.config.cpuCount,
        let memorySize = vmDirContext.config.memorySize
    else {
        throw VMError.notInitialized(vmDirContext.name)
    }

    // Try to acquire lock on config file (non-blocking: LOCK_NB makes a held
    // lock surface immediately as "already running" rather than hanging).
    let fileHandle = try FileHandle(forWritingTo: vmDirContext.dir.configPath.url)
    guard flock(fileHandle.fileDescriptor, LOCK_EX | LOCK_NB) == 0 else {
        try? fileHandle.close()
        throw VMError.alreadyRunning(vmDirContext.name)
    }

    Logger.info(
        "Running VM with USB storage devices",
        metadata: [
            "cpuCount": "\(cpuCount)",
            "memorySize": "\(memorySize)",
            "diskSize": "\(vmDirContext.config.diskSize ?? 0)",
            "usbImageCount": "\(usbImagePaths.count)",
            "recoveryMode": "\(recoveryMode)",
        ])

    // Create and configure the VM
    do {
        let config = try createVMVirtualizationServiceContext(
            cpuCount: cpuCount,
            memorySize: memorySize,
            display: vmDirContext.config.display.string,
            sharedDirectories: sharedDirectories,
            mount: mount,
            recoveryMode: recoveryMode,
            usbMassStoragePaths: usbImagePaths
        )
        virtualizationService = try virtualizationServiceFactory(config)

        let vncInfo = try await setupSession(
            noDisplay: noDisplay, port: vncPort, sharedDirectories: sharedDirectories)
        Logger.info("VNC info", metadata: ["vncInfo": vncInfo])

        // Start the VM
        guard let service = virtualizationService else {
            throw VMError.internalError("Virtualization service not initialized")
        }
        try await service.start()

        // Intentional: keep the process (and the config lock) alive while the
        // VM runs; termination happens externally.
        while true {
            try await Task.sleep(nanoseconds: UInt64(1e9))
        }
    } catch {
        Logger.error(
            "Failed to create/start VM with USB storage",
            metadata: [
                "error": "\(error)",
                "errorType": "\(type(of: error))",
            ])
        // Tear down partially-started services, then release the config lock.
        virtualizationService = nil
        vncService.stop()
        // Release lock
        flock(fileHandle.fileDescriptor, LOCK_UN)
        try? fileHandle.close()
        throw error
    }
}
845 | }
846 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/agent.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | ComputerAgent - Main agent class that selects and runs agent loops
3 | """
4 |
5 | import asyncio
6 | import inspect
7 | import json
8 | from pathlib import Path
9 | from typing import (
10 | Any,
11 | AsyncGenerator,
12 | Callable,
13 | Dict,
14 | List,
15 | Optional,
16 | Set,
17 | Tuple,
18 | Union,
19 | cast,
20 | )
21 |
22 | import litellm
23 | import litellm.utils
24 | from litellm.responses.utils import Usage
25 |
26 | from .adapters import (
27 | AzureMLAdapter,
28 | CUAAdapter,
29 | HuggingFaceLocalAdapter,
30 | HumanAdapter,
31 | MLXVLMAdapter,
32 | )
33 | from .callbacks import (
34 | BudgetManagerCallback,
35 | ImageRetentionCallback,
36 | LoggingCallback,
37 | OperatorNormalizerCallback,
38 | PromptInstructionsCallback,
39 | TelemetryCallback,
40 | TrajectorySaverCallback,
41 | )
42 | from .computers import AsyncComputerHandler, is_agent_computer, make_computer_handler
43 | from .decorators import find_agent_config
44 | from .responses import (
45 | make_tool_error_item,
46 | replace_failed_computer_calls_with_function_calls,
47 | )
48 | from .tools.base import BaseComputerTool, BaseTool
49 | from .types import AgentCapability, IllegalArgumentError, Messages, ToolError
50 |
51 |
def assert_callable_with(f, *args, **kwargs):
    """Validate that ``f`` accepts the given arguments.

    Returns True when the signature binds; raises IllegalArgumentError
    (chained from the underlying TypeError) when it does not.
    """
    sig = inspect.signature(f)
    try:
        sig.bind(*args, **kwargs)
    except TypeError as exc:
        raise IllegalArgumentError(f"Expected {sig}, got args={args} kwargs={kwargs}") from exc
    return True
60 |
61 |
def get_json(obj: Any, max_depth: int = 10) -> Any:
    """Convert an arbitrary object into JSON-safe data.

    Best-effort serialization with three protections: pydantic-style
    ``model_dump()`` is preferred when present, recursion is capped at
    ``max_depth``, and circular references are detected via object ids.
    None values are stripped from dicts and lists throughout.
    """

    def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any:
        # `seen` holds ids of objects on the *current* path; each child gets a
        # copy so sibling branches can legitimately reuse the same object.
        if seen is None:
            seen = set()

        # Use model_dump() if available
        # NOTE: checked before the depth/cycle guards, so model objects are
        # dumped whole regardless of nesting depth.
        if hasattr(o, "model_dump"):
            return o.model_dump()

        # Check depth limit
        if depth > max_depth:
            return f"<max_depth_exceeded:{max_depth}>"

        # Check for circular references using object id
        obj_id = id(o)
        if obj_id in seen:
            return f"<circular_reference:{type(o).__name__}>"

        # Handle Computer objects — replaced with a placeholder rather than
        # serialized (they hold live connections/state).
        if hasattr(o, "__class__") and "computer" in o.__class__.__name__.lower():
            return f"<computer:{o.__class__.__name__}>"

        # Handle objects with __dict__
        if hasattr(o, "__dict__"):
            seen.add(obj_id)
            try:
                result = {}
                for k, v in o.__dict__.items():
                    if v is not None:
                        # Recursively serialize with updated depth and seen set
                        serialized_value = custom_serializer(v, depth + 1, seen.copy())
                        result[k] = serialized_value
                return result
            finally:
                seen.discard(obj_id)

        # Handle common types that might contain nested objects
        elif isinstance(o, dict):
            seen.add(obj_id)
            try:
                return {
                    k: custom_serializer(v, depth + 1, seen.copy())
                    for k, v in o.items()
                    if v is not None
                }
            finally:
                seen.discard(obj_id)

        # Tuples and sets are flattened to JSON arrays (lists).
        elif isinstance(o, (list, tuple, set)):
            seen.add(obj_id)
            try:
                return [
                    custom_serializer(item, depth + 1, seen.copy())
                    for item in o
                    if item is not None
                ]
            finally:
                seen.discard(obj_id)

        # For basic types that json.dumps can handle
        elif isinstance(o, (str, int, float, bool)) or o is None:
            return o

        # Fallback to string representation
        else:
            return str(o)

    def remove_nones(obj: Any) -> Any:
        # Second pass: model_dump() results were not filtered above, so they
        # can still contain None values.
        if isinstance(obj, dict):
            return {k: remove_nones(v) for k, v in obj.items() if v is not None}
        elif isinstance(obj, list):
            return [remove_nones(item) for item in obj if item is not None]
        return obj

    # Serialize with circular reference and depth protection
    serialized = custom_serializer(obj)

    # Convert to JSON string and back to ensure JSON compatibility
    json_str = json.dumps(serialized)
    parsed = json.loads(json_str)

    # Final cleanup of any remaining None values
    return remove_nones(parsed)
145 |
146 |
147 | def sanitize_message(msg: Any) -> Any:
148 | """Return a copy of the message with image_url omitted for computer_call_output messages."""
149 | if msg.get("type") == "computer_call_output":
150 | output = msg.get("output", {})
151 | if isinstance(output, dict):
152 | sanitized = msg.copy()
153 | sanitized["output"] = {**output, "image_url": "[omitted]"}
154 | return sanitized
155 | return msg
156 |
157 |
158 | def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]:
159 | call_ids = []
160 | for message in messages:
161 | if (
162 | message.get("type") == "computer_call_output"
163 | or message.get("type") == "function_call_output"
164 | ):
165 | call_ids.append(message.get("call_id"))
166 | return call_ids
167 |
168 |
169 | class ComputerAgent:
170 | """
171 | Main agent class that automatically selects the appropriate agent loop
172 | based on the model and executes tool calls.
173 | """
174 |
175 | def __init__(
176 | self,
177 | model: str,
178 | tools: Optional[List[Any]] = None,
179 | custom_loop: Optional[Callable] = None,
180 | only_n_most_recent_images: Optional[int] = None,
181 | callbacks: Optional[List[Any]] = None,
182 | instructions: Optional[str] = None,
183 | verbosity: Optional[int] = None,
184 | trajectory_dir: Optional[str | Path | dict] = None,
185 | max_retries: Optional[int] = 3,
186 | screenshot_delay: Optional[float | int] = 0.5,
187 | use_prompt_caching: Optional[bool] = False,
188 | max_trajectory_budget: Optional[float | dict] = None,
189 | telemetry_enabled: Optional[bool] = True,
190 | trust_remote_code: Optional[bool] = False,
191 | api_key: Optional[str] = None,
192 | api_base: Optional[str] = None,
193 | **additional_generation_kwargs,
194 | ):
195 | """
196 | Initialize ComputerAgent.
197 |
198 | Args:
199 | model: Model name (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
200 | tools: List of tools (computer objects, decorated functions, etc.)
201 | custom_loop: Custom agent loop function to use instead of auto-selection
202 | only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
203 | callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing
204 | instructions: Optional system instructions to be passed to the model
205 | verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically
206 | trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically.
207 | max_retries: Maximum number of retries for failed API calls
208 | screenshot_delay: Delay before screenshots in seconds
209 | use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers.
210 | max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded
211 | telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
212 | trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
213 | api_key: Optional API key override for the model provider
214 | api_base: Optional API base URL override for the model provider
215 | **additional_generation_kwargs: Additional arguments passed to the model provider
216 | """
217 | # If the loop is "human/human", we need to prefix a grounding model fallback
218 | if model in ["human/human", "human"]:
219 | model = "openai/computer-use-preview+human/human"
220 |
221 | self.model = model
222 | self.tools = tools or []
223 | self.custom_loop = custom_loop
224 | self.only_n_most_recent_images = only_n_most_recent_images
225 | self.callbacks = callbacks or []
226 | self.instructions = instructions
227 | self.verbosity = verbosity
228 | self.trajectory_dir = trajectory_dir
229 | self.max_retries = max_retries
230 | self.screenshot_delay = screenshot_delay
231 | self.use_prompt_caching = use_prompt_caching
232 | self.telemetry_enabled = telemetry_enabled
233 | self.kwargs = additional_generation_kwargs
234 | self.trust_remote_code = trust_remote_code
235 | self.api_key = api_key
236 | self.api_base = api_base
237 |
238 | # == Add built-in callbacks ==
239 |
240 | # Prepend operator normalizer callback
241 | self.callbacks.insert(0, OperatorNormalizerCallback())
242 |
243 | # Add prompt instructions callback if provided
244 | if self.instructions:
245 | self.callbacks.append(PromptInstructionsCallback(self.instructions))
246 |
247 | # Add logging callback if verbosity is set
248 | if self.verbosity is not None:
249 | self.callbacks.append(LoggingCallback(level=self.verbosity))
250 |
251 | # Add image retention callback if only_n_most_recent_images is set
252 | if self.only_n_most_recent_images:
253 | self.callbacks.append(ImageRetentionCallback(self.only_n_most_recent_images))
254 |
255 | # Add trajectory saver callback if trajectory_dir is set
256 | if self.trajectory_dir:
257 | if isinstance(self.trajectory_dir, dict):
258 | self.callbacks.append(TrajectorySaverCallback(**self.trajectory_dir))
259 | elif isinstance(self.trajectory_dir, (str, Path)):
260 | self.callbacks.append(TrajectorySaverCallback(str(self.trajectory_dir)))
261 |
262 | # Add budget manager if max_trajectory_budget is set
263 | if max_trajectory_budget:
264 | if isinstance(max_trajectory_budget, dict):
265 | self.callbacks.append(BudgetManagerCallback(**max_trajectory_budget))
266 | else:
267 | self.callbacks.append(BudgetManagerCallback(max_trajectory_budget))
268 |
269 | # == Enable local model providers w/ LiteLLM ==
270 |
271 | # Register local model providers
272 | hf_adapter = HuggingFaceLocalAdapter(
273 | device="auto", trust_remote_code=self.trust_remote_code or False
274 | )
275 | human_adapter = HumanAdapter()
276 | mlx_adapter = MLXVLMAdapter()
277 | cua_adapter = CUAAdapter()
278 | azure_ml_adapter = AzureMLAdapter()
279 | litellm.custom_provider_map = [
280 | {"provider": "huggingface-local", "custom_handler": hf_adapter},
281 | {"provider": "human", "custom_handler": human_adapter},
282 | {"provider": "mlx", "custom_handler": mlx_adapter},
283 | {"provider": "cua", "custom_handler": cua_adapter},
284 | {"provider": "azure_ml", "custom_handler": azure_ml_adapter},
285 | ]
286 | litellm.suppress_debug_info = True
287 |
288 | # == Initialize computer agent ==
289 |
290 | # Find the appropriate agent loop
291 | if custom_loop:
292 | self.agent_loop = custom_loop
293 | self.agent_config_info = None
294 | else:
295 | config_info = find_agent_config(model)
296 | if not config_info:
297 | raise ValueError(f"No agent config found for model: {model}")
298 | # Instantiate the agent config class
299 | self.agent_loop = config_info.agent_class()
300 | self.agent_config_info = config_info
301 |
302 | # Add telemetry callback AFTER agent_loop is set so it can capture the correct agent_type
303 | if self.telemetry_enabled:
304 | if isinstance(self.telemetry_enabled, bool):
305 | self.callbacks.append(TelemetryCallback(self))
306 | else:
307 | self.callbacks.append(TelemetryCallback(self, **self.telemetry_enabled))
308 |
309 | self.tool_schemas = []
310 | self.computer_handler = None
311 |
312 | async def _initialize_computers(self):
313 | """Initialize computer objects"""
314 | if not self.tool_schemas:
315 | # Process tools and create tool schemas
316 | self.tool_schemas = self._process_tools()
317 |
318 | # Find computer tool and create interface adapter
319 | computer_handler = None
320 |
321 | # First check if any tool is a BaseComputerTool instance
322 | for tool in self.tools:
323 | if isinstance(tool, BaseComputerTool):
324 | computer_handler = tool
325 | break
326 |
327 | # If no BaseComputerTool found, look for traditional computer objects
328 | if computer_handler is None:
329 | for schema in self.tool_schemas:
330 | if schema["type"] == "computer":
331 | computer_handler = await make_computer_handler(schema["computer"])
332 | break
333 |
334 | self.computer_handler = computer_handler
335 |
336 | def _process_input(self, input: Messages) -> List[Dict[str, Any]]:
337 | """Process input messages and create schemas for the agent loop"""
338 | if isinstance(input, str):
339 | return [{"role": "user", "content": input}]
340 | return [get_json(msg) for msg in input]
341 |
342 | def _process_tools(self) -> List[Dict[str, Any]]:
343 | """Process tools and create schemas for the agent loop"""
344 | schemas = []
345 |
346 | for tool in self.tools:
347 | # Check if it's a computer object (has interface attribute)
348 | if is_agent_computer(tool):
349 | # This is a computer tool - will be handled by agent loop
350 | schemas.append({"type": "computer", "computer": tool})
351 | elif isinstance(tool, BaseTool):
352 | # BaseTool instance - extract schema from its properties
353 | function_schema = {
354 | "name": tool.name,
355 | "description": tool.description,
356 | "parameters": tool.parameters,
357 | }
358 | schemas.append({"type": "function", "function": function_schema})
359 | elif callable(tool):
360 | # Use litellm.utils.function_to_dict to extract schema from docstring
361 | try:
362 | function_schema = litellm.utils.function_to_dict(tool)
363 | schemas.append({"type": "function", "function": function_schema})
364 | except Exception as e:
365 | print(f"Warning: Could not process tool {tool}: {e}")
366 | else:
367 | print(f"Warning: Unknown tool type: {tool}")
368 |
369 | return schemas
370 |
371 | def _get_tool(self, name: str) -> Optional[Union[Callable, BaseTool]]:
372 | """Get a tool by name"""
373 | for tool in self.tools:
374 | # Check if it's a BaseTool instance
375 | if isinstance(tool, BaseTool) and tool.name == name:
376 | return tool
377 | # Check if it's a regular callable
378 | elif hasattr(tool, "__name__") and tool.__name__ == name:
379 | return tool
380 | elif hasattr(tool, "func") and tool.func.__name__ == name:
381 | return tool
382 | return None
383 |
384 | # ============================================================================
385 | # AGENT RUN LOOP LIFECYCLE HOOKS
386 | # ============================================================================
387 |
388 | async def _on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
389 | """Initialize run tracking by calling callbacks."""
390 | for callback in self.callbacks:
391 | if hasattr(callback, "on_run_start"):
392 | await callback.on_run_start(kwargs, old_items)
393 |
394 | async def _on_run_end(
395 | self,
396 | kwargs: Dict[str, Any],
397 | old_items: List[Dict[str, Any]],
398 | new_items: List[Dict[str, Any]],
399 | ) -> None:
400 | """Finalize run tracking by calling callbacks."""
401 | for callback in self.callbacks:
402 | if hasattr(callback, "on_run_end"):
403 | await callback.on_run_end(kwargs, old_items, new_items)
404 |
405 | async def _on_run_continue(
406 | self,
407 | kwargs: Dict[str, Any],
408 | old_items: List[Dict[str, Any]],
409 | new_items: List[Dict[str, Any]],
410 | ) -> bool:
411 | """Check if run should continue by calling callbacks."""
412 | for callback in self.callbacks:
413 | if hasattr(callback, "on_run_continue"):
414 | should_continue = await callback.on_run_continue(kwargs, old_items, new_items)
415 | if not should_continue:
416 | return False
417 | return True
418 |
419 | async def _on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
420 | """Prepare messages for the LLM call by applying callbacks."""
421 | result = messages
422 | for callback in self.callbacks:
423 | if hasattr(callback, "on_llm_start"):
424 | result = await callback.on_llm_start(result)
425 | return result
426 |
427 | async def _on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
428 | """Postprocess messages after the LLM call by applying callbacks."""
429 | result = messages
430 | for callback in self.callbacks:
431 | if hasattr(callback, "on_llm_end"):
432 | result = await callback.on_llm_end(result)
433 | return result
434 |
435 | async def _on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
436 | """Called when responses are received."""
437 | for callback in self.callbacks:
438 | if hasattr(callback, "on_responses"):
439 | await callback.on_responses(get_json(kwargs), get_json(responses))
440 |
441 | async def _on_computer_call_start(self, item: Dict[str, Any]) -> None:
442 | """Called when a computer call is about to start."""
443 | for callback in self.callbacks:
444 | if hasattr(callback, "on_computer_call_start"):
445 | await callback.on_computer_call_start(get_json(item))
446 |
447 | async def _on_computer_call_end(
448 | self, item: Dict[str, Any], result: List[Dict[str, Any]]
449 | ) -> None:
450 | """Called when a computer call has completed."""
451 | for callback in self.callbacks:
452 | if hasattr(callback, "on_computer_call_end"):
453 | await callback.on_computer_call_end(get_json(item), get_json(result))
454 |
455 | async def _on_function_call_start(self, item: Dict[str, Any]) -> None:
456 | """Called when a function call is about to start."""
457 | for callback in self.callbacks:
458 | if hasattr(callback, "on_function_call_start"):
459 | await callback.on_function_call_start(get_json(item))
460 |
461 | async def _on_function_call_end(
462 | self, item: Dict[str, Any], result: List[Dict[str, Any]]
463 | ) -> None:
464 | """Called when a function call has completed."""
465 | for callback in self.callbacks:
466 | if hasattr(callback, "on_function_call_end"):
467 | await callback.on_function_call_end(get_json(item), get_json(result))
468 |
469 | async def _on_text(self, item: Dict[str, Any]) -> None:
470 | """Called when a text message is encountered."""
471 | for callback in self.callbacks:
472 | if hasattr(callback, "on_text"):
473 | await callback.on_text(get_json(item))
474 |
475 | async def _on_api_start(self, kwargs: Dict[str, Any]) -> None:
476 | """Called when an LLM API call is about to start."""
477 | for callback in self.callbacks:
478 | if hasattr(callback, "on_api_start"):
479 | await callback.on_api_start(get_json(kwargs))
480 |
481 | async def _on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
482 | """Called when an LLM API call has completed."""
483 | for callback in self.callbacks:
484 | if hasattr(callback, "on_api_end"):
485 | await callback.on_api_end(get_json(kwargs), get_json(result))
486 |
487 | async def _on_usage(self, usage: Dict[str, Any]) -> None:
488 | """Called when usage information is received."""
489 | for callback in self.callbacks:
490 | if hasattr(callback, "on_usage"):
491 | await callback.on_usage(get_json(usage))
492 |
493 | async def _on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
494 | """Called when a screenshot is taken."""
495 | for callback in self.callbacks:
496 | if hasattr(callback, "on_screenshot"):
497 | await callback.on_screenshot(screenshot, name)
498 |
499 | # ============================================================================
500 | # AGENT OUTPUT PROCESSING
501 | # ============================================================================
502 |
    async def _handle_item(
        self,
        item: Any,
        computer: Optional[AsyncComputerHandler] = None,
        ignore_call_ids: Optional[List[str]] = None,
    ) -> List[Dict[str, Any]]:
        """Execute one model output item and return its output messages.

        - "message" items only trigger the on_text callbacks; no output.
        - "computer_call" items run the action on `computer`, take a
          post-action screenshot, and return a computer_call_output.
        - "function_call" items invoke the matching tool and return a
          function_call_output.
        ToolError is converted into an error output item; any other
        exception propagates to the caller. Items whose call_id appears in
        `ignore_call_ids` (already answered) are skipped.
        """
        call_id = item.get("call_id")
        if ignore_call_ids and call_id and call_id in ignore_call_ids:
            return []

        item_type = item.get("type", None)

        if item_type == "message":
            await self._on_text(item)
            # # Print messages
            # if item.get("content"):
            #     for content_item in item.get("content"):
            #         if content_item.get("text"):
            #             print(content_item.get("text"))
            return []

        try:
            if item_type == "computer_call":
                await self._on_computer_call_start(item)
                if not computer:
                    raise ValueError("Computer handler is required for computer calls")

                # Perform computer actions
                action = item.get("action")
                action_type = action.get("type")
                if action_type is None:
                    print(
                        f"Action type cannot be `None`: action={action}, action_type={action_type}"
                    )
                    return []

                # Extract action arguments (all fields except 'type')
                action_args = {k: v for k, v in action.items() if k != "type"}

                # print(f"{action_type}({action_args})")

                # Execute the computer action: the action type names a method
                # on the computer handler (e.g. click, type, scroll).
                computer_method = getattr(computer, action_type, None)
                if computer_method:
                    assert_callable_with(computer_method, **action_args)
                    await computer_method(**action_args)
                else:
                    raise ToolError(f"Unknown computer action: {action_type}")

                # Take screenshot after action (optionally after a settle delay)
                if self.screenshot_delay and self.screenshot_delay > 0:
                    await asyncio.sleep(self.screenshot_delay)
                screenshot_base64 = await computer.screenshot()
                await self._on_screenshot(screenshot_base64, "screenshot_after")

                # Handle safety checks — currently auto-acknowledged.
                pending_checks = item.get("pending_safety_checks", [])
                acknowledged_checks = []
                for check in pending_checks:
                    check_message = check.get("message", str(check))
                    acknowledged_checks.append(check)
                    # TODO: implement a callback for safety checks
                    # if acknowledge_safety_check_callback(check_message, allow_always=True):
                    #     acknowledged_checks.append(check)
                    # else:
                    #     raise ValueError(f"Safety check failed: {check_message}")

                # Create call output
                call_output = {
                    "type": "computer_call_output",
                    "call_id": item.get("call_id"),
                    "acknowledged_safety_checks": acknowledged_checks,
                    "output": {
                        "type": "input_image",
                        "image_url": f"data:image/png;base64,{screenshot_base64}",
                    },
                }

                # # Additional URL safety checks for browser environments
                # if await computer.get_environment() == "browser":
                #     current_url = await computer.get_current_url()
                #     call_output["output"]["current_url"] = current_url
                #     # TODO: implement a callback for URL safety checks
                #     # check_blocklisted_url(current_url)

                result = [call_output]
                await self._on_computer_call_end(item, result)
                return result

            if item_type == "function_call":
                await self._on_function_call_start(item)
                # Perform function call
                function = self._get_tool(item.get("name"))
                if not function:
                    raise ToolError(f"Function {item.get('name')} not found")

                args = json.loads(item.get("arguments"))

                # Handle BaseTool instances
                if isinstance(function, BaseTool):
                    # BaseTool.call() handles its own execution
                    result = function.call(args)
                else:
                    # Validate arguments before execution for regular callables
                    assert_callable_with(function, **args)

                    # Execute function - use asyncio.to_thread for non-async functions
                    if inspect.iscoroutinefunction(function):
                        result = await function(**args)
                    else:
                        result = await asyncio.to_thread(function, **args)

                # Create function call output (stringified result)
                call_output = {
                    "type": "function_call_output",
                    "call_id": item.get("call_id"),
                    "output": str(result),
                }

                result = [call_output]
                await self._on_function_call_end(item, result)
                return result
        except ToolError as e:
            # Tool-level failures become error output items so the loop can continue.
            return [make_tool_error_item(repr(e), call_id)]

        # Unknown item types produce no output.
        return []
630 |
631 | # ============================================================================
632 | # MAIN AGENT LOOP
633 | # ============================================================================
634 |
635 | async def run(
636 | self,
637 | messages: Messages,
638 | stream: bool = False,
639 | api_key: Optional[str] = None,
640 | api_base: Optional[str] = None,
641 | **additional_generation_kwargs,
642 | ) -> AsyncGenerator[Dict[str, Any], None]:
643 | """
644 | Run the agent with the given messages using Computer protocol handler pattern.
645 |
646 | Args:
647 | messages: List of message dictionaries
648 | stream: Whether to stream the response
649 | api_key: Optional API key override for the model provider
650 | api_base: Optional API base URL override for the model provider
651 | **additional_generation_kwargs: Additional arguments passed to the model provider
652 |
653 | Returns:
654 | AsyncGenerator that yields response chunks
655 | """
656 | if not self.agent_config_info:
657 | raise ValueError("Agent configuration not found")
658 |
659 | capabilities = self.get_capabilities()
660 | if "step" not in capabilities:
661 | raise ValueError(
662 | f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions"
663 | )
664 |
665 | await self._initialize_computers()
666 |
667 | # Merge kwargs and thread api credentials (run overrides constructor)
668 | merged_kwargs = {**self.kwargs, **additional_generation_kwargs}
669 | if (api_key is not None) or (self.api_key is not None):
670 | merged_kwargs["api_key"] = api_key if api_key is not None else self.api_key
671 | if (api_base is not None) or (self.api_base is not None):
672 | merged_kwargs["api_base"] = api_base if api_base is not None else self.api_base
673 |
674 | old_items = self._process_input(messages)
675 | new_items = []
676 |
677 | # Initialize run tracking
678 | run_kwargs = {
679 | "messages": messages,
680 | "stream": stream,
681 | "model": self.model,
682 | "agent_loop": self.agent_config_info.agent_class.__name__,
683 | **merged_kwargs,
684 | }
685 | await self._on_run_start(run_kwargs, old_items)
686 |
687 | while new_items[-1].get("role") != "assistant" if new_items else True:
688 | # Lifecycle hook: Check if we should continue based on callbacks (e.g., budget manager)
689 | should_continue = await self._on_run_continue(run_kwargs, old_items, new_items)
690 | if not should_continue:
691 | break
692 |
693 | # Lifecycle hook: Prepare messages for the LLM call
694 | # Use cases:
695 | # - PII anonymization
696 | # - Image retention policy
697 | combined_messages = old_items + new_items
698 | combined_messages = replace_failed_computer_calls_with_function_calls(combined_messages)
699 | preprocessed_messages = await self._on_llm_start(combined_messages)
700 |
701 | loop_kwargs = {
702 | "messages": preprocessed_messages,
703 | "model": self.model,
704 | "tools": self.tool_schemas,
705 | "stream": False,
706 | "computer_handler": self.computer_handler,
707 | "max_retries": self.max_retries,
708 | "use_prompt_caching": self.use_prompt_caching,
709 | **merged_kwargs,
710 | }
711 |
712 | # Run agent loop iteration
713 | result = await self.agent_loop.predict_step(
714 | **loop_kwargs,
715 | _on_api_start=self._on_api_start,
716 | _on_api_end=self._on_api_end,
717 | _on_usage=self._on_usage,
718 | _on_screenshot=self._on_screenshot,
719 | )
720 | result = get_json(result)
721 |
722 | # Lifecycle hook: Postprocess messages after the LLM call
723 | # Use cases:
724 | # - PII deanonymization (if you want tool calls to see PII)
725 | result["output"] = await self._on_llm_end(result.get("output", []))
726 | await self._on_responses(loop_kwargs, result)
727 |
728 | # Yield agent response
729 | yield result
730 |
731 | # Add agent response to new_items
732 | new_items += result.get("output")
733 |
734 | # Get output call ids
735 | output_call_ids = get_output_call_ids(result.get("output", []))
736 |
737 | # Handle computer actions
738 | for item in result.get("output"):
739 | partial_items = await self._handle_item(
740 | item, self.computer_handler, ignore_call_ids=output_call_ids
741 | )
742 | new_items += partial_items
743 |
744 | # Yield partial response
745 | yield {
746 | "output": partial_items,
747 | "usage": Usage(
748 | prompt_tokens=0,
749 | completion_tokens=0,
750 | total_tokens=0,
751 | ),
752 | }
753 |
754 | await self._on_run_end(loop_kwargs, old_items, new_items)
755 |
756 | async def predict_click(
757 | self, instruction: str, image_b64: Optional[str] = None
758 | ) -> Optional[Tuple[int, int]]:
759 | """
760 | Predict click coordinates based on image and instruction.
761 |
762 | Args:
763 | instruction: Instruction for where to click
764 | image_b64: Base64 encoded image (optional, will take screenshot if not provided)
765 |
766 | Returns:
767 | None or tuple with (x, y) coordinates
768 | """
769 | if not self.agent_config_info:
770 | raise ValueError("Agent configuration not found")
771 |
772 | capabilities = self.get_capabilities()
773 | if "click" not in capabilities:
774 | raise ValueError(
775 | f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions"
776 | )
777 | if hasattr(self.agent_loop, "predict_click"):
778 | if not image_b64:
779 | if not self.computer_handler:
780 | raise ValueError("Computer tool or image_b64 is required for predict_click")
781 | image_b64 = await self.computer_handler.screenshot()
782 | # Pass along api credentials if available
783 | click_kwargs: Dict[str, Any] = {}
784 | if self.api_key is not None:
785 | click_kwargs["api_key"] = self.api_key
786 | if self.api_base is not None:
787 | click_kwargs["api_base"] = self.api_base
788 | return await self.agent_loop.predict_click(
789 | model=self.model, image_b64=image_b64, instruction=instruction, **click_kwargs
790 | )
791 | return None
792 |
793 | def get_capabilities(self) -> List[AgentCapability]:
794 | """
795 | Get list of capabilities supported by the current agent config.
796 |
797 | Returns:
798 | List of capability strings (e.g., ["step", "click"])
799 | """
800 | if not self.agent_config_info:
801 | raise ValueError("Agent configuration not found")
802 |
803 | if hasattr(self.agent_loop, "get_capabilities"):
804 | return self.agent_loop.get_capabilities()
805 | return ["step"] # Default capability
806 |
807 | def open(self, port: Optional[int] = None):
808 | """
809 | Start the playground server and open it in the browser.
810 |
811 | This method starts a local HTTP server that exposes the /responses endpoint
812 | and automatically opens the CUA playground interface in the default browser.
813 |
814 | Args:
815 | port: Port to run the server on. If None, finds an available port automatically.
816 |
817 | Example:
818 | >>> agent = ComputerAgent(model="claude-sonnet-4")
819 | >>> agent.open() # Starts server and opens browser
820 | """
821 | from .playground import PlaygroundServer
822 |
823 | server = PlaygroundServer(agent_instance=self)
824 | server.start(port=port, open_browser=True)
825 |
```