This is page 22 of 29. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── scripts
│ │ ├── get_pyproject_version.py
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── README.md
│ │ └── test_get_pyproject_version.py
│ └── workflows
│ ├── bump-version.yml
│ ├── ci-lume.yml
│ ├── docker-publish-cua-linux.yml
│ ├── docker-publish-cua-windows.yml
│ ├── docker-publish-kasm.yml
│ ├── docker-publish-xfce.yml
│ ├── docker-reusable-publish.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── npm-publish-cli.yml
│ ├── npm-publish-computer.yml
│ ├── npm-publish-core.yml
│ ├── publish-lume.yml
│ ├── pypi-publish-agent.yml
│ ├── pypi-publish-computer-server.yml
│ ├── pypi-publish-computer.yml
│ ├── pypi-publish-core.yml
│ ├── pypi-publish-mcp-server.yml
│ ├── pypi-publish-som.yml
│ ├── pypi-reusable-publish.yml
│ ├── python-tests.yml
│ ├── test-cua-models.yml
│ └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│ ├── docs.code-workspace
│ ├── extensions.json
│ ├── launch.json
│ ├── libs-ts.code-workspace
│ ├── lume.code-workspace
│ ├── lumier.code-workspace
│ ├── py.code-workspace
│ └── settings.json
├── blog
│ ├── app-use.md
│ ├── assets
│ │ ├── composite-agents.png
│ │ ├── docker-ubuntu-support.png
│ │ ├── hack-booth.png
│ │ ├── hack-closing-ceremony.jpg
│ │ ├── hack-cua-ollama-hud.jpeg
│ │ ├── hack-leaderboard.png
│ │ ├── hack-the-north.png
│ │ ├── hack-winners.jpeg
│ │ ├── hack-workshop.jpeg
│ │ ├── hud-agent-evals.png
│ │ └── trajectory-viewer.jpeg
│ ├── bringing-computer-use-to-the-web.md
│ ├── build-your-own-operator-on-macos-1.md
│ ├── build-your-own-operator-on-macos-2.md
│ ├── cloud-windows-ga-macos-preview.md
│ ├── composite-agents.md
│ ├── computer-use-agents-for-growth-hacking.md
│ ├── cua-hackathon.md
│ ├── cua-playground-preview.md
│ ├── cua-vlm-router.md
│ ├── hack-the-north.md
│ ├── hud-agent-evals.md
│ ├── human-in-the-loop.md
│ ├── introducing-cua-cli.md
│ ├── introducing-cua-cloud-containers.md
│ ├── lume-to-containerization.md
│ ├── neurips-2025-cua-papers.md
│ ├── sandboxed-python-execution.md
│ ├── training-computer-use-models-trajectories-1.md
│ ├── trajectory-viewer.md
│ ├── ubuntu-docker-support.md
│ └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│ ├── .env.example
│ ├── .gitignore
│ ├── content
│ │ └── docs
│ │ ├── agent-sdk
│ │ │ ├── agent-loops.mdx
│ │ │ ├── benchmarks
│ │ │ │ ├── index.mdx
│ │ │ │ ├── interactive.mdx
│ │ │ │ ├── introduction.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── osworld-verified.mdx
│ │ │ │ ├── screenspot-pro.mdx
│ │ │ │ └── screenspot-v2.mdx
│ │ │ ├── callbacks
│ │ │ │ ├── agent-lifecycle.mdx
│ │ │ │ ├── cost-saving.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── pii-anonymization.mdx
│ │ │ │ └── trajectories.mdx
│ │ │ ├── chat-history.mdx
│ │ │ ├── custom-tools.mdx
│ │ │ ├── customizing-computeragent.mdx
│ │ │ ├── integrations
│ │ │ │ ├── hud.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── observability.mdx
│ │ │ ├── mcp-server
│ │ │ │ ├── client-integrations.mdx
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── llm-integrations.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── tools.mdx
│ │ │ │ └── usage.mdx
│ │ │ ├── message-format.mdx
│ │ │ ├── meta.json
│ │ │ ├── migration-guide.mdx
│ │ │ ├── prompt-caching.mdx
│ │ │ ├── supported-agents
│ │ │ │ ├── composed-agents.mdx
│ │ │ │ ├── computer-use-agents.mdx
│ │ │ │ ├── grounding-models.mdx
│ │ │ │ ├── human-in-the-loop.mdx
│ │ │ │ └── meta.json
│ │ │ ├── supported-model-providers
│ │ │ │ ├── cua-vlm-router.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ └── local-models.mdx
│ │ │ ├── telemetry.mdx
│ │ │ └── usage-tracking.mdx
│ │ ├── cli-playbook
│ │ │ ├── commands.mdx
│ │ │ ├── index.mdx
│ │ │ └── meta.json
│ │ ├── computer-sdk
│ │ │ ├── cloud-vm-management.mdx
│ │ │ ├── commands.mdx
│ │ │ ├── computer-server
│ │ │ │ ├── Commands.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── REST-API.mdx
│ │ │ │ └── WebSocket-API.mdx
│ │ │ ├── computer-ui.mdx
│ │ │ ├── computers.mdx
│ │ │ ├── custom-computer-handlers.mdx
│ │ │ ├── meta.json
│ │ │ ├── sandboxed-python.mdx
│ │ │ └── tracing-api.mdx
│ │ ├── example-usecases
│ │ │ ├── form-filling.mdx
│ │ │ ├── gemini-complex-ui-navigation.mdx
│ │ │ ├── meta.json
│ │ │ ├── post-event-contact-export.mdx
│ │ │ └── windows-app-behind-vpn.mdx
│ │ ├── get-started
│ │ │ ├── meta.json
│ │ │ └── quickstart.mdx
│ │ ├── index.mdx
│ │ ├── macos-vm-cli-playbook
│ │ │ ├── lume
│ │ │ │ ├── cli-reference.mdx
│ │ │ │ ├── faq.md
│ │ │ │ ├── http-api.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── prebuilt-images.mdx
│ │ │ ├── lumier
│ │ │ │ ├── building-lumier.mdx
│ │ │ │ ├── docker-compose.mdx
│ │ │ │ ├── docker.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ └── meta.json
│ │ │ └── meta.json
│ │ └── meta.json
│ ├── next.config.mjs
│ ├── package-lock.json
│ ├── package.json
│ ├── pnpm-lock.yaml
│ ├── postcss.config.mjs
│ ├── public
│ │ └── img
│ │ ├── agent_gradio_ui.png
│ │ ├── agent.png
│ │ ├── bg-dark.jpg
│ │ ├── bg-light.jpg
│ │ ├── cli.png
│ │ ├── computer.png
│ │ ├── grounding-with-gemini3.gif
│ │ ├── hero.png
│ │ ├── laminar_trace_example.png
│ │ ├── som_box_threshold.png
│ │ └── som_iou_threshold.png
│ ├── README.md
│ ├── source.config.ts
│ ├── src
│ │ ├── app
│ │ │ ├── (home)
│ │ │ │ ├── [[...slug]]
│ │ │ │ │ └── page.tsx
│ │ │ │ └── layout.tsx
│ │ │ ├── api
│ │ │ │ ├── posthog
│ │ │ │ │ └── [...path]
│ │ │ │ │ └── route.ts
│ │ │ │ └── search
│ │ │ │ └── route.ts
│ │ │ ├── favicon.ico
│ │ │ ├── global.css
│ │ │ ├── layout.config.tsx
│ │ │ ├── layout.tsx
│ │ │ ├── llms.mdx
│ │ │ │ └── [[...slug]]
│ │ │ │ └── route.ts
│ │ │ ├── llms.txt
│ │ │ │ └── route.ts
│ │ │ ├── robots.ts
│ │ │ └── sitemap.ts
│ │ ├── assets
│ │ │ ├── discord-black.svg
│ │ │ ├── discord-white.svg
│ │ │ ├── logo-black.svg
│ │ │ └── logo-white.svg
│ │ ├── components
│ │ │ ├── analytics-tracker.tsx
│ │ │ ├── cookie-consent.tsx
│ │ │ ├── doc-actions-menu.tsx
│ │ │ ├── editable-code-block.tsx
│ │ │ ├── footer.tsx
│ │ │ ├── hero.tsx
│ │ │ ├── iou.tsx
│ │ │ ├── mermaid.tsx
│ │ │ └── page-feedback.tsx
│ │ ├── lib
│ │ │ ├── llms.ts
│ │ │ └── source.ts
│ │ ├── mdx-components.tsx
│ │ └── providers
│ │ └── posthog-provider.tsx
│ └── tsconfig.json
├── examples
│ ├── agent_examples.py
│ ├── agent_ui_examples.py
│ ├── browser_agent_example.py
│ ├── browser_tool_example.py
│ ├── cloud_api_examples.py
│ ├── computer_examples_windows.py
│ ├── computer_examples.py
│ ├── computer_ui_examples.py
│ ├── computer-example-ts
│ │ ├── .env.example
│ │ ├── .gitignore
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── README.md
│ │ ├── src
│ │ │ ├── helpers.ts
│ │ │ └── index.ts
│ │ └── tsconfig.json
│ ├── docker_examples.py
│ ├── evals
│ │ ├── hud_eval_examples.py
│ │ └── wikipedia_most_linked.txt
│ ├── pylume_examples.py
│ ├── sandboxed_functions_examples.py
│ ├── som_examples.py
│ ├── tracing_examples.py
│ ├── utils.py
│ └── winsandbox_example.py
├── img
│ ├── agent_gradio_ui.png
│ ├── agent.png
│ ├── cli.png
│ ├── computer.png
│ ├── logo_black.png
│ └── logo_white.png
├── libs
│ ├── kasm
│ │ ├── Dockerfile
│ │ ├── LICENSE
│ │ ├── README.md
│ │ └── src
│ │ └── ubuntu
│ │ └── install
│ │ └── firefox
│ │ ├── custom_startup.sh
│ │ ├── firefox.desktop
│ │ └── install_firefox.sh
│ ├── lume
│ │ ├── .cursorignore
│ │ ├── CONTRIBUTING.md
│ │ ├── Development.md
│ │ ├── img
│ │ │ └── cli.png
│ │ ├── Package.resolved
│ │ ├── Package.swift
│ │ ├── README.md
│ │ ├── resources
│ │ │ └── lume.entitlements
│ │ ├── scripts
│ │ │ ├── build
│ │ │ │ ├── build-debug.sh
│ │ │ │ ├── build-release-notarized.sh
│ │ │ │ └── build-release.sh
│ │ │ └── install.sh
│ │ ├── src
│ │ │ ├── Commands
│ │ │ │ ├── Clone.swift
│ │ │ │ ├── Config.swift
│ │ │ │ ├── Create.swift
│ │ │ │ ├── Delete.swift
│ │ │ │ ├── Get.swift
│ │ │ │ ├── Images.swift
│ │ │ │ ├── IPSW.swift
│ │ │ │ ├── List.swift
│ │ │ │ ├── Logs.swift
│ │ │ │ ├── Options
│ │ │ │ │ └── FormatOption.swift
│ │ │ │ ├── Prune.swift
│ │ │ │ ├── Pull.swift
│ │ │ │ ├── Push.swift
│ │ │ │ ├── Run.swift
│ │ │ │ ├── Serve.swift
│ │ │ │ ├── Set.swift
│ │ │ │ └── Stop.swift
│ │ │ ├── ContainerRegistry
│ │ │ │ ├── ImageContainerRegistry.swift
│ │ │ │ ├── ImageList.swift
│ │ │ │ └── ImagesPrinter.swift
│ │ │ ├── Errors
│ │ │ │ └── Errors.swift
│ │ │ ├── FileSystem
│ │ │ │ ├── Home.swift
│ │ │ │ ├── Settings.swift
│ │ │ │ ├── VMConfig.swift
│ │ │ │ ├── VMDirectory.swift
│ │ │ │ └── VMLocation.swift
│ │ │ ├── LumeController.swift
│ │ │ ├── Main.swift
│ │ │ ├── Server
│ │ │ │ ├── Handlers.swift
│ │ │ │ ├── HTTP.swift
│ │ │ │ ├── Requests.swift
│ │ │ │ ├── Responses.swift
│ │ │ │ └── Server.swift
│ │ │ ├── Utils
│ │ │ │ ├── CommandRegistry.swift
│ │ │ │ ├── CommandUtils.swift
│ │ │ │ ├── Logger.swift
│ │ │ │ ├── NetworkUtils.swift
│ │ │ │ ├── Path.swift
│ │ │ │ ├── ProcessRunner.swift
│ │ │ │ ├── ProgressLogger.swift
│ │ │ │ ├── String.swift
│ │ │ │ └── Utils.swift
│ │ │ ├── Virtualization
│ │ │ │ ├── DarwinImageLoader.swift
│ │ │ │ ├── DHCPLeaseParser.swift
│ │ │ │ ├── ImageLoaderFactory.swift
│ │ │ │ └── VMVirtualizationService.swift
│ │ │ ├── VM
│ │ │ │ ├── DarwinVM.swift
│ │ │ │ ├── LinuxVM.swift
│ │ │ │ ├── VM.swift
│ │ │ │ ├── VMDetails.swift
│ │ │ │ ├── VMDetailsPrinter.swift
│ │ │ │ ├── VMDisplayResolution.swift
│ │ │ │ └── VMFactory.swift
│ │ │ └── VNC
│ │ │ ├── PassphraseGenerator.swift
│ │ │ └── VNCService.swift
│ │ └── tests
│ │ ├── Mocks
│ │ │ ├── MockVM.swift
│ │ │ ├── MockVMVirtualizationService.swift
│ │ │ └── MockVNCService.swift
│ │ ├── VM
│ │ │ └── VMDetailsPrinterTests.swift
│ │ ├── VMTests.swift
│ │ ├── VMVirtualizationServiceTests.swift
│ │ └── VNCServiceTests.swift
│ ├── lumier
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── bin
│ │ │ └── entry.sh
│ │ ├── config
│ │ │ └── constants.sh
│ │ ├── hooks
│ │ │ └── on-logon.sh
│ │ └── lib
│ │ ├── utils.sh
│ │ └── vm.sh
│ ├── python
│ │ ├── agent
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── agent
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── adapters
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── azure_ml_adapter.py
│ │ │ │ │ ├── cua_adapter.py
│ │ │ │ │ ├── huggingfacelocal_adapter.py
│ │ │ │ │ ├── human_adapter.py
│ │ │ │ │ ├── mlxvlm_adapter.py
│ │ │ │ │ └── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ └── qwen2_5_vl.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── callbacks
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── budget_manager.py
│ │ │ │ │ ├── image_retention.py
│ │ │ │ │ ├── logging.py
│ │ │ │ │ ├── operator_validator.py
│ │ │ │ │ ├── pii_anonymization.py
│ │ │ │ │ ├── prompt_instructions.py
│ │ │ │ │ ├── telemetry.py
│ │ │ │ │ └── trajectory_saver.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── computers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cua.py
│ │ │ │ │ └── custom.py
│ │ │ │ ├── decorators.py
│ │ │ │ ├── human_tool
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── server.py
│ │ │ │ │ └── ui.py
│ │ │ │ ├── integrations
│ │ │ │ │ └── hud
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── agent.py
│ │ │ │ │ └── proxy.py
│ │ │ │ ├── loops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── anthropic.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── composed_grounded.py
│ │ │ │ │ ├── fara.py
│ │ │ │ │ ├── gelato.py
│ │ │ │ │ ├── gemini.py
│ │ │ │ │ ├── generic_vlm.py
│ │ │ │ │ ├── glm45v.py
│ │ │ │ │ ├── gta1.py
│ │ │ │ │ ├── holo.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── model_types.csv
│ │ │ │ │ ├── moondream3.py
│ │ │ │ │ ├── omniparser.py
│ │ │ │ │ ├── openai.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ ├── uiins.py
│ │ │ │ │ ├── uitars.py
│ │ │ │ │ └── uitars2.py
│ │ │ │ ├── playground
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── server.py
│ │ │ │ ├── proxy
│ │ │ │ │ ├── examples.py
│ │ │ │ │ └── handlers.py
│ │ │ │ ├── responses.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── browser_tool.py
│ │ │ │ ├── types.py
│ │ │ │ └── ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ └── gradio
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ └── ui_components.py
│ │ │ ├── benchmarks
│ │ │ │ ├── .gitignore
│ │ │ │ ├── contrib.md
│ │ │ │ ├── interactive.py
│ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── gta1.py
│ │ │ │ ├── README.md
│ │ │ │ ├── ss-pro.py
│ │ │ │ ├── ss-v2.py
│ │ │ │ └── utils.py
│ │ │ ├── example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer_agent.py
│ │ ├── bench-ui
│ │ │ ├── bench_ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ └── child.py
│ │ │ ├── examples
│ │ │ │ ├── folder_example.py
│ │ │ │ ├── gui
│ │ │ │ │ ├── index.html
│ │ │ │ │ ├── logo.svg
│ │ │ │ │ └── styles.css
│ │ │ │ ├── output_overlay.png
│ │ │ │ └── simple_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ └── test_port_detection.py
│ │ ├── computer
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── computer.py
│ │ │ │ ├── diorama_computer.py
│ │ │ │ ├── helpers.py
│ │ │ │ ├── interface
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── models.py
│ │ │ │ ├── providers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cloud
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── docker
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── lume
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── lume_api.py
│ │ │ │ │ ├── lumier
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── types.py
│ │ │ │ │ └── winsandbox
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── provider.py
│ │ │ │ │ └── setup_script.ps1
│ │ │ │ ├── tracing_wrapper.py
│ │ │ │ ├── tracing.py
│ │ │ │ ├── ui
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ └── gradio
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── app.py
│ │ │ │ └── utils.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ ├── test_computer.py
│ │ │ └── test_helpers.py
│ │ ├── computer-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── browser.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── diorama
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── diorama_computer.py
│ │ │ │ │ ├── diorama.py
│ │ │ │ │ ├── draw.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── safezone.py
│ │ │ │ ├── handlers
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── main.py
│ │ │ │ ├── server.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wallpaper.py
│ │ │ │ └── watchdog.py
│ │ │ ├── examples
│ │ │ │ ├── __init__.py
│ │ │ │ └── usage_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ ├── run_server.py
│ │ │ ├── test_connection.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_server.py
│ │ ├── core
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── core
│ │ │ │ ├── __init__.py
│ │ │ │ └── telemetry
│ │ │ │ ├── __init__.py
│ │ │ │ └── posthog.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_telemetry.py
│ │ ├── mcp-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── build-extension.py
│ │ │ ├── CONCURRENT_SESSIONS.md
│ │ │ ├── desktop-extension
│ │ │ │ ├── cua-extension.mcpb
│ │ │ │ ├── desktop_extension.png
│ │ │ │ ├── manifest.json
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_server.sh
│ │ │ │ └── setup.py
│ │ │ ├── mcp_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── server.py
│ │ │ │ └── session_manager.py
│ │ │ ├── pdm.lock
│ │ │ ├── pyproject.toml
│ │ │ ├── QUICK_TEST_COMMANDS.sh
│ │ │ ├── quick_test_local_option.py
│ │ │ ├── README.md
│ │ │ ├── scripts
│ │ │ │ ├── install_mcp_server.sh
│ │ │ │ └── start_mcp_server.sh
│ │ │ ├── test_mcp_server_local_option.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_mcp_server.py
│ │ ├── pylume
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_pylume.py
│ │ └── som
│ │ ├── .bumpversion.cfg
│ │ ├── LICENSE
│ │ ├── poetry.toml
│ │ ├── pyproject.toml
│ │ ├── README.md
│ │ ├── som
│ │ │ ├── __init__.py
│ │ │ ├── detect.py
│ │ │ ├── detection.py
│ │ │ ├── models.py
│ │ │ ├── ocr.py
│ │ │ ├── util
│ │ │ │ └── utils.py
│ │ │ └── visualization.py
│ │ └── tests
│ │ ├── conftest.py
│ │ └── test_omniparser.py
│ ├── qemu-docker
│ │ ├── linux
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── src
│ │ │ ├── entry.sh
│ │ │ └── vm
│ │ │ ├── image
│ │ │ │ └── README.md
│ │ │ └── setup
│ │ │ ├── install.sh
│ │ │ ├── setup-cua-server.sh
│ │ │ └── setup.sh
│ │ ├── README.md
│ │ └── windows
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── entry.sh
│ │ └── vm
│ │ ├── image
│ │ │ └── README.md
│ │ └── setup
│ │ ├── install.bat
│ │ ├── on-logon.ps1
│ │ ├── setup-cua-server.ps1
│ │ ├── setup-utils.psm1
│ │ └── setup.ps1
│ ├── typescript
│ │ ├── .gitignore
│ │ ├── .nvmrc
│ │ ├── agent
│ │ │ ├── examples
│ │ │ │ ├── playground-example.html
│ │ │ │ └── README.md
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── client.ts
│ │ │ │ ├── index.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ └── client.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── computer
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── computer
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── providers
│ │ │ │ │ │ ├── base.ts
│ │ │ │ │ │ ├── cloud.ts
│ │ │ │ │ │ └── index.ts
│ │ │ │ │ └── types.ts
│ │ │ │ ├── index.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── base.ts
│ │ │ │ │ ├── factory.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── linux.ts
│ │ │ │ │ ├── macos.ts
│ │ │ │ │ └── windows.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ ├── computer
│ │ │ │ │ └── cloud.test.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── factory.test.ts
│ │ │ │ │ ├── index.test.ts
│ │ │ │ │ ├── linux.test.ts
│ │ │ │ │ ├── macos.test.ts
│ │ │ │ │ └── windows.test.ts
│ │ │ │ └── setup.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── core
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── index.ts
│ │ │ │ └── telemetry
│ │ │ │ ├── clients
│ │ │ │ │ ├── index.ts
│ │ │ │ │ └── posthog.ts
│ │ │ │ └── index.ts
│ │ │ ├── tests
│ │ │ │ └── telemetry.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── cua-cli
│ │ │ ├── .gitignore
│ │ │ ├── .prettierrc
│ │ │ ├── bun.lock
│ │ │ ├── CLAUDE.md
│ │ │ ├── index.ts
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── auth.ts
│ │ │ │ ├── cli.ts
│ │ │ │ ├── commands
│ │ │ │ │ ├── auth.ts
│ │ │ │ │ └── sandbox.ts
│ │ │ │ ├── config.ts
│ │ │ │ ├── http.ts
│ │ │ │ ├── storage.ts
│ │ │ │ └── util.ts
│ │ │ └── tsconfig.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── pnpm-workspace.yaml
│ │ └── README.md
│ └── xfce
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Development.md
│ ├── Dockerfile
│ ├── Dockerfile.dev
│ ├── README.md
│ └── src
│ ├── scripts
│ │ ├── resize-display.sh
│ │ ├── start-computer-server.sh
│ │ ├── start-novnc.sh
│ │ ├── start-vnc.sh
│ │ └── xstartup.sh
│ ├── supervisor
│ │ └── supervisord.conf
│ └── xfce-config
│ ├── helpers.rc
│ ├── xfce4-power-manager.xml
│ └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│ ├── agent_nb.ipynb
│ ├── blog
│ │ ├── build-your-own-operator-on-macos-1.ipynb
│ │ └── build-your-own-operator-on-macos-2.ipynb
│ ├── composite_agents_docker_nb.ipynb
│ ├── computer_nb.ipynb
│ ├── computer_server_nb.ipynb
│ ├── customizing_computeragent.ipynb
│ ├── eval_osworld.ipynb
│ ├── ollama_nb.ipynb
│ ├── README.md
│ ├── sota_hackathon_cloud.ipynb
│ └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── install-cli.ps1
│ ├── install-cli.sh
│ ├── playground-docker.sh
│ ├── playground.sh
│ ├── run-docker-dev.sh
│ └── typescript-typecheck.js
├── TESTING.md
├── tests
│ ├── agent_loop_testing
│ │ ├── agent_test.py
│ │ └── README.md
│ ├── pytest.ini
│ ├── shell_cmd.py
│ ├── test_files.py
│ ├── test_mcp_server_session_management.py
│ ├── test_mcp_server_streaming.py
│ ├── test_shell_bash.py
│ ├── test_telemetry.py
│ ├── test_tracing.py
│ ├── test_venv.py
│ └── test_watchdog.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/libs/python/agent/agent/human_tool/ui.py:
--------------------------------------------------------------------------------
```python
1 | import base64
2 | import io
3 | import json
4 | import time
5 | from datetime import datetime
6 | from typing import Any, Dict, List, Optional
7 |
8 | import gradio as gr
9 | import requests
10 | from PIL import Image
11 |
12 | from .server import completion_queue
13 |
14 |
15 | class HumanCompletionUI:
16 | def __init__(self, server_url: str = "http://localhost:8002"):
17 | self.server_url = server_url
18 | self.current_call_id: Optional[str] = None
19 | self.refresh_interval = 2.0 # seconds
20 | self.last_image = None # Store the last image for display
21 | # Track current interactive action controls
22 | self.current_action_type: str = "click"
23 | self.current_button: str = "left"
24 | self.current_scroll_x: int = 0
25 | self.current_scroll_y: int = -120
26 |
    def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Format messages for display in gr.Chatbot with type='messages'.

        Converts OpenAI-style chat messages into gr.Chatbot message dicts.
        Roles are deliberately INVERTED for this human-in-the-loop UI: what
        the model sent ("user") renders as "assistant", and everything else
        renders as "user" (net mapping: user->assistant, assistant->user,
        system->user, any other role->assistant).

        Args:
            messages: Chat messages; each may carry "role", "content"
                (a string or a list of text/image_url parts), and
                "tool_calls" (OpenAI function-call format).

        Returns:
            A list of {"role", "content"[, "metadata"]} dicts; each tool
            call becomes its own message with a "🛠️ Used <name>" title.
        """
        formatted = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            tool_calls = msg.get("tool_calls", [])

            # Handle different content formats
            if isinstance(content, list):
                # Multi-modal content - can include text and images
                formatted_content = []
                for item in content:
                    if item.get("type") == "text":
                        text = item.get("text", "")
                        if text.strip():  # Only add non-empty text
                            formatted_content.append(text)
                    elif item.get("type") == "image_url":
                        image_url = item.get("image_url", {}).get("url", "")
                        if image_url:
                            # Check if it's a base64 image or URL
                            if image_url.startswith("data:image"):
                                # For base64 images, decode and create gr.Image
                                try:
                                    header, data = image_url.split(",", 1)
                                    image_data = base64.b64decode(data)
                                    image = Image.open(io.BytesIO(image_data))
                                    formatted_content.append(gr.Image(value=image))
                                except Exception as e:
                                    # Corrupt data URL: show the error inline instead of dropping it.
                                    print(f"Error loading image: {e}")
                                    formatted_content.append(f"[Image loading error: {e}]")
                            else:
                                # For URL images, create gr.Image with URL
                                formatted_content.append(gr.Image(value=image_url))

                # Collapse to a single item when possible; a bare list is kept
                # only when there are multiple parts.
                if len(formatted_content) == 1:
                    content = formatted_content[0]
                elif len(formatted_content) > 1:
                    content = formatted_content
                else:
                    content = "[Empty content]"

            # Normalize unknown roles first so Gradio only ever sees
            # "user"/"assistant" ("system" -> assistant, anything else -> user).
            if role not in ["user", "assistant"]:
                role = "assistant" if role == "system" else "user"

            # Invert roles for better display in human UI context
            # (what the AI says becomes "user", what human should respond becomes "assistant")
            if role == "user":
                role = "assistant"
            else:
                role = "user"

            # Add the main message if it has content
            if content and str(content).strip():
                formatted.append({"role": role, "content": content})

            # Handle tool calls - create separate messages for each tool call,
            # carrying the (already inverted) role of the parent message.
            if tool_calls:
                for tool_call in tool_calls:
                    function_name = tool_call.get("function", {}).get("name", "unknown")
                    arguments_str = tool_call.get("function", {}).get("arguments", "{}")

                    try:
                        # Parse arguments to format them nicely
                        arguments = json.loads(arguments_str)
                        formatted_args = json.dumps(arguments, indent=2)
                    except json.JSONDecodeError:
                        # If parsing fails, use the raw string
                        formatted_args = arguments_str

                    # Render the call arguments as a fenced JSON code block.
                    tool_call_content = f"```json\n{formatted_args}\n```"

                    formatted.append(
                        {
                            "role": role,
                            "content": tool_call_content,
                            "metadata": {"title": f"🛠️ Used {function_name}"},
                        }
                    )

        return formatted
111 |
112 | def get_pending_calls(self) -> List[Dict[str, Any]]:
113 | """Get pending calls from the server."""
114 | try:
115 | response = requests.get(f"{self.server_url}/pending", timeout=5)
116 | if response.status_code == 200:
117 | return response.json().get("pending_calls", [])
118 | except Exception as e:
119 | print(f"Error fetching pending calls: {e}")
120 | return []
121 |
122 | def complete_call_with_response(self, call_id: str, response: str) -> bool:
123 | """Complete a call with a text response."""
124 | try:
125 | response_data = {"response": response}
126 | response_obj = requests.post(
127 | f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
128 | )
129 | response_obj.raise_for_status()
130 | return True
131 | except requests.RequestException as e:
132 | print(f"Error completing call: {e}")
133 | return False
134 |
135 | def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
136 | """Complete a call with tool calls."""
137 | try:
138 | response_data = {"tool_calls": tool_calls}
139 | response_obj = requests.post(
140 | f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
141 | )
142 | response_obj.raise_for_status()
143 | return True
144 | except requests.RequestException as e:
145 | print(f"Error completing call: {e}")
146 | return False
147 |
148 | def complete_call(
149 | self,
150 | call_id: str,
151 | response: Optional[str] = None,
152 | tool_calls: Optional[List[Dict[str, Any]]] = None,
153 | ) -> bool:
154 | """Complete a call with either a response or tool calls."""
155 | try:
156 | response_data = {}
157 | if response:
158 | response_data["response"] = response
159 | if tool_calls:
160 | response_data["tool_calls"] = tool_calls
161 |
162 | response_obj = requests.post(
163 | f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
164 | )
165 | response_obj.raise_for_status()
166 | return True
167 | except requests.RequestException as e:
168 | print(f"Error completing call: {e}")
169 | return False
170 |
171 | def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
172 | """Extract the last image from the messages for display above conversation."""
173 | last_image = None
174 |
175 | for msg in reversed(messages): # Start from the last message
176 | content = msg.get("content", "")
177 |
178 | if isinstance(content, list):
179 | for item in reversed(content): # Get the last image in the message
180 | if item.get("type") == "image_url":
181 | image_url = item.get("image_url", {}).get("url", "")
182 | if image_url:
183 | if image_url.startswith("data:image"):
184 | # For base64 images, create a gr.Image component
185 | try:
186 | header, data = image_url.split(",", 1)
187 | image_data = base64.b64decode(data)
188 | image = Image.open(io.BytesIO(image_data))
189 | return image
190 | except Exception as e:
191 | print(f"Error loading image: {e}")
192 | continue
193 | else:
194 | # For URL images, return the URL
195 | return image_url
196 |
197 | return last_image
198 |
199 | def refresh_pending_calls(self):
200 | """Refresh the list of pending calls."""
201 | pending_calls = self.get_pending_calls()
202 |
203 | if not pending_calls:
204 | return (
205 | gr.update(choices=["latest"], value="latest"), # dropdown
206 | gr.update(value=None), # image (no image)
207 | gr.update(value=[]), # chatbot (empty messages)
208 | gr.update(interactive=False), # submit button
209 | gr.update(visible=False), # click_actions_group hidden
210 | gr.update(visible=False), # actions_group hidden
211 | )
212 |
213 | # Sort pending calls by created_at to get oldest first
214 | sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
215 |
216 | # Create choices for dropdown
217 | choices = [("latest", "latest")] # Add "latest" option first
218 |
219 | for call in sorted_calls:
220 | call_id = call["id"]
221 | model = call.get("model", "unknown")
222 | created_at = call.get("created_at", "")
223 | # Format timestamp
224 | try:
225 | dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
226 | time_str = dt.strftime("%H:%M:%S")
227 | except:
228 | time_str = created_at
229 |
230 | choice_label = f"{call_id[:8]}... ({model}) - {time_str}"
231 | choices.append((choice_label, call_id))
232 |
233 | # Default to "latest" which shows the oldest pending conversation
234 | selected_call_id = "latest"
235 | if selected_call_id == "latest" and sorted_calls:
236 | # Use the oldest call (first in sorted list)
237 | selected_call = sorted_calls[0]
238 | conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
239 | self.current_call_id = selected_call["id"]
240 | # Get the last image from messages
241 | self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
242 | else:
243 | conversation = []
244 | self.current_call_id = None
245 | self.last_image = None
246 |
247 | return (
248 | gr.update(choices=choices, value="latest"),
249 | gr.update(value=self.last_image),
250 | gr.update(value=conversation),
251 | gr.update(interactive=bool(choices)),
252 | gr.update(visible=True), # click_actions_group visible when there is a call
253 | gr.update(visible=True), # actions_group visible when there is a call
254 | )
255 |
256 | def on_call_selected(self, selected_choice):
257 | """Handle when a call is selected from the dropdown."""
258 | if not selected_choice:
259 | return (
260 | gr.update(value=None), # no image
261 | gr.update(value=[]), # empty chatbot
262 | gr.update(interactive=False),
263 | gr.update(visible=False), # click_actions_group hidden
264 | gr.update(visible=False), # actions_group hidden
265 | )
266 |
267 | pending_calls = self.get_pending_calls()
268 | if not pending_calls:
269 | return (
270 | gr.update(value=None), # no image
271 | gr.update(value=[]), # empty chatbot
272 | gr.update(interactive=False),
273 | gr.update(visible=False), # click_actions_group hidden
274 | gr.update(visible=False), # actions_group hidden
275 | )
276 |
277 | # Handle "latest" option
278 | if selected_choice == "latest":
279 | # Sort calls by created_at to get oldest first
280 | sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
281 | selected_call = sorted_calls[0] # Get the oldest call
282 | call_id = selected_call["id"]
283 | else:
284 | # Extract call_id from the choice for specific calls
285 | call_id = None
286 | for call in pending_calls:
287 | call_id_short = call["id"][:8]
288 | if call_id_short in selected_choice:
289 | call_id = call["id"]
290 | break
291 |
292 | if not call_id:
293 | return (
294 | gr.update(value=None), # no image
295 | gr.update(value=[]), # empty chatbot
296 | gr.update(interactive=False),
297 | )
298 |
299 | # Find the selected call
300 | selected_call = next((c for c in pending_calls if c["id"] == call_id), None)
301 |
302 | if not selected_call:
303 | return (
304 | gr.update(value=None), # no image
305 | gr.update(value=[]), # empty chatbot
306 | gr.update(interactive=False),
307 | gr.update(visible=False), # click_actions_group hidden
308 | gr.update(visible=False), # actions_group hidden
309 | )
310 |
311 | conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
312 | self.current_call_id = call_id
313 | # Get the last image from messages
314 | self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
315 |
316 | return (
317 | gr.update(value=self.last_image),
318 | gr.update(value=conversation),
319 | gr.update(interactive=True),
320 | gr.update(visible=True), # click_actions_group visible
321 | gr.update(visible=True), # actions_group visible
322 | )
323 |
324 | def submit_response(self, response_text: str):
325 | """Submit a text response to the current call."""
326 | if not self.current_call_id:
327 | return (
328 | gr.update(value=response_text), # keep response text
329 | gr.update(value="❌ No call selected"), # status
330 | )
331 |
332 | if not response_text.strip():
333 | return (
334 | gr.update(value=response_text), # keep response text
335 | gr.update(value="❌ Response cannot be empty"), # status
336 | )
337 |
338 | success = self.complete_call_with_response(self.current_call_id, response_text)
339 |
340 | if success:
341 | status_msg = "✅ Response submitted successfully!"
342 | return (
343 | gr.update(value=""), # clear response text
344 | gr.update(value=status_msg), # status
345 | )
346 | else:
347 | return (
348 | gr.update(value=response_text), # keep response text
349 | gr.update(value="❌ Failed to submit response"), # status
350 | )
351 |
352 | def submit_action(self, action_type: str, **kwargs) -> str:
353 | """Submit a computer action as a tool call."""
354 | if not self.current_call_id:
355 | return "❌ No call selected"
356 |
357 | import uuid
358 |
359 | # Create tool call structure
360 | action_data = {"type": action_type, **kwargs}
361 | tool_call = {
362 | "id": f"call_{uuid.uuid4().hex[:24]}",
363 | "type": "function",
364 | "function": {"name": "computer", "arguments": json.dumps(action_data)},
365 | }
366 |
367 | success = self.complete_call_with_tool_calls(self.current_call_id, [tool_call])
368 |
369 | if success:
370 | return f"✅ {action_type.capitalize()} action submitted as tool call"
371 | else:
372 | return f"❌ Failed to submit {action_type} action"
373 |
374 | def submit_click_action(
375 | self, x: int, y: int, action_type: str = "click", button: str = "left"
376 | ) -> str:
377 | """Submit a coordinate-based action."""
378 | if action_type == "click":
379 | return self.submit_action(action_type, x=x, y=y, button=button)
380 | else:
381 | return self.submit_action(action_type, x=x, y=y)
382 |
383 | def submit_type_action(self, text: str) -> str:
384 | """Submit a type action."""
385 | return self.submit_action("type", text=text)
386 |
387 | def submit_hotkey_action(self, keys: str) -> str:
388 | """Submit a hotkey action."""
389 | return self.submit_action("keypress", keys=keys)
390 |
391 | def submit_wait_action(self) -> str:
392 | """Submit a wait action with no kwargs."""
393 | return self.submit_action("wait")
394 |
395 | def submit_description_click(
396 | self, description: str, action_type: str = "click", button: str = "left"
397 | ) -> str:
398 | """Submit a description-based action."""
399 | if action_type == "click":
400 | return self.submit_action(action_type, element_description=description, button=button)
401 | else:
402 | return self.submit_action(action_type, element_description=description)
403 |
404 | def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
405 | """Wait for pending calls to appear or until max_seconds elapsed.
406 |
407 | This method loops and checks for pending calls at regular intervals,
408 | returning as soon as a pending call is found or the maximum wait time is reached.
409 |
410 | Args:
411 | max_seconds: Maximum number of seconds to wait
412 | check_interval: How often to check for pending calls (in seconds)
413 | """
414 | import time
415 |
416 | start_time = time.time()
417 |
418 | while time.time() - start_time < max_seconds:
419 | # Check if there are any pending calls
420 | pending_calls = self.get_pending_calls()
421 | if pending_calls:
422 | # Found pending calls, return immediately
423 | return self.refresh_pending_calls()
424 |
425 | # Wait before checking again
426 | time.sleep(check_interval)
427 |
428 | # Max wait time reached, return current state
429 | return self.refresh_pending_calls()
430 |
431 |
def create_ui():
    """Create the Gradio interface.

    Builds a two-column layout: the left column shows the latest screenshot
    (clickable, to send coordinate actions) and the conversation; the right
    column holds the pending-call selector, a free-text response box, and
    tabbed action forms. All event handlers delegate to a shared
    HumanCompletionUI instance. Returns the gr.Blocks app.
    """
    ui_handler = HumanCompletionUI()

    with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
        gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
        gr.Markdown("Review AI conversation requests and provide human responses.")

        with gr.Row():
            with gr.Column(scale=2):
                with gr.Group():
                    screenshot_image = gr.Image(
                        label="Interactive Screenshot", interactive=False, height=600
                    )

                    # Action type selection for image clicks (wrapped for visibility control)
                    with gr.Group(visible=False) as click_actions_group:
                        with gr.Row():
                            action_type_radio = gr.Dropdown(
                                label="Interactive Action",
                                choices=[
                                    "click",
                                    "double_click",
                                    "move",
                                    "left_mouse_up",
                                    "left_mouse_down",
                                    "scroll",
                                ],
                                value="click",
                                scale=2,
                            )
                            action_button_radio = gr.Dropdown(
                                label="Button",
                                choices=["left", "right", "wheel", "back", "forward"],
                                value="left",
                                visible=True,
                                scale=1,
                            )
                            # Scroll deltas only apply when "scroll" is chosen;
                            # toggle_action_controls flips their visibility.
                            scroll_x_input = gr.Number(
                                label="scroll_x", value=0, visible=False, scale=1
                            )
                            scroll_y_input = gr.Number(
                                label="scroll_y", value=-120, visible=False, scale=1
                            )

                conversation_chatbot = gr.Chatbot(
                    label="Conversation", type="messages", height=500, show_copy_button=True
                )

            with gr.Column(scale=1):
                with gr.Group():
                    call_dropdown = gr.Dropdown(
                        label="Select a pending conversation request",
                        choices=["latest"],
                        interactive=True,
                        value="latest",
                    )
                    refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
                    status_display = gr.Textbox(
                        label="Status", interactive=False, value="Ready to receive requests..."
                    )

                with gr.Group():
                    response_text = gr.Textbox(
                        label="Message", lines=3, placeholder="Enter your message here..."
                    )
                    submit_btn = gr.Button(
                        "📤 Submit Message", variant="primary", interactive=False
                    )

                # Action Accordions (wrapped for visibility control)
                with gr.Group(visible=False) as actions_group:
                    with gr.Tabs():
                        with gr.Tab("🖱️ Click Actions"):
                            with gr.Group():
                                description_text = gr.Textbox(
                                    label="Element Description",
                                    placeholder="e.g., 'Privacy and security option in left sidebar'",
                                )
                                with gr.Row():
                                    description_action_type = gr.Dropdown(
                                        label="Action",
                                        choices=[
                                            "click",
                                            "double_click",
                                            "move",
                                            "left_mouse_up",
                                            "left_mouse_down",
                                        ],
                                        value="click",
                                    )
                                    description_button = gr.Dropdown(
                                        label="Button",
                                        choices=["left", "right", "wheel", "back", "forward"],
                                        value="left",
                                    )
                                description_submit_btn = gr.Button("Submit Click Action")

                        with gr.Tab("📝 Type Action"):
                            with gr.Group():
                                type_text = gr.Textbox(
                                    label="Text to Type", placeholder="Enter text to type..."
                                )
                                type_submit_btn = gr.Button("Submit Type")

                        with gr.Tab("⌨️ Keypress Action"):
                            with gr.Group():
                                keypress_text = gr.Textbox(
                                    label="Keys", placeholder="e.g., ctrl+c, alt+tab"
                                )
                                keypress_submit_btn = gr.Button("Submit Keypress")

                        with gr.Tab("🧰 Misc Actions"):
                            with gr.Group():
                                misc_action_dropdown = gr.Dropdown(
                                    label="Action", choices=["wait"], value="wait"
                                )
                                misc_submit_btn = gr.Button("Submit Action")

        # Event handlers
        # NOTE: refresh_pending_calls returns 6 updates (dropdown + 5 UI
        # elements); on_call_selected returns 5 (no dropdown). The outputs
        # lists below must stay in sync with those return tuples.
        refresh_btn.click(
            fn=ui_handler.refresh_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        call_dropdown.change(
            fn=ui_handler.on_call_selected,
            inputs=[call_dropdown],
            outputs=[
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        def handle_image_click(evt: gr.SelectData):
            """Translate a click on the screenshot into a computer action.

            Uses the action type / button / scroll deltas cached on ui_handler
            by the control-change handlers below. Returns a status string.
            """
            if evt.index is not None:
                x, y = evt.index
                action_type = ui_handler.current_action_type or "click"
                button = ui_handler.current_button or "left"
                if action_type == "scroll":
                    sx_i = int(ui_handler.current_scroll_x or 0)
                    sy_i = int(ui_handler.current_scroll_y or 0)
                    # Submit a scroll action with x,y position and scroll deltas
                    result = ui_handler.submit_action(
                        "scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i
                    )
                else:
                    result = ui_handler.submit_click_action(x, y, action_type, button)
                ui_handler.wait_for_pending_calls()
                return result
            return "No coordinates selected"

        screenshot_image.select(fn=handle_image_click, outputs=[status_display]).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        # Response submission
        submit_btn.click(
            fn=ui_handler.submit_response,
            inputs=[response_text],
            outputs=[response_text, status_display],
        ).then(
            fn=ui_handler.refresh_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        # Toggle visibility of controls based on action type
        def toggle_action_controls(action_type):
            """Show/hide the button and scroll inputs for the chosen action."""
            # Button visible only for click
            button_vis = gr.update(visible=(action_type == "click"))
            # Scroll inputs visible only for scroll
            scroll_x_vis = gr.update(visible=(action_type == "scroll"))
            scroll_y_vis = gr.update(visible=(action_type == "scroll"))
            # Update state
            ui_handler.current_action_type = action_type or "click"
            return button_vis, scroll_x_vis, scroll_y_vis

        action_type_radio.change(
            fn=toggle_action_controls,
            inputs=[action_type_radio],
            outputs=[action_button_radio, scroll_x_input, scroll_y_input],
        )

        # Keep other control values in ui_handler state
        def on_button_change(val):
            ui_handler.current_button = val or "left"

        action_button_radio.change(fn=on_button_change, inputs=[action_button_radio])

        def on_scroll_x_change(val):
            # gr.Number may yield None or a non-int; fall back to 0.
            try:
                ui_handler.current_scroll_x = int(val) if val is not None else 0
            except Exception:
                ui_handler.current_scroll_x = 0

        scroll_x_input.change(fn=on_scroll_x_change, inputs=[scroll_x_input])

        def on_scroll_y_change(val):
            # Same defensive coercion as on_scroll_x_change.
            try:
                ui_handler.current_scroll_y = int(val) if val is not None else 0
            except Exception:
                ui_handler.current_scroll_y = 0

        scroll_y_input.change(fn=on_scroll_y_change, inputs=[scroll_y_input])

        type_submit_btn.click(
            fn=ui_handler.submit_type_action, inputs=[type_text], outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        keypress_submit_btn.click(
            fn=ui_handler.submit_hotkey_action, inputs=[keypress_text], outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        def handle_description_submit(description, action_type, button):
            """Submit a description-targeted action; reject empty descriptions."""
            if description:
                result = ui_handler.submit_description_click(description, action_type, button)
                ui_handler.wait_for_pending_calls()
                return result
            return "Please enter a description"

        description_submit_btn.click(
            fn=handle_description_submit,
            inputs=[description_text, description_action_type, description_button],
            outputs=[status_display],
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        # Misc action handler
        def handle_misc_submit(selected_action):
            """Dispatch the misc-actions tab; currently only "wait" exists."""
            if selected_action == "wait":
                result = ui_handler.submit_wait_action()
                ui_handler.wait_for_pending_calls()
                return result
            return f"Unsupported misc action: {selected_action}"

        misc_submit_btn.click(
            fn=handle_misc_submit, inputs=[misc_action_dropdown], outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

        # Load initial data
        demo.load(
            fn=ui_handler.refresh_pending_calls,
            outputs=[
                call_dropdown,
                screenshot_image,
                conversation_chatbot,
                submit_btn,
                click_actions_group,
                actions_group,
            ],
        )

    return demo
749 |
750 |
if __name__ == "__main__":
    # Build the UI, enable request queuing (so long-running handlers don't
    # block concurrent users), and serve on all interfaces at port 7860.
    demo = create_ui()
    demo.queue()
    demo.launch(server_name="0.0.0.0", server_port=7860)
755 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/uitars.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | UITARS agent loop implementation using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B
3 | Paper: https://arxiv.org/abs/2501.12326
4 | Code: https://github.com/bytedance/UI-TARS
5 | """
6 |
7 | import ast
8 | import asyncio
9 | import base64
10 | import json
11 | import math
12 | import re
13 | from ctypes import cast
14 | from io import BytesIO
15 | from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
16 |
17 | import litellm
18 | from litellm.responses.litellm_completion_transformation.transformation import (
19 | LiteLLMCompletionResponsesConfig,
20 | )
21 | from litellm.responses.utils import Usage
22 | from litellm.types.utils import ModelResponse
23 | from openai.types.responses.response_computer_tool_call_param import (
24 | ActionType,
25 | ResponseComputerToolCallParam,
26 | )
27 | from openai.types.responses.response_input_param import ComputerCallOutput
28 | from openai.types.responses.response_output_message_param import (
29 | ResponseOutputMessageParam,
30 | )
31 | from openai.types.responses.response_reasoning_item_param import (
32 | ResponseReasoningItemParam,
33 | Summary,
34 | )
35 | from PIL import Image
36 |
37 | from ..decorators import register_agent
38 | from ..responses import (
39 | make_click_item,
40 | make_double_click_item,
41 | make_drag_item,
42 | make_input_image_item,
43 | make_keypress_item,
44 | make_output_text_item,
45 | make_reasoning_item,
46 | make_scroll_item,
47 | make_type_item,
48 | make_wait_item,
49 | )
50 | from ..types import AgentCapability, AgentResponse, Messages, Tools
51 |
# Constants from reference code
# Image dimensions fed to the model must be divisible by IMAGE_FACTOR, with a
# total pixel count in [MIN_PIXELS, MAX_PIXELS] (see smart_resize below).
IMAGE_FACTOR = 28
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
# Maximum allowed long-side/short-side aspect ratio before smart_resize raises.
MAX_RATIO = 200

# Sentinel action words the model may emit (names taken from the UI-TARS
# reference implementation).
FINISH_WORD = "finished"
WAIT_WORD = "wait"
ENV_FAIL_WORD = "error_env"
CALL_USER = "call_user"

# Action space prompt for UITARS — the DSL of callable actions the model is
# instructed to emit; parsed back by parse_uitars_response/parse_action.
UITARS_ACTION_SPACE = """
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
"""

# Main system prompt: expects a "Thought: ... / Action: ..." formatted reply.
UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```
Thought: ...
Action: ...
```

## Action Space
{action_space}

## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
"""

# Grounding-only prompt: restricts the action space to a single click so the
# model acts as a pure element locator (no Thought section requested).
GROUNDING_UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format

Action: ...


## Action Space
click(point='<|box_start|>(x1,y1)<|box_end|>')

## User Instruction
{instruction}"""
107 |
108 |
def round_by_factor(number: float, factor: int) -> int:
    """Return the multiple of ``factor`` nearest to ``number``."""
    quotient = round(number / factor)
    return quotient * factor
112 |
113 |
def ceil_by_factor(number: float, factor: int) -> int:
    """Return the smallest multiple of ``factor`` that is >= ``number``."""
    return factor * math.ceil(number / factor)
117 |
118 |
def floor_by_factor(number: float, factor: int) -> int:
    """Return the largest multiple of ``factor`` that is <= ``number``."""
    return factor * math.floor(number / factor)
122 |
123 |
def smart_resize(
    height: int,
    width: int,
    factor: int = IMAGE_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
) -> tuple[int, int]:
    """
    Rescales the image so that the following conditions are met:
    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.

    Raises ValueError when the aspect ratio exceeds MAX_RATIO.
    """
    if max(height, width) / min(height, width) > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
        )

    # Start from the nearest factor-aligned dimensions (at least one factor).
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))

    area = h_bar * w_bar
    if area > max_pixels:
        # Too large: shrink uniformly, rounding down to stay within budget.
        shrink = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / shrink, factor)
        w_bar = floor_by_factor(width / shrink, factor)
    elif area < min_pixels:
        # Too small: grow uniformly, rounding up to reach the minimum.
        grow = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * grow, factor)
        w_bar = ceil_by_factor(width * grow, factor)

    return h_bar, w_bar
152 |
153 |
def escape_single_quotes(text):
    """Backslash-escape single quotes that are not already escaped."""
    # Negative lookbehind: leave quotes preceded by a backslash untouched.
    return re.sub(r"(?<!\\)'", r"\\'", text)
158 |
159 |
def parse_action(action_str):
    """Parse an action string like ``click(start_box='(1,2)')`` into a dict.

    Returns ``{"function": name, "args": {kwarg: value}}`` on success, or
    ``None`` (after printing a diagnostic) when the string is not a single
    function-call expression.
    """
    try:
        node = ast.parse(action_str, mode="eval")
        if not isinstance(node, ast.Expression):
            raise ValueError("Not an expression")

        call = node.body
        if not isinstance(call, ast.Call):
            raise ValueError("Not a function call")

        # Get function name
        if isinstance(call.func, ast.Name):
            func_name = call.func.id
        elif isinstance(call.func, ast.Attribute):
            func_name = call.func.attr
        else:
            func_name = None

        # Get keyword arguments. Since Python 3.8 ast.Constant covers every
        # literal node; the legacy ast.Str fallback was removed here because
        # ast.Str no longer exists in Python 3.12+ and referencing it raised
        # AttributeError whenever a non-Constant value reached the elif.
        kwargs = {}
        for kw in call.keywords:
            if isinstance(kw.value, ast.Constant):
                value = kw.value.value
            else:
                # Non-literal argument (e.g. a Name): no static value available.
                value = None
            kwargs[kw.arg] = value

        return {"function": func_name, "args": kwargs}

    except Exception as e:
        print(f"Failed to parse action '{action_str}': {e}")
        return None
196 |
197 |
def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
    """Parse UITARS model response into structured actions.

    Expects the "Thought: ... Action: ..." format produced by
    UITARS_PROMPT_TEMPLATE. Returns a single-element list containing a dict
    with keys: thought (str | None), action_type (str), action_inputs (dict of
    stringified parameters), and text (the raw response).

    NOTE(review): image_width/image_height are accepted but not used here —
    coordinates are normalized to the 0-1 range and scaled to pixels later in
    convert_to_computer_actions.

    Raises ValueError when no "Action:" section is present or the action
    string cannot be parsed.
    """
    text = text.strip()

    # Extract thought
    thought = None
    if text.startswith("Thought:"):
        thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
        if thought_match:
            thought = thought_match.group(1).strip()

    # Extract action
    if "Action:" not in text:
        raise ValueError("No Action found in response")

    # Everything after the last "Action:" marker is the action expression.
    action_str = text.split("Action:")[-1].strip()

    # Handle special case for type actions: unwrap the content, re-escape any
    # unescaped single quotes, then re-wrap, so the string parses as a valid
    # Python call even when the typed text itself contains quotes.
    if "type(content" in action_str:

        def escape_quotes(match):
            # re.sub replacement: substitute the whole match with just the
            # captured content (strips the type(content='...') wrapper).
            return match.group(1)

        pattern = r"type\(content='(.*?)'\)"
        content = re.sub(pattern, escape_quotes, action_str)
        action_str = escape_single_quotes(content)
        action_str = "type(content='" + action_str + "')"

    # Parse the action (newlines are escaped so ast.parse sees one expression).
    parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
    if parsed_action is None:
        raise ValueError(f"Action can't parse: {action_str}")

    action_type = parsed_action["function"]
    params = parsed_action["args"]

    # Process parameters
    action_inputs = {}
    for param_name, param in params.items():
        if param == "":
            continue
        param = str(param).lstrip()
        action_inputs[param_name.strip()] = param

        # Handle coordinate parameters
        if "start_box" in param_name or "end_box" in param_name:
            # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
            # First, remove special tokens
            clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
            # Then remove parentheses and split
            numbers = clean_param.replace("(", "").replace(")", "").split(",")

            try:
                # Model coordinates are assumed to be on a 0-1000 scale.
                float_numbers = [
                    float(num.strip()) / 1000 for num in numbers
                ]  # Normalize to 0-1 range

                if len(float_numbers) == 2:
                    # Single point, duplicate for box format
                    float_numbers = [
                        float_numbers[0],
                        float_numbers[1],
                        float_numbers[0],
                        float_numbers[1],
                    ]

                # Stored as a stringified list; consumers re-parse it.
                action_inputs[param_name.strip()] = str(float_numbers)
            except ValueError as e:
                # If parsing fails, keep the original parameter value
                print(f"Warning: Could not parse coordinates '{param}': {e}")
                action_inputs[param_name.strip()] = param

    return [
        {
            "thought": thought,
            "action_type": action_type,
            "action_inputs": action_inputs,
            "text": text,
        }
    ]
278 |
279 |
def convert_to_computer_actions(
    parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int
) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
    """Convert parsed UITARS responses to computer actions.

    Box coordinates in the parsed responses are 0-1 normalized (see
    parse_uitars_response); they are scaled back to pixel space here using
    image_width/image_height.
    """

    def _box_center(box_str: str):
        """Return the pixel-space (x, y) center of a stringified
        [x1, y1, x2, y2] box.

        Uses ast.literal_eval instead of eval(): the string derives from model
        output and must never be executed as arbitrary code.
        """
        coords = ast.literal_eval(box_str)
        x = int((coords[0] + coords[2]) / 2 * image_width)
        y = int((coords[1] + coords[3]) / 2 * image_height)
        return x, y

    computer_actions = []

    for response in parsed_responses:
        action_type = response.get("action_type")
        action_inputs = response.get("action_inputs", {})

        if action_type == "finished":
            # Terminal action: emit the final message and stop converting.
            finished_text = action_inputs.get("content", "Task completed successfully.")
            computer_actions.append(make_output_text_item(finished_text))
            break

        elif action_type == "wait":
            computer_actions.append(make_wait_item())

        elif action_type == "call_user":
            computer_actions.append(
                make_output_text_item("I need assistance from the user to proceed with this task.")
            )

        elif action_type in ["click", "left_single"]:
            start_box = action_inputs.get("start_box")
            if start_box:
                x, y = _box_center(start_box)
                computer_actions.append(make_click_item(x, y, "left"))

        elif action_type == "double_click":
            start_box = action_inputs.get("start_box")
            if start_box:
                x, y = _box_center(start_box)
                computer_actions.append(make_double_click_item(x, y))

        elif action_type == "right_click":
            start_box = action_inputs.get("start_box")
            if start_box:
                x, y = _box_center(start_box)
                computer_actions.append(make_click_item(x, y, "right"))

        elif action_type == "type":
            content = action_inputs.get("content", "")
            computer_actions.append(make_type_item(content))

        elif action_type == "hotkey":
            # UITARS emits space-separated combos, e.g. "ctrl c".
            key = action_inputs.get("key", "")
            keys = key.split()
            computer_actions.append(make_keypress_item(keys))

        elif action_type == "press":
            key = action_inputs.get("key", "")
            computer_actions.append(make_keypress_item([key]))

        elif action_type == "scroll":
            start_box = action_inputs.get("start_box")
            direction = action_inputs.get("direction", "down")

            if start_box:
                x, y = _box_center(start_box)
            else:
                # No anchor provided: scroll from the screen center.
                x, y = image_width // 2, image_height // 2

            scroll_y = 5 if "up" in direction.lower() else -5
            computer_actions.append(make_scroll_item(x, y, 0, scroll_y))

        elif action_type == "drag":
            start_box = action_inputs.get("start_box")
            end_box = action_inputs.get("end_box")

            if start_box and end_box:
                start_x, start_y = _box_center(start_box)
                end_x, end_y = _box_center(end_box)
                path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                computer_actions.append(make_drag_item(path))

    return computer_actions
374 |
375 |
def pil_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG string (no data-URL prefix)."""
    buf = BytesIO()
    image.save(buf, format="PNG")
    encoded = base64.b64encode(buf.getvalue())
    return encoded.decode("utf-8")
381 |
382 |
def process_image_for_uitars(
    image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS
) -> tuple[Image.Image, int, int]:
    """Decode and resize a (possibly data-URL) base64 image for UITARS.

    Returns (processed RGB image, original width, original height).
    """
    # Strip a data-URL prefix if present before base64-decoding.
    if image_data.startswith("data:image"):
        image_data = image_data.split(",")[1]

    image = Image.open(BytesIO(base64.b64decode(image_data)))
    original_width, original_height = image.size

    # Shrink uniformly if above the pixel budget (aspect ratio preserved).
    if image.width * image.height > max_pixels:
        scale = math.sqrt(max_pixels / (image.width * image.height))
        image = image.resize((int(image.width * scale), int(image.height * scale)))

    # Grow uniformly if below the minimum pixel budget.
    if image.width * image.height < min_pixels:
        scale = math.sqrt(min_pixels / (image.width * image.height))
        image = image.resize(
            (math.ceil(image.width * scale), math.ceil(image.height * scale))
        )

    # The model expects RGB input.
    if image.mode != "RGB":
        image = image.convert("RGB")

    return image, original_width, original_height
413 |
414 |
def sanitize_message(msg: Any) -> Any:
    """Return a copy of the message with ``image_url`` omitted from content parts.

    Lists are sanitized element-wise; non-dict, non-list values pass through
    unchanged. The input is never mutated.
    """
    if isinstance(msg, list):
        return [sanitize_message(item) for item in msg]
    if not isinstance(msg, dict):
        return msg

    sanitized = {}
    for key, value in msg.items():
        if key == "content" and isinstance(value, list):
            parts = []
            for part in value:
                if isinstance(part, dict):
                    # Drop only the image_url field; keep everything else.
                    parts.append({k: v for k, v in part.items() if k != "image_url"})
                else:
                    parts.append(part)
            sanitized[key] = parts
        else:
            sanitized[key] = value
    return sanitized
436 |
437 |
def convert_uitars_messages_to_litellm(messages: "Messages") -> List[Dict[str, Any]]:
    """
    Convert UITARS internal message format back to LiteLLM format.

    This function processes reasoning, computer_call, and computer_call_output
    messages and converts them to the appropriate LiteLLM assistant message
    format. Reasoning text and action strings accumulate until a
    computer_call_output (screenshot) is reached, at which point they are
    flushed as one assistant message followed by a user image message.

    Args:
        messages: List of UITARS internal messages

    Returns:
        List of LiteLLM formatted messages
    """
    litellm_messages: List[Dict[str, Any]] = []
    current_assistant_content: List[str] = []

    def _flush_assistant() -> None:
        """Emit buffered Thought/Action lines as one assistant text message."""
        if current_assistant_content:
            litellm_messages.append(
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": "\n".join(current_assistant_content)}
                    ],
                }
            )
            current_assistant_content.clear()

    for message in messages:
        if isinstance(message, dict):
            message_type = message.get("type")

            if message_type == "reasoning":
                # Extract reasoning text from summary
                summary = message.get("summary", [])
                if summary and isinstance(summary, list):
                    for summary_item in summary:
                        if (
                            isinstance(summary_item, dict)
                            and summary_item.get("type") == "summary_text"
                        ):
                            reasoning_text = summary_item.get("text", "")
                            if reasoning_text:
                                current_assistant_content.append(f"Thought: {reasoning_text}")

            elif message_type == "computer_call":
                # Convert computer action to UITARS action format
                action = message.get("action", {})
                action_type = action.get("type")

                if action_type == "click":
                    x, y = action.get("x", 0), action.get("y", 0)
                    button = action.get("button", "left")
                    if button == "left":
                        action_text = f"Action: click(start_box='({x},{y})')"
                    elif button == "right":
                        action_text = f"Action: right_single(start_box='({x},{y})')"
                    else:
                        # Other buttons have no UITARS equivalent; fall back to click.
                        action_text = f"Action: click(start_box='({x},{y})')"

                elif action_type == "double_click":
                    x, y = action.get("x", 0), action.get("y", 0)
                    action_text = f"Action: left_double(start_box='({x},{y})')"

                elif action_type == "drag":
                    start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
                    end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
                    action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"

                elif action_type == "key":
                    key = action.get("key", "")
                    action_text = f"Action: hotkey(key='{key}')"

                elif action_type == "type":
                    text = action.get("text", "")
                    # Escape single quotes so the action string stays parseable.
                    escaped_text = escape_single_quotes(text)
                    action_text = f"Action: type(content='{escaped_text}')"

                elif action_type == "scroll":
                    x, y = action.get("x", 0), action.get("y", 0)
                    direction = action.get("direction", "down")
                    action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"

                elif action_type == "wait":
                    action_text = "Action: wait()"

                else:
                    # Fallback for unknown action types
                    action_text = f"Action: {action_type}({action})"

                current_assistant_content.append(action_text)

                # A computer_call closes the current assistant turn.
                _flush_assistant()

            elif message_type == "computer_call_output":
                # Add screenshot from computer call output
                output = message.get("output", {})
                if isinstance(output, dict) and output.get("type") == "input_image":
                    image_url = output.get("image_url", "")
                    if image_url:
                        litellm_messages.append(
                            {
                                "role": "user",
                                "content": [{"type": "image_url", "image_url": {"url": image_url}}],
                            }
                        )

            elif message.get("role") == "user":
                # User messages are intentionally not forwarded here; the
                # instruction is injected via the prompt template instead.
                pass

    # Flush any trailing assistant content (e.g. reasoning with no following
    # computer_call). BUG FIX: this previously appended the raw list of strings
    # as "content", inconsistent with the [{"type": "text", ...}] shape used by
    # the in-loop flush; the helper guarantees a uniform shape.
    _flush_assistant()

    return litellm_messages
564 |
565 |
@register_agent(models=r"(?i).*ui-?tars.*", priority=-1)
class UITARSConfig:
    """
    UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.

    Supports UITARS vision-language models for computer control.
    """

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Predict the next step based on input messages.

        Args:
            messages: Input messages following Responses format
            model: Model name to use
            tools: Optional list of tool schemas
            max_retries: Maximum number of retries
            stream: Whether to stream responses
            computer_handler: Computer handler instance
            use_prompt_caching: Accepted for interface parity; not used here
            _on_api_start: Callback for API start
            _on_api_end: Callback for API end
            _on_usage: Callback for usage tracking
            _on_screenshot: Callback for screenshot events
            **kwargs: Additional arguments forwarded to liteLLM

        Returns:
            Dictionary with "output" (output items) and "usage" array
        """
        tools = tools or []

        # Output items for this step (screenshot, reasoning, computer actions)
        response_items = []

        # Find computer tool for screen dimensions
        computer_tool = None
        for tool_schema in tools:
            if tool_schema["type"] == "computer":
                computer_tool = tool_schema["computer"]
                break

        # Get screen dimensions; keep the defaults if the tool can't report them
        screen_width, screen_height = 1024, 768
        if computer_tool:
            try:
                screen_width, screen_height = await computer_tool.get_dimensions()
            except Exception:
                # Best-effort query: fall back to the defaults above.
                pass

        # Process messages to extract instruction and image
        instruction = ""
        image_data = None

        # Convert messages to list if string
        if isinstance(messages, str):
            messages = [{"role": "user", "content": messages}]

        # Walk the history backwards so the latest screenshot and the most
        # recent user instruction win.
        for message in reversed(messages):
            if isinstance(message, dict):
                content = message.get("content", "")

                # Handle different content formats
                if isinstance(content, str):
                    if not instruction and message.get("role") == "user":
                        instruction = content
                elif isinstance(content, list):
                    for item in content:
                        if isinstance(item, dict):
                            if item.get("type") == "text" and not instruction:
                                instruction = item.get("text", "")
                            elif item.get("type") == "image_url" and not image_data:
                                image_url = item.get("image_url", {})
                                if isinstance(image_url, dict):
                                    image_data = image_url.get("url", "")
                                else:
                                    image_data = image_url

                # Also check for computer_call_output with screenshots
                if message.get("type") == "computer_call_output" and not image_data:
                    output = message.get("output", {})
                    if isinstance(output, dict) and output.get("type") == "input_image":
                        image_data = output.get("image_url", "")

            if instruction and image_data:
                break

        if not instruction:
            instruction = (
                "Help me complete this task by analyzing the screen and taking appropriate actions."
            )

        # Create prompt
        user_prompt = UITARS_PROMPT_TEMPLATE.format(
            instruction=instruction, action_space=UITARS_ACTION_SPACE, language="English"
        )

        # Convert conversation history to LiteLLM format
        history_messages = convert_uitars_messages_to_litellm(messages)

        # Prepare messages for liteLLM
        litellm_messages = [{"role": "system", "content": "You are a helpful assistant."}]

        # Add current user instruction with screenshot
        current_user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
            ],
        }
        litellm_messages.append(current_user_message)

        # Process image for UITARS
        if not image_data:
            # Take screenshot if none found in messages
            if computer_handler:
                image_data = await computer_handler.screenshot()
                # Bug fix: the callback defaults to None and must be guarded
                # (it was previously awaited unconditionally).
                if _on_screenshot:
                    await _on_screenshot(image_data, "screenshot_before")

                # Add screenshot to output items so it can be retained in history
                response_items.append(make_input_image_item(image_data))
            else:
                raise ValueError("No screenshot found in messages and no computer_handler provided")
        processed_image, original_width, original_height = process_image_for_uitars(image_data)
        encoded_image = pil_to_base64(processed_image)

        # Add conversation history; with no history, send the current
        # screenshot as the first user turn instead.
        if history_messages:
            litellm_messages.extend(history_messages)
        else:
            litellm_messages.append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
                        }
                    ],
                }
            )

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": litellm_messages,
            "max_tokens": kwargs.get("max_tokens", 500),
            "temperature": kwargs.get("temperature", 0.0),
            "do_sample": kwargs.get("temperature", 0.0) > 0.0,
            "num_retries": max_retries,
            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
        }

        # Call API start hook
        if _on_api_start:
            await _on_api_start(api_kwargs)

        # Call liteLLM with UITARS model
        response = await litellm.acompletion(**api_kwargs)

        # Call API end hook
        if _on_api_end:
            await _on_api_end(api_kwargs, response)

        # Extract response content
        response_content = response.choices[0].message.content.strip()  # type: ignore

        # Parse UITARS response
        parsed_responses = parse_uitars_response(response_content, original_width, original_height)

        # Convert to computer actions
        computer_actions = convert_to_computer_actions(
            parsed_responses, original_width, original_height
        )

        # Add computer actions to response items. Bug fix: guard against an
        # empty parse result instead of indexing [0] unconditionally.
        thought = parsed_responses[0].get("thought", "") if parsed_responses else ""
        if thought:
            response_items.append(make_reasoning_item(thought))
        response_items.extend(computer_actions)

        # Extract usage information
        response_usage = {
            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
                response.usage
            ).model_dump(),
            "response_cost": response._hidden_params.get("response_cost", 0.0),
        }
        if _on_usage:
            await _on_usage(response_usage)

        # Create agent response
        agent_response = {"output": response_items, "usage": response_usage}

        return agent_response

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates based on image and instruction.

        UITARS supports click prediction through its action parsing.

        Args:
            model: Model name to use
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
            **kwargs: Additional arguments forwarded to liteLLM

        Returns:
            Tuple with (x, y) coordinates or None
        """
        try:
            # Create prompt using grounding template
            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(instruction=instruction)

            # Process image for UITARS
            processed_image, original_width, original_height = process_image_for_uitars(image_b64)
            encoded_image = pil_to_base64(processed_image)

            # Prepare messages for liteLLM
            litellm_messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
                        },
                    ],
                },
            ]

            # Prepare API call kwargs (deterministic decoding for grounding)
            api_kwargs = {
                "model": model,
                "messages": litellm_messages,
                "max_tokens": 2056,
                "temperature": 0.0,
                "do_sample": False,
            }
            api_kwargs.update({k: v for k, v in (kwargs or {}).items()})

            # Call liteLLM with UITARS model
            response = await litellm.acompletion(**api_kwargs)

            # Extract response content
            response_content = response.choices[0].message.content.strip()  # type: ignore

            print(response_content)

            # Parse the response to extract click coordinates
            # Look for click action with coordinates (with special tokens)
            click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
            match = re.search(click_pattern, response_content)

            # Fallback: Look for simpler format without special tokens
            if not match:
                # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
                fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
                match = re.search(fallback_pattern, response_content)

            if match:
                x, y = int(match.group(1)), int(match.group(2))
                # Scale coordinates back to original image dimensions
                scale_x = original_width / processed_image.width
                scale_y = original_height / processed_image.height

                scaled_x = int(x * scale_x)
                scaled_y = int(y * scale_y)

                return (scaled_x, scaled_y)

            return None

        except Exception as e:
            # Log error and return None
            print(f"Error in predict_click: {e}")
            return None

    def get_capabilities(self) -> List[AgentCapability]:
        """
        Get list of capabilities supported by this agent config.

        Returns:
            List of capability strings
        """
        return ["step", "click"]
```
--------------------------------------------------------------------------------
/libs/lume/src/VM/VM.swift:
--------------------------------------------------------------------------------
```swift
1 | import Foundation
2 |
3 | // MARK: - Support Types
4 |
/// Base context for virtual machine directory and configuration
struct VMDirContext {
    // Stored properties (declaration order preserved: the implicit
    // memberwise initializer depends on it).
    let dir: VMDirectory
    var config: VMConfig
    let home: Home
    let storage: String?

    // MARK: Forwarded accessors

    /// The VM's name, forwarded from its directory.
    var name: String {
        return dir.name
    }

    /// Forwarded from `dir.initialized()`.
    var initialized: Bool {
        return dir.initialized()
    }

    /// Path of the VM's disk image.
    var diskPath: Path {
        return dir.diskPath
    }

    /// Path of the VM's NVRAM file.
    var nvramPath: Path {
        return dir.nvramPath
    }

    // MARK: Mutations

    /// Persists the current config into the VM directory.
    func saveConfig() throws {
        try dir.saveConfig(config)
    }

    /// Sets the backing disk to `size` bytes via the directory.
    func setDisk(_ size: UInt64) throws {
        try dir.setDisk(size)
    }

    /// Moves this VM directory to its final location under `name`.
    func finalize(to name: String) throws {
        let destination = try home.getVMDirectory(name)
        try FileManager.default.moveItem(at: dir.dir.url, to: destination.dir.url)
    }
}
30 |
31 | // MARK: - Base VM Class
32 |
33 | /// Base class for virtual machine implementations
34 | @MainActor
35 | class VM {
36 | // MARK: - Properties
37 |
38 | var vmDirContext: VMDirContext
39 |
40 | @MainActor
41 | private var virtualizationService: VMVirtualizationService?
42 | private let vncService: VNCService
43 | internal let virtualizationServiceFactory:
44 | (VMVirtualizationServiceContext) throws -> VMVirtualizationService
45 | private let vncServiceFactory: (VMDirectory) -> VNCService
46 |
47 | // MARK: - Initialization
48 |
    /// Creates a VM bound to a directory context.
    ///
    /// - Parameters:
    ///   - vmDirContext: Directory, configuration and storage context for this VM.
    ///   - virtualizationServiceFactory: Builds the virtualization backend;
    ///     defaults to `DarwinVirtualizationService`.
    ///   - vncServiceFactory: Builds the VNC service; defaults to
    ///     `DefaultVNCService`. Also used immediately to create `vncService`.
    init(
        vmDirContext: VMDirContext,
        virtualizationServiceFactory: @escaping (VMVirtualizationServiceContext) throws ->
            VMVirtualizationService = { try DarwinVirtualizationService(configuration: $0) },
        vncServiceFactory: @escaping (VMDirectory) -> VNCService = {
            DefaultVNCService(vmDirectory: $0)
        }
    ) {
        self.vmDirContext = vmDirContext
        self.virtualizationServiceFactory = virtualizationServiceFactory
        self.vncServiceFactory = vncServiceFactory

        // Initialize VNC service
        self.vncService = vncServiceFactory(vmDirContext.dir)
    }
64 |
65 | // MARK: - VM State Management
66 |
67 | private var isRunning: Bool {
68 | // First check if we have a MAC address
69 | guard let macAddress = vmDirContext.config.macAddress else {
70 | Logger.info(
71 | "Cannot check if VM is running: macAddress is nil",
72 | metadata: ["name": vmDirContext.name])
73 | return false
74 | }
75 |
76 | // Then check if we have an IP address
77 | guard let ipAddress = DHCPLeaseParser.getIPAddress(forMAC: macAddress) else {
78 | return false
79 | }
80 |
81 | // Then check if it's reachable
82 | return NetworkUtils.isReachable(ipAddress: ipAddress)
83 | }
84 |
85 | var details: VMDetails {
86 | let isRunning: Bool = self.isRunning
87 | let vncUrl = isRunning ? getVNCUrl() : nil
88 |
89 | // Safely get disk size with fallback
90 | let diskSizeValue: DiskSize
91 | do {
92 | diskSizeValue = try getDiskSize()
93 | } catch {
94 | Logger.error(
95 | "Failed to get disk size",
96 | metadata: ["name": vmDirContext.name, "error": "\(error)"])
97 | // Provide a fallback value to avoid crashing
98 | diskSizeValue = DiskSize(allocated: 0, total: vmDirContext.config.diskSize ?? 0)
99 | }
100 |
101 | // Safely access MAC address
102 | let macAddress = vmDirContext.config.macAddress
103 | let ipAddress: String? =
104 | isRunning && macAddress != nil ? DHCPLeaseParser.getIPAddress(forMAC: macAddress!) : nil
105 |
106 | return VMDetails(
107 | name: vmDirContext.name,
108 | os: getOSType(),
109 | cpuCount: vmDirContext.config.cpuCount ?? 0,
110 | memorySize: vmDirContext.config.memorySize ?? 0,
111 | diskSize: diskSizeValue,
112 | display: vmDirContext.config.display.string,
113 | status: isRunning ? "running" : "stopped",
114 | vncUrl: vncUrl,
115 | ipAddress: ipAddress,
116 | locationName: vmDirContext.storage ?? "default"
117 | )
118 | }
119 |
120 | // MARK: - VM Lifecycle Management
121 |
    /// Boots the VM and keeps the process alive until it is terminated.
    ///
    /// An exclusive `flock` on the VM's config file serves as the
    /// single-instance lock: failure to acquire it is treated as "already
    /// running" (after one emergency cleanup attempt via `unlockConfigFile`).
    ///
    /// - Parameters:
    ///   - noDisplay: Forwarded to VNC session setup.
    ///   - sharedDirectories: Host directories shared into the guest.
    ///   - mount: Optional image path to mount.
    ///   - vncPort: Requested VNC port; 0 lets the service choose.
    ///   - recoveryMode: Whether to boot into recovery mode.
    ///   - usbMassStoragePaths: Optional images attached as USB mass storage.
    /// - Throws: `VMError.notInitialized`, `VMError.alreadyRunning`, or any
    ///   error from configuration/VNC/virtualization setup.
    /// - Note: On success this method never returns; it sleeps in an infinite
    ///   loop while holding the lock, until the process is killed.
    func run(
        noDisplay: Bool, sharedDirectories: [SharedDirectory], mount: Path?, vncPort: Int = 0,
        recoveryMode: Bool = false, usbMassStoragePaths: [Path]? = nil
    ) async throws {
        Logger.info(
            "VM.run method called",
            metadata: [
                "name": vmDirContext.name,
                "noDisplay": "\(noDisplay)",
                "recoveryMode": "\(recoveryMode)",
            ])

        guard vmDirContext.initialized else {
            Logger.error("VM not initialized", metadata: ["name": vmDirContext.name])
            throw VMError.notInitialized(vmDirContext.name)
        }

        guard let cpuCount = vmDirContext.config.cpuCount,
            let memorySize = vmDirContext.config.memorySize
        else {
            Logger.error("VM missing cpuCount or memorySize", metadata: ["name": vmDirContext.name])
            throw VMError.notInitialized(vmDirContext.name)
        }

        // Try to acquire lock on config file
        Logger.info(
            "Attempting to acquire lock on config file",
            metadata: [
                "path": vmDirContext.dir.configPath.path,
                "name": vmDirContext.name,
            ])
        var fileHandle = try FileHandle(forWritingTo: vmDirContext.dir.configPath.url)

        if flock(fileHandle.fileDescriptor, LOCK_EX | LOCK_NB) != 0 {
            try? fileHandle.close()
            Logger.error(
                "VM already running (failed to acquire lock)", metadata: ["name": vmDirContext.name]
            )

            // Try to forcibly clear the lock before giving up
            Logger.info("Attempting emergency lock cleanup", metadata: ["name": vmDirContext.name])
            unlockConfigFile()

            // Try one more time to acquire the lock
            if let retryHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url),
                flock(retryHandle.fileDescriptor, LOCK_EX | LOCK_NB) == 0
            {
                Logger.info("Emergency lock cleanup worked", metadata: ["name": vmDirContext.name])
                // Continue with a fresh file handle
                try? retryHandle.close()
                // Get a completely new file handle to be safe
                guard let newHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url)
                else {
                    throw VMError.internalError("Failed to open file handle after lock cleanup")
                }
                // Update our main file handle
                fileHandle = newHandle
            } else {
                // If we still can't get the lock, give up
                Logger.error(
                    "Could not acquire lock even after emergency cleanup",
                    metadata: ["name": vmDirContext.name])
                throw VMError.alreadyRunning(vmDirContext.name)
            }
        }
        Logger.info("Successfully acquired lock", metadata: ["name": vmDirContext.name])

        Logger.info(
            "Running VM with configuration",
            metadata: [
                "name": vmDirContext.name,
                "cpuCount": "\(cpuCount)",
                "memorySize": "\(memorySize)",
                "diskSize": "\(vmDirContext.config.diskSize ?? 0)",
                "sharedDirectories": sharedDirectories.map { $0.string }.joined(separator: ", "),
                "recoveryMode": "\(recoveryMode)",
            ])

        // Create and configure the VM
        do {
            Logger.info(
                "Creating virtualization service context", metadata: ["name": vmDirContext.name])
            let config = try createVMVirtualizationServiceContext(
                cpuCount: cpuCount,
                memorySize: memorySize,
                display: vmDirContext.config.display.string,
                sharedDirectories: sharedDirectories,
                mount: mount,
                recoveryMode: recoveryMode,
                usbMassStoragePaths: usbMassStoragePaths
            )
            Logger.info(
                "Successfully created virtualization service context",
                metadata: ["name": vmDirContext.name])

            Logger.info(
                "Initializing virtualization service", metadata: ["name": vmDirContext.name])
            virtualizationService = try virtualizationServiceFactory(config)
            Logger.info(
                "Successfully initialized virtualization service",
                metadata: ["name": vmDirContext.name])

            Logger.info(
                "Setting up VNC",
                metadata: [
                    "name": vmDirContext.name,
                    "noDisplay": "\(noDisplay)",
                    "port": "\(vncPort)",
                ])
            let vncInfo = try await setupSession(
                noDisplay: noDisplay, port: vncPort, sharedDirectories: sharedDirectories)
            Logger.info(
                "VNC setup successful", metadata: ["name": vmDirContext.name, "vncInfo": vncInfo])

            // Start the VM
            guard let service = virtualizationService else {
                Logger.error("Virtualization service is nil", metadata: ["name": vmDirContext.name])
                throw VMError.internalError("Virtualization service not initialized")
            }
            Logger.info(
                "Starting VM via virtualization service", metadata: ["name": vmDirContext.name])
            try await service.start()
            Logger.info("VM started successfully", metadata: ["name": vmDirContext.name])

            // Keep the process (and the flock) alive for the VM's lifetime;
            // exiting this loop only happens via cancellation/termination.
            while true {
                try await Task.sleep(nanoseconds: UInt64(1e9))
            }
        } catch {
            Logger.error(
                "Failed in VM.run",
                metadata: [
                    "name": vmDirContext.name,
                    "error": error.localizedDescription,
                    "errorType": "\(type(of: error))",
                ])
            virtualizationService = nil
            vncService.stop()

            // Release lock
            Logger.info("Releasing file lock after error", metadata: ["name": vmDirContext.name])
            flock(fileHandle.fileDescriptor, LOCK_UN)
            try? fileHandle.close()

            // Additionally, perform our aggressive unlock to ensure no locks remain
            Logger.info(
                "Performing additional lock cleanup after error",
                metadata: ["name": vmDirContext.name])
            unlockConfigFile()

            throw error
        }
    }
274 |
    /// Stops the VM: first via the virtualization service, then — if that
    /// fails or no service is attached — by terminating the process holding
    /// the config-file lock.
    ///
    /// Escalation order: `service.stop()` → SIGINT with up to 10 × 1 s waits
    /// → SIGKILL. Every exit path also calls `unlockConfigFile()` so a stale
    /// flock never blocks the next `run`.
    /// - Throws: `VMError.notInitialized`, `VMError.notRunning`, or
    ///   `VMError.internalError` when the process cannot be terminated.
    @MainActor
    func stop() async throws {
        guard vmDirContext.initialized else {
            throw VMError.notInitialized(vmDirContext.name)
        }

        Logger.info("Attempting to stop VM", metadata: ["name": vmDirContext.name])

        // If we have a virtualization service, try to stop it cleanly first
        if let service = virtualizationService {
            do {
                Logger.info(
                    "Stopping VM via virtualization service", metadata: ["name": vmDirContext.name])
                try await service.stop()
                virtualizationService = nil
                vncService.stop()
                Logger.info(
                    "VM stopped successfully via virtualization service",
                    metadata: ["name": vmDirContext.name])

                // Try to ensure any existing locks are released
                Logger.info(
                    "Attempting to clear any locks on config file",
                    metadata: ["name": vmDirContext.name])
                unlockConfigFile()

                return
            } catch let error {
                Logger.error(
                    "Failed to stop VM via virtualization service",
                    metadata: [
                        "name": vmDirContext.name,
                        "error": error.localizedDescription,
                    ])
                // Fall through to process termination
            }
        }

        // Try to open config file to get file descriptor
        Logger.info(
            "Attempting to access config file lock",
            metadata: [
                "path": vmDirContext.dir.configPath.path,
                "name": vmDirContext.name,
            ])
        let fileHandle = try? FileHandle(forReadingFrom: vmDirContext.dir.configPath.url)
        guard let fileHandle = fileHandle else {
            Logger.info(
                "Failed to open config file - VM may not be running",
                metadata: ["name": vmDirContext.name])

            // Even though we couldn't open the file, try to force unlock anyway
            unlockConfigFile()

            throw VMError.notRunning(vmDirContext.name)
        }

        // Get the PID of the process holding the lock using lsof command
        // (lsof -F p emits machine-readable output: "p<pid>" per process).
        Logger.info(
            "Finding process holding lock on config file", metadata: ["name": vmDirContext.name])
        let task = Process()
        task.executableURL = URL(fileURLWithPath: "/usr/sbin/lsof")
        task.arguments = ["-F", "p", vmDirContext.dir.configPath.path]

        let outputPipe = Pipe()
        task.standardOutput = outputPipe

        try task.run()
        task.waitUntilExit()

        let outputData = try outputPipe.fileHandleForReading.readToEnd() ?? Data()
        guard let outputString = String(data: outputData, encoding: .utf8),
            let pidString = outputString.split(separator: "\n").first?.dropFirst(),  // Drop the 'p' prefix
            let pid = pid_t(pidString)
        else {
            try? fileHandle.close()
            Logger.info(
                "Failed to find process holding lock - VM may not be running",
                metadata: ["name": vmDirContext.name])

            // Even though we couldn't find the process, try to force unlock
            unlockConfigFile()

            throw VMError.notRunning(vmDirContext.name)
        }

        Logger.info(
            "Found process \(pid) holding lock on config file",
            metadata: ["name": vmDirContext.name])

        // First try graceful shutdown with SIGINT
        if kill(pid, SIGINT) == 0 {
            Logger.info("Sent SIGINT to VM process \(pid)", metadata: ["name": vmDirContext.name])
        }

        // Wait for process to stop with timeout
        var attempts = 0
        while attempts < 10 {
            Logger.info(
                "Waiting for process \(pid) to terminate (attempt \(attempts + 1)/10)",
                metadata: ["name": vmDirContext.name])
            try await Task.sleep(nanoseconds: 1_000_000_000)

            // Check if process still exists (kill with signal 0 only probes)
            if kill(pid, 0) != 0 {
                // Process is gone, do final cleanup
                Logger.info("Process \(pid) has terminated", metadata: ["name": vmDirContext.name])
                virtualizationService = nil
                vncService.stop()
                try? fileHandle.close()

                // Force unlock the config file
                unlockConfigFile()

                Logger.info(
                    "VM stopped successfully via process termination",
                    metadata: ["name": vmDirContext.name])
                return
            }
            attempts += 1
        }

        // If graceful shutdown failed, force kill the process
        Logger.info(
            "Graceful shutdown failed, forcing termination of process \(pid)",
            metadata: ["name": vmDirContext.name])
        if kill(pid, SIGKILL) == 0 {
            Logger.info("Sent SIGKILL to process \(pid)", metadata: ["name": vmDirContext.name])

            // Wait a moment for the process to be fully killed
            try await Task.sleep(nanoseconds: 2_000_000_000)

            // Do final cleanup
            virtualizationService = nil
            vncService.stop()
            try? fileHandle.close()

            // Force unlock the config file
            unlockConfigFile()

            Logger.info("VM forcefully stopped", metadata: ["name": vmDirContext.name])
            return
        }

        // If we get here, something went very wrong
        try? fileHandle.close()
        Logger.error(
            "Failed to stop VM - could not terminate process \(pid)",
            metadata: ["name": vmDirContext.name])

        // As a last resort, try to force unlock
        unlockConfigFile()

        throw VMError.internalError("Failed to stop VM process")
    }
430 |
    /// Forcibly clears any `flock` left on the VM's config file.
    ///
    /// Three escalating attempts: (1) `fcntl(F_SETLK)`/`flock(LOCK_UN)` on a
    /// fresh handle, (2) acquire-and-immediately-release a test lock, and
    /// (3) back up, delete, and restore the config file so the kernel drops
    /// all locks held on the old inode. All failures are logged and
    /// swallowed — this is a best-effort cleanup helper.
    ///
    /// NOTE(review): the delete-and-restore step recreates the file, which may
    /// drop ownership/permissions and races concurrent readers — confirm this
    /// trade-off is acceptable.
    private func unlockConfigFile() {
        Logger.info(
            "Forcibly clearing locks on config file",
            metadata: [
                "path": vmDirContext.dir.configPath.path,
                "name": vmDirContext.name,
            ])

        // First attempt: standard unlock methods
        if let fileHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url) {
            // Use F_GETLK and F_SETLK to check and clear locks
            var lockInfo = flock()
            lockInfo.l_type = Int16(F_UNLCK)
            lockInfo.l_whence = Int16(SEEK_SET)
            lockInfo.l_start = 0
            lockInfo.l_len = 0

            // Try to unlock the file using fcntl
            _ = fcntl(fileHandle.fileDescriptor, F_SETLK, &lockInfo)

            // Also try the regular flock method
            flock(fileHandle.fileDescriptor, LOCK_UN)

            try? fileHandle.close()
            Logger.info("Standard unlock attempts performed", metadata: ["name": vmDirContext.name])
        }

        // Second attempt: try to acquire and immediately release a fresh lock
        if let tempHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url) {
            if flock(tempHandle.fileDescriptor, LOCK_EX | LOCK_NB) == 0 {
                Logger.info(
                    "Successfully acquired and released lock to reset state",
                    metadata: ["name": vmDirContext.name])
                flock(tempHandle.fileDescriptor, LOCK_UN)
            } else {
                Logger.info(
                    "Could not acquire lock for resetting - may still be locked",
                    metadata: ["name": vmDirContext.name])
            }
            try? tempHandle.close()
        }

        // Third attempt (most aggressive): copy the config file, remove the original, and restore
        Logger.info(
            "Trying aggressive method: backup and restore config file",
            metadata: ["name": vmDirContext.name])
        // Only proceed if the config file exists
        let fileManager = FileManager.default
        let configPath = vmDirContext.dir.configPath.path
        let backupPath = configPath + ".backup"

        if fileManager.fileExists(atPath: configPath) {
            // Create a backup of the config file
            if let configData = try? Data(contentsOf: URL(fileURLWithPath: configPath)) {
                // Make backup
                try? configData.write(to: URL(fileURLWithPath: backupPath))

                // Remove the original file to clear all locks
                try? fileManager.removeItem(atPath: configPath)
                Logger.info(
                    "Removed original config file to clear locks",
                    metadata: ["name": vmDirContext.name])

                // Wait a moment for OS to fully release resources
                Thread.sleep(forTimeInterval: 0.1)

                // Restore from backup
                try? configData.write(to: URL(fileURLWithPath: configPath))
                Logger.info(
                    "Restored config file from backup", metadata: ["name": vmDirContext.name])
            } else {
                Logger.error(
                    "Could not read config file content for backup",
                    metadata: ["name": vmDirContext.name])
            }
        } else {
            Logger.info(
                "Config file does not exist, cannot perform aggressive unlock",
                metadata: ["name": vmDirContext.name])
        }

        // Final check: verify by trying to take (and releasing) a test lock
        if let finalHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url) {
            let lockResult = flock(finalHandle.fileDescriptor, LOCK_EX | LOCK_NB)
            if lockResult == 0 {
                Logger.info(
                    "Lock successfully cleared - verified by acquiring test lock",
                    metadata: ["name": vmDirContext.name])
                flock(finalHandle.fileDescriptor, LOCK_UN)
            } else {
                Logger.info(
                    "Lock still present after all clearing attempts",
                    metadata: ["name": vmDirContext.name, "severity": "warning"])
            }
            try? finalHandle.close()
        }
    }
529 |
530 | // MARK: - Resource Management
531 |
    /// Replaces the in-memory VM configuration and persists it to disk.
    /// - Parameter vmConfig: The new configuration to store.
    /// - Throws: Any error raised while writing the config file.
    func updateVMConfig(vmConfig: VMConfig) throws {
        vmDirContext.config = vmConfig
        try vmDirContext.saveConfig()
    }
536 |
537 | private func getDiskSize() throws -> DiskSize {
538 | let resourceValues = try vmDirContext.diskPath.url.resourceValues(forKeys: [
539 | .totalFileAllocatedSizeKey,
540 | .totalFileSizeKey,
541 | ])
542 |
543 | guard let allocated = resourceValues.totalFileAllocatedSize,
544 | let total = resourceValues.totalFileSize
545 | else {
546 | throw VMConfigError.invalidDiskSize
547 | }
548 |
549 | return DiskSize(allocated: UInt64(allocated), total: UInt64(total))
550 | }
551 |
552 | func resizeDisk(_ newSize: UInt64) throws {
553 | let currentSize = try getDiskSize()
554 |
555 | guard newSize >= currentSize.total else {
556 | throw VMError.resizeTooSmall(current: currentSize.total, requested: newSize)
557 | }
558 |
559 | try setDiskSize(newSize)
560 | }
561 |
562 | func setCpuCount(_ newCpuCount: Int) throws {
563 | guard !isRunning else {
564 | throw VMError.alreadyRunning(vmDirContext.name)
565 | }
566 | vmDirContext.config.setCpuCount(newCpuCount)
567 | try vmDirContext.saveConfig()
568 | }
569 |
570 | func setMemorySize(_ newMemorySize: UInt64) throws {
571 | guard !isRunning else {
572 | throw VMError.alreadyRunning(vmDirContext.name)
573 | }
574 | vmDirContext.config.setMemorySize(newMemorySize)
575 | try vmDirContext.saveConfig()
576 | }
577 |
    /// Resizes the backing disk image, then records the new size in config.
    /// NOTE(review): unlike the other setters this does not reject a running
    /// VM — confirm whether online resize is intentional.
    func setDiskSize(_ newDiskSize: UInt64) throws {
        try vmDirContext.setDisk(newDiskSize)
        vmDirContext.config.setDiskSize(newDiskSize)
        try vmDirContext.saveConfig()
    }
583 |
/// Sets the display resolution from a string such as "1920x1080".
/// - Throws: `VMError.alreadyRunning` while the VM runs;
///   `VMError.invalidDisplayResolution` if the string does not parse.
func setDisplay(_ newDisplay: String) throws {
    if isRunning {
        throw VMError.alreadyRunning(vmDirContext.name)
    }
    guard let resolution = VMDisplayResolution(string: newDisplay) else {
        throw VMError.invalidDisplayResolution(newDisplay)
    }
    vmDirContext.config.setDisplay(resolution)
    try vmDirContext.saveConfig()
}
594 |
/// Stores the opaque virtualization hardware-model blob in the config.
/// - Throws: `VMError.alreadyRunning` if the VM is running.
func setHardwareModel(_ newHardwareModel: Data) throws {
    if isRunning {
        throw VMError.alreadyRunning(vmDirContext.name)
    }
    vmDirContext.config.setHardwareModel(newHardwareModel)
    try vmDirContext.saveConfig()
}
602 |
/// Stores the opaque machine-identifier blob in the config.
/// - Throws: `VMError.alreadyRunning` if the VM is running.
func setMachineIdentifier(_ newMachineIdentifier: Data) throws {
    if isRunning {
        throw VMError.alreadyRunning(vmDirContext.name)
    }
    vmDirContext.config.setMachineIdentifier(newMachineIdentifier)
    try vmDirContext.saveConfig()
}
610 |
/// Stores the VM's MAC address string in the config.
/// - Throws: `VMError.alreadyRunning` if the VM is running.
func setMacAddress(_ newMacAddress: String) throws {
    if isRunning {
        throw VMError.alreadyRunning(vmDirContext.name)
    }
    vmDirContext.config.setMacAddress(newMacAddress)
    try vmDirContext.saveConfig()
}
618 |
619 | // MARK: - VNC Management
620 |
/// Returns the current VNC URL, or nil when the VNC service is not running.
func getVNCUrl() -> String? {
    vncService.url
}
624 |
/// Starts the VNC service against the live virtual machine and returns its URL.
/// - Parameter port: port to bind; 0 lets the service pick one.
/// - Throws: `VMError.internalError` if the virtualization service is missing,
///   `VMError.vncNotConfigured` if no URL is available after start.
private func startVNCService(port: Int = 0) async throws -> String {
    guard let service = virtualizationService else {
        throw VMError.internalError("Virtualization service not initialized")
    }
    try await vncService.start(port: port, virtualMachine: service.getVirtualMachine())
    if let url = vncService.url {
        return url
    }
    throw VMError.vncNotConfigured
}
639 |
/// Persists the VNC session (URL plus any shared directories) into the VM
/// directory. Failures are logged, never thrown — session persistence is
/// best-effort.
private func saveSessionData(url: String, sharedDirectories: [SharedDirectory]) {
    let session = VNCSession(
        url: url, sharedDirectories: sharedDirectories.isEmpty ? nil : sharedDirectories)
    do {
        try vmDirContext.dir.saveSession(session)
        Logger.info(
            "Saved VNC session with shared directories",
            metadata: [
                "count": "\(sharedDirectories.count)",
                "dirs": "\(sharedDirectories.map { $0.hostPath }.joined(separator: ", "))",
                "sessionsPath": "\(vmDirContext.dir.sessionsPath.path)",
            ])
    } catch {
        Logger.error("Failed to save VNC session", metadata: ["error": "\(error)"])
    }
}
657 |
/// Starts VNC, persists the session data, and (unless `noDisplay`) opens the
/// local VNC client. Returns the VNC URL.
private func setupSession(
    noDisplay: Bool, port: Int = 0, sharedDirectories: [SharedDirectory] = []
) async throws -> String {
    let url = try await startVNCService(port: port)

    saveSessionData(url: url, sharedDirectories: sharedDirectories)

    guard !noDisplay else { return url }

    Logger.info("Starting VNC session", metadata: ["name": vmDirContext.name])
    try await vncService.openClient(url: url)
    return url
}
676 |
677 | // MARK: - Platform-specific Methods
678 |
/// Returns the guest OS type identifier for this VM.
/// Abstract: subclasses must override; the base implementation traps.
func getOSType() -> String {
    fatalError("Must be implemented by subclass")
}
682 |
/// Builds the context object handed to the virtualization service factory.
/// Runs a disk-state diagnostic first so storage-attachment failures are
/// traceable in the logs.
/// - Throws: `VMError.internalError` if the config has no MAC address
///   (previously this force-unwrapped and crashed the process); rethrows
///   errors from `validateDiskState()`.
func createVMVirtualizationServiceContext(
    cpuCount: Int,
    memorySize: UInt64,
    display: String,
    sharedDirectories: [SharedDirectory] = [],
    mount: Path? = nil,
    recoveryMode: Bool = false,
    usbMassStoragePaths: [Path]? = nil
) throws -> VMVirtualizationServiceContext {
    // This is a diagnostic log to track actual file paths on disk for debugging
    try validateDiskState()

    // Fail with a typed error instead of crashing when the VM was never
    // assigned a MAC address (e.g. a corrupted or hand-edited config).
    guard let macAddress = vmDirContext.config.macAddress else {
        throw VMError.internalError("VM config is missing a MAC address")
    }

    return VMVirtualizationServiceContext(
        cpuCount: cpuCount,
        memorySize: memorySize,
        display: display,
        sharedDirectories: sharedDirectories,
        mount: mount,
        hardwareModel: vmDirContext.config.hardwareModel,
        machineIdentifier: vmDirContext.config.machineIdentifier,
        macAddress: macAddress,
        diskPath: vmDirContext.diskPath,
        nvramPath: vmDirContext.nvramPath,
        recoveryMode: recoveryMode,
        usbMassStoragePaths: usbMassStoragePaths
    )
}
710 |
/// Validates the disk image's on-disk state and logs diagnostics to help
/// debug storage-attachment issues. Never throws for a bad state — it only
/// logs errors — but rethrows nothing today (`throws` kept for API stability).
private func validateDiskState() throws {
    let diskPath = vmDirContext.diskPath.path
    let diskExists = FileManager.default.fileExists(atPath: diskPath)
    var diskSize: UInt64 = 0
    var diskPermissions = ""

    if diskExists {
        if let attrs = try? FileManager.default.attributesOfItem(atPath: diskPath) {
            diskSize = attrs[.size] as? UInt64 ?? 0
            let posixPerms = attrs[.posixPermissions] as? Int ?? 0
            diskPermissions = String(format: "%o", posixPerms)
        }
    }

    // Check the containing directory's permissions. The previous
    // `try? ... as? Int ?? 0` chain folded a *missing* permissions key into
    // "0"; now both "attributes unreadable" and "key absent" report "unknown".
    let diskDir = (diskPath as NSString).deletingLastPathComponent
    let dirPermsString: String
    if let dirAttrs = try? FileManager.default.attributesOfItem(atPath: diskDir),
        let dirPosix = dirAttrs[.posixPermissions] as? Int
    {
        dirPermsString = String(format: "%o", dirPosix)
    } else {
        dirPermsString = "unknown"
    }

    // Log detailed diagnostics
    Logger.info(
        "Validating VM disk state",
        metadata: [
            "diskPath": diskPath,
            "diskExists": "\(diskExists)",
            "diskSize":
                "\(ByteCountFormatter.string(fromByteCount: Int64(diskSize), countStyle: .file))",
            "diskPermissions": diskPermissions,
            "dirPermissions": dirPermsString,
            "locationName": vmDirContext.storage ?? "default",
        ])

    if !diskExists {
        Logger.error("VM disk image does not exist", metadata: ["diskPath": diskPath])
    } else if diskSize == 0 {
        Logger.error("VM disk image exists but has zero size", metadata: ["diskPath": diskPath])
    }
}
753 |
/// Performs first-time VM installation from an IPSW image with the given
/// hardware parameters.
/// Abstract: subclasses must override; the base implementation traps.
func setup(
    ipswPath: String,
    cpuCount: Int,
    memorySize: UInt64,
    diskSize: UInt64,
    display: String
) async throws {
    fatalError("Must be implemented by subclass")
}
763 |
764 | // MARK: - Finalization
765 |
/// Post-installation step: relocates the VM directory into the target home,
/// giving it its final name.
func finalize(to name: String, home: Home, storage: String? = nil) throws {
    let destination = try home.getVMDirectory(name, storage: storage)
    try FileManager.default.moveItem(at: vmDirContext.dir.dir.url, to: destination.dir.url)
}
771 |
/// Runs the VM with additional USB mass-storage devices attached.
///
/// Acquires an exclusive advisory lock on the config file so a second `run`
/// of the same VM fails fast with `alreadyRunning`. On the success path the
/// lock (and file handle) are deliberately held for the life of the process —
/// the method never returns normally; it parks in an infinite sleep loop
/// until the process is terminated. The lock is released only on the error
/// path.
func runWithUSBStorage(
    noDisplay: Bool, sharedDirectories: [SharedDirectory], mount: Path?, vncPort: Int = 0,
    recoveryMode: Bool = false, usbImagePaths: [Path]
) async throws {
    guard vmDirContext.initialized else {
        throw VMError.notInitialized(vmDirContext.name)
    }

    // cpuCount/memorySize are required; their absence means setup never ran.
    guard let cpuCount = vmDirContext.config.cpuCount,
        let memorySize = vmDirContext.config.memorySize
    else {
        throw VMError.notInitialized(vmDirContext.name)
    }

    // Try to acquire lock on config file (non-blocking: LOCK_NB makes a held
    // lock surface immediately as "already running" rather than hanging).
    let fileHandle = try FileHandle(forWritingTo: vmDirContext.dir.configPath.url)
    guard flock(fileHandle.fileDescriptor, LOCK_EX | LOCK_NB) == 0 else {
        try? fileHandle.close()
        throw VMError.alreadyRunning(vmDirContext.name)
    }

    Logger.info(
        "Running VM with USB storage devices",
        metadata: [
            "cpuCount": "\(cpuCount)",
            "memorySize": "\(memorySize)",
            "diskSize": "\(vmDirContext.config.diskSize ?? 0)",
            "usbImageCount": "\(usbImagePaths.count)",
            "recoveryMode": "\(recoveryMode)",
        ])

    // Create and configure the VM
    do {
        let config = try createVMVirtualizationServiceContext(
            cpuCount: cpuCount,
            memorySize: memorySize,
            display: vmDirContext.config.display.string,
            sharedDirectories: sharedDirectories,
            mount: mount,
            recoveryMode: recoveryMode,
            usbMassStoragePaths: usbImagePaths
        )
        virtualizationService = try virtualizationServiceFactory(config)

        let vncInfo = try await setupSession(
            noDisplay: noDisplay, port: vncPort, sharedDirectories: sharedDirectories)
        Logger.info("VNC info", metadata: ["vncInfo": vncInfo])

        // Start the VM
        guard let service = virtualizationService else {
            throw VMError.internalError("Virtualization service not initialized")
        }
        try await service.start()

        // Intentional: keep the process (and the config lock) alive while the
        // VM runs; termination happens externally.
        while true {
            try await Task.sleep(nanoseconds: UInt64(1e9))
        }
    } catch {
        Logger.error(
            "Failed to create/start VM with USB storage",
            metadata: [
                "error": "\(error)",
                "errorType": "\(type(of: error))",
            ])
        // Tear down partially-started services, then release the config lock.
        virtualizationService = nil
        vncService.stop()
        // Release lock
        flock(fileHandle.fileDescriptor, LOCK_UN)
        try? fileHandle.close()
        throw error
    }
}
845 | }
846 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/agent.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | ComputerAgent - Main agent class that selects and runs agent loops
3 | """
4 |
5 | import asyncio
6 | import inspect
7 | import json
8 | from pathlib import Path
9 | from typing import (
10 | Any,
11 | AsyncGenerator,
12 | Callable,
13 | Dict,
14 | List,
15 | Optional,
16 | Set,
17 | Tuple,
18 | Union,
19 | cast,
20 | )
21 |
22 | import litellm
23 | import litellm.utils
24 | from litellm.responses.utils import Usage
25 |
26 | from .adapters import (
27 | AzureMLAdapter,
28 | CUAAdapter,
29 | HuggingFaceLocalAdapter,
30 | HumanAdapter,
31 | MLXVLMAdapter,
32 | )
33 | from .callbacks import (
34 | BudgetManagerCallback,
35 | ImageRetentionCallback,
36 | LoggingCallback,
37 | OperatorNormalizerCallback,
38 | PromptInstructionsCallback,
39 | TelemetryCallback,
40 | TrajectorySaverCallback,
41 | )
42 | from .computers import AsyncComputerHandler, is_agent_computer, make_computer_handler
43 | from .decorators import find_agent_config
44 | from .responses import (
45 | make_tool_error_item,
46 | replace_failed_computer_calls_with_function_calls,
47 | )
48 | from .tools.base import BaseComputerTool, BaseTool
49 | from .types import AgentCapability, IllegalArgumentError, Messages, ToolError
50 |
51 |
def assert_callable_with(f, *args, **kwargs):
    """Validate that ``f`` accepts the given arguments.

    Returns True when the signature binds; raises IllegalArgumentError
    (chained from the underlying TypeError) when it does not.
    """
    sig = inspect.signature(f)
    try:
        sig.bind(*args, **kwargs)
    except TypeError as exc:
        raise IllegalArgumentError(f"Expected {sig}, got args={args} kwargs={kwargs}") from exc
    return True
60 |
61 |
def get_json(obj: Any, max_depth: int = 10) -> Any:
    """Convert an arbitrary object into JSON-safe data.

    Best-effort serialization with three protections: pydantic-style
    ``model_dump()`` is preferred when present, recursion is capped at
    ``max_depth``, and circular references are detected via object ids.
    None values are stripped from dicts and lists throughout.
    """

    def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any:
        # `seen` holds ids of objects on the *current* path; each child gets a
        # copy so sibling branches can legitimately reuse the same object.
        if seen is None:
            seen = set()

        # Use model_dump() if available
        # NOTE: checked before the depth/cycle guards, so model objects are
        # dumped whole regardless of nesting depth.
        if hasattr(o, "model_dump"):
            return o.model_dump()

        # Check depth limit
        if depth > max_depth:
            return f"<max_depth_exceeded:{max_depth}>"

        # Check for circular references using object id
        obj_id = id(o)
        if obj_id in seen:
            return f"<circular_reference:{type(o).__name__}>"

        # Handle Computer objects — replaced with a placeholder rather than
        # serialized (they hold live connections/state).
        if hasattr(o, "__class__") and "computer" in o.__class__.__name__.lower():
            return f"<computer:{o.__class__.__name__}>"

        # Handle objects with __dict__
        if hasattr(o, "__dict__"):
            seen.add(obj_id)
            try:
                result = {}
                for k, v in o.__dict__.items():
                    if v is not None:
                        # Recursively serialize with updated depth and seen set
                        serialized_value = custom_serializer(v, depth + 1, seen.copy())
                        result[k] = serialized_value
                return result
            finally:
                seen.discard(obj_id)

        # Handle common types that might contain nested objects
        elif isinstance(o, dict):
            seen.add(obj_id)
            try:
                return {
                    k: custom_serializer(v, depth + 1, seen.copy())
                    for k, v in o.items()
                    if v is not None
                }
            finally:
                seen.discard(obj_id)

        # Tuples and sets are flattened to JSON arrays (lists).
        elif isinstance(o, (list, tuple, set)):
            seen.add(obj_id)
            try:
                return [
                    custom_serializer(item, depth + 1, seen.copy())
                    for item in o
                    if item is not None
                ]
            finally:
                seen.discard(obj_id)

        # For basic types that json.dumps can handle
        elif isinstance(o, (str, int, float, bool)) or o is None:
            return o

        # Fallback to string representation
        else:
            return str(o)

    def remove_nones(obj: Any) -> Any:
        # Second pass: model_dump() results were not filtered above, so they
        # can still contain None values.
        if isinstance(obj, dict):
            return {k: remove_nones(v) for k, v in obj.items() if v is not None}
        elif isinstance(obj, list):
            return [remove_nones(item) for item in obj if item is not None]
        return obj

    # Serialize with circular reference and depth protection
    serialized = custom_serializer(obj)

    # Convert to JSON string and back to ensure JSON compatibility
    json_str = json.dumps(serialized)
    parsed = json.loads(json_str)

    # Final cleanup of any remaining None values
    return remove_nones(parsed)
145 |
146 |
147 | def sanitize_message(msg: Any) -> Any:
148 | """Return a copy of the message with image_url omitted for computer_call_output messages."""
149 | if msg.get("type") == "computer_call_output":
150 | output = msg.get("output", {})
151 | if isinstance(output, dict):
152 | sanitized = msg.copy()
153 | sanitized["output"] = {**output, "image_url": "[omitted]"}
154 | return sanitized
155 | return msg
156 |
157 |
158 | def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]:
159 | call_ids = []
160 | for message in messages:
161 | if (
162 | message.get("type") == "computer_call_output"
163 | or message.get("type") == "function_call_output"
164 | ):
165 | call_ids.append(message.get("call_id"))
166 | return call_ids
167 |
168 |
169 | class ComputerAgent:
170 | """
171 | Main agent class that automatically selects the appropriate agent loop
172 | based on the model and executes tool calls.
173 | """
174 |
175 | def __init__(
176 | self,
177 | model: str,
178 | tools: Optional[List[Any]] = None,
179 | custom_loop: Optional[Callable] = None,
180 | only_n_most_recent_images: Optional[int] = None,
181 | callbacks: Optional[List[Any]] = None,
182 | instructions: Optional[str] = None,
183 | verbosity: Optional[int] = None,
184 | trajectory_dir: Optional[str | Path | dict] = None,
185 | max_retries: Optional[int] = 3,
186 | screenshot_delay: Optional[float | int] = 0.5,
187 | use_prompt_caching: Optional[bool] = False,
188 | max_trajectory_budget: Optional[float | dict] = None,
189 | telemetry_enabled: Optional[bool] = True,
190 | trust_remote_code: Optional[bool] = False,
191 | api_key: Optional[str] = None,
192 | api_base: Optional[str] = None,
193 | **additional_generation_kwargs,
194 | ):
195 | """
196 | Initialize ComputerAgent.
197 |
198 | Args:
199 | model: Model name (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
200 | tools: List of tools (computer objects, decorated functions, etc.)
201 | custom_loop: Custom agent loop function to use instead of auto-selection
202 | only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
203 | callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing
204 | instructions: Optional system instructions to be passed to the model
205 | verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically
206 | trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically.
207 | max_retries: Maximum number of retries for failed API calls
208 | screenshot_delay: Delay before screenshots in seconds
209 | use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers.
210 | max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded
211 | telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
212 | trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
213 | api_key: Optional API key override for the model provider
214 | api_base: Optional API base URL override for the model provider
215 | **additional_generation_kwargs: Additional arguments passed to the model provider
216 | """
217 | # If the loop is "human/human", we need to prefix a grounding model fallback
218 | if model in ["human/human", "human"]:
219 | model = "openai/computer-use-preview+human/human"
220 |
221 | self.model = model
222 | self.tools = tools or []
223 | self.custom_loop = custom_loop
224 | self.only_n_most_recent_images = only_n_most_recent_images
225 | self.callbacks = callbacks or []
226 | self.instructions = instructions
227 | self.verbosity = verbosity
228 | self.trajectory_dir = trajectory_dir
229 | self.max_retries = max_retries
230 | self.screenshot_delay = screenshot_delay
231 | self.use_prompt_caching = use_prompt_caching
232 | self.telemetry_enabled = telemetry_enabled
233 | self.kwargs = additional_generation_kwargs
234 | self.trust_remote_code = trust_remote_code
235 | self.api_key = api_key
236 | self.api_base = api_base
237 |
238 | # == Add built-in callbacks ==
239 |
240 | # Prepend operator normalizer callback
241 | self.callbacks.insert(0, OperatorNormalizerCallback())
242 |
243 | # Add prompt instructions callback if provided
244 | if self.instructions:
245 | self.callbacks.append(PromptInstructionsCallback(self.instructions))
246 |
247 | # Add logging callback if verbosity is set
248 | if self.verbosity is not None:
249 | self.callbacks.append(LoggingCallback(level=self.verbosity))
250 |
251 | # Add image retention callback if only_n_most_recent_images is set
252 | if self.only_n_most_recent_images:
253 | self.callbacks.append(ImageRetentionCallback(self.only_n_most_recent_images))
254 |
255 | # Add trajectory saver callback if trajectory_dir is set
256 | if self.trajectory_dir:
257 | if isinstance(self.trajectory_dir, dict):
258 | self.callbacks.append(TrajectorySaverCallback(**self.trajectory_dir))
259 | elif isinstance(self.trajectory_dir, (str, Path)):
260 | self.callbacks.append(TrajectorySaverCallback(str(self.trajectory_dir)))
261 |
262 | # Add budget manager if max_trajectory_budget is set
263 | if max_trajectory_budget:
264 | if isinstance(max_trajectory_budget, dict):
265 | self.callbacks.append(BudgetManagerCallback(**max_trajectory_budget))
266 | else:
267 | self.callbacks.append(BudgetManagerCallback(max_trajectory_budget))
268 |
269 | # == Enable local model providers w/ LiteLLM ==
270 |
271 | # Register local model providers
272 | hf_adapter = HuggingFaceLocalAdapter(
273 | device="auto", trust_remote_code=self.trust_remote_code or False
274 | )
275 | human_adapter = HumanAdapter()
276 | mlx_adapter = MLXVLMAdapter()
277 | cua_adapter = CUAAdapter()
278 | azure_ml_adapter = AzureMLAdapter()
279 | litellm.custom_provider_map = [
280 | {"provider": "huggingface-local", "custom_handler": hf_adapter},
281 | {"provider": "human", "custom_handler": human_adapter},
282 | {"provider": "mlx", "custom_handler": mlx_adapter},
283 | {"provider": "cua", "custom_handler": cua_adapter},
284 | {"provider": "azure_ml", "custom_handler": azure_ml_adapter},
285 | ]
286 | litellm.suppress_debug_info = True
287 |
288 | # == Initialize computer agent ==
289 |
290 | # Find the appropriate agent loop
291 | if custom_loop:
292 | self.agent_loop = custom_loop
293 | self.agent_config_info = None
294 | else:
295 | config_info = find_agent_config(model)
296 | if not config_info:
297 | raise ValueError(f"No agent config found for model: {model}")
298 | # Instantiate the agent config class
299 | self.agent_loop = config_info.agent_class()
300 | self.agent_config_info = config_info
301 |
302 | # Add telemetry callback AFTER agent_loop is set so it can capture the correct agent_type
303 | if self.telemetry_enabled:
304 | if isinstance(self.telemetry_enabled, bool):
305 | self.callbacks.append(TelemetryCallback(self))
306 | else:
307 | self.callbacks.append(TelemetryCallback(self, **self.telemetry_enabled))
308 |
309 | self.tool_schemas = []
310 | self.computer_handler = None
311 |
312 | async def _initialize_computers(self):
313 | """Initialize computer objects"""
314 | if not self.tool_schemas:
315 | # Process tools and create tool schemas
316 | self.tool_schemas = self._process_tools()
317 |
318 | # Find computer tool and create interface adapter
319 | computer_handler = None
320 |
321 | # First check if any tool is a BaseComputerTool instance
322 | for tool in self.tools:
323 | if isinstance(tool, BaseComputerTool):
324 | computer_handler = tool
325 | break
326 |
327 | # If no BaseComputerTool found, look for traditional computer objects
328 | if computer_handler is None:
329 | for schema in self.tool_schemas:
330 | if schema["type"] == "computer":
331 | computer_handler = await make_computer_handler(schema["computer"])
332 | break
333 |
334 | self.computer_handler = computer_handler
335 |
336 | def _process_input(self, input: Messages) -> List[Dict[str, Any]]:
337 | """Process input messages and create schemas for the agent loop"""
338 | if isinstance(input, str):
339 | return [{"role": "user", "content": input}]
340 | return [get_json(msg) for msg in input]
341 |
342 | def _process_tools(self) -> List[Dict[str, Any]]:
343 | """Process tools and create schemas for the agent loop"""
344 | schemas = []
345 |
346 | for tool in self.tools:
347 | # Check if it's a computer object (has interface attribute)
348 | if is_agent_computer(tool):
349 | # This is a computer tool - will be handled by agent loop
350 | schemas.append({"type": "computer", "computer": tool})
351 | elif isinstance(tool, BaseTool):
352 | # BaseTool instance - extract schema from its properties
353 | function_schema = {
354 | "name": tool.name,
355 | "description": tool.description,
356 | "parameters": tool.parameters,
357 | }
358 | schemas.append({"type": "function", "function": function_schema})
359 | elif callable(tool):
360 | # Use litellm.utils.function_to_dict to extract schema from docstring
361 | try:
362 | function_schema = litellm.utils.function_to_dict(tool)
363 | schemas.append({"type": "function", "function": function_schema})
364 | except Exception as e:
365 | print(f"Warning: Could not process tool {tool}: {e}")
366 | else:
367 | print(f"Warning: Unknown tool type: {tool}")
368 |
369 | return schemas
370 |
371 | def _get_tool(self, name: str) -> Optional[Union[Callable, BaseTool]]:
372 | """Get a tool by name"""
373 | for tool in self.tools:
374 | # Check if it's a BaseTool instance
375 | if isinstance(tool, BaseTool) and tool.name == name:
376 | return tool
377 | # Check if it's a regular callable
378 | elif hasattr(tool, "__name__") and tool.__name__ == name:
379 | return tool
380 | elif hasattr(tool, "func") and tool.func.__name__ == name:
381 | return tool
382 | return None
383 |
384 | # ============================================================================
385 | # AGENT RUN LOOP LIFECYCLE HOOKS
386 | # ============================================================================
387 |
388 | async def _on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
389 | """Initialize run tracking by calling callbacks."""
390 | for callback in self.callbacks:
391 | if hasattr(callback, "on_run_start"):
392 | await callback.on_run_start(kwargs, old_items)
393 |
394 | async def _on_run_end(
395 | self,
396 | kwargs: Dict[str, Any],
397 | old_items: List[Dict[str, Any]],
398 | new_items: List[Dict[str, Any]],
399 | ) -> None:
400 | """Finalize run tracking by calling callbacks."""
401 | for callback in self.callbacks:
402 | if hasattr(callback, "on_run_end"):
403 | await callback.on_run_end(kwargs, old_items, new_items)
404 |
405 | async def _on_run_continue(
406 | self,
407 | kwargs: Dict[str, Any],
408 | old_items: List[Dict[str, Any]],
409 | new_items: List[Dict[str, Any]],
410 | ) -> bool:
411 | """Check if run should continue by calling callbacks."""
412 | for callback in self.callbacks:
413 | if hasattr(callback, "on_run_continue"):
414 | should_continue = await callback.on_run_continue(kwargs, old_items, new_items)
415 | if not should_continue:
416 | return False
417 | return True
418 |
419 | async def _on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
420 | """Prepare messages for the LLM call by applying callbacks."""
421 | result = messages
422 | for callback in self.callbacks:
423 | if hasattr(callback, "on_llm_start"):
424 | result = await callback.on_llm_start(result)
425 | return result
426 |
427 | async def _on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
428 | """Postprocess messages after the LLM call by applying callbacks."""
429 | result = messages
430 | for callback in self.callbacks:
431 | if hasattr(callback, "on_llm_end"):
432 | result = await callback.on_llm_end(result)
433 | return result
434 |
435 | async def _on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
436 | """Called when responses are received."""
437 | for callback in self.callbacks:
438 | if hasattr(callback, "on_responses"):
439 | await callback.on_responses(get_json(kwargs), get_json(responses))
440 |
441 | async def _on_computer_call_start(self, item: Dict[str, Any]) -> None:
442 | """Called when a computer call is about to start."""
443 | for callback in self.callbacks:
444 | if hasattr(callback, "on_computer_call_start"):
445 | await callback.on_computer_call_start(get_json(item))
446 |
447 | async def _on_computer_call_end(
448 | self, item: Dict[str, Any], result: List[Dict[str, Any]]
449 | ) -> None:
450 | """Called when a computer call has completed."""
451 | for callback in self.callbacks:
452 | if hasattr(callback, "on_computer_call_end"):
453 | await callback.on_computer_call_end(get_json(item), get_json(result))
454 |
455 | async def _on_function_call_start(self, item: Dict[str, Any]) -> None:
456 | """Called when a function call is about to start."""
457 | for callback in self.callbacks:
458 | if hasattr(callback, "on_function_call_start"):
459 | await callback.on_function_call_start(get_json(item))
460 |
461 | async def _on_function_call_end(
462 | self, item: Dict[str, Any], result: List[Dict[str, Any]]
463 | ) -> None:
464 | """Called when a function call has completed."""
465 | for callback in self.callbacks:
466 | if hasattr(callback, "on_function_call_end"):
467 | await callback.on_function_call_end(get_json(item), get_json(result))
468 |
469 | async def _on_text(self, item: Dict[str, Any]) -> None:
470 | """Called when a text message is encountered."""
471 | for callback in self.callbacks:
472 | if hasattr(callback, "on_text"):
473 | await callback.on_text(get_json(item))
474 |
475 | async def _on_api_start(self, kwargs: Dict[str, Any]) -> None:
476 | """Called when an LLM API call is about to start."""
477 | for callback in self.callbacks:
478 | if hasattr(callback, "on_api_start"):
479 | await callback.on_api_start(get_json(kwargs))
480 |
481 | async def _on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
482 | """Called when an LLM API call has completed."""
483 | for callback in self.callbacks:
484 | if hasattr(callback, "on_api_end"):
485 | await callback.on_api_end(get_json(kwargs), get_json(result))
486 |
487 | async def _on_usage(self, usage: Dict[str, Any]) -> None:
488 | """Called when usage information is received."""
489 | for callback in self.callbacks:
490 | if hasattr(callback, "on_usage"):
491 | await callback.on_usage(get_json(usage))
492 |
493 | async def _on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
494 | """Called when a screenshot is taken."""
495 | for callback in self.callbacks:
496 | if hasattr(callback, "on_screenshot"):
497 | await callback.on_screenshot(screenshot, name)
498 |
499 | # ============================================================================
500 | # AGENT OUTPUT PROCESSING
501 | # ============================================================================
502 |
    async def _handle_item(
        self,
        item: Any,
        computer: Optional[AsyncComputerHandler] = None,
        ignore_call_ids: Optional[List[str]] = None,
    ) -> List[Dict[str, Any]]:
        """Execute one model output item and return its output messages.

        - "message" items only trigger the on_text callbacks; no output.
        - "computer_call" items run the action on `computer`, take a
          post-action screenshot, and return a computer_call_output.
        - "function_call" items invoke the matching tool and return a
          function_call_output.
        ToolError is converted into an error output item; any other
        exception propagates to the caller. Items whose call_id appears in
        `ignore_call_ids` (already answered) are skipped.
        """
        call_id = item.get("call_id")
        if ignore_call_ids and call_id and call_id in ignore_call_ids:
            return []

        item_type = item.get("type", None)

        if item_type == "message":
            await self._on_text(item)
            # # Print messages
            # if item.get("content"):
            #     for content_item in item.get("content"):
            #         if content_item.get("text"):
            #             print(content_item.get("text"))
            return []

        try:
            if item_type == "computer_call":
                await self._on_computer_call_start(item)
                if not computer:
                    raise ValueError("Computer handler is required for computer calls")

                # Perform computer actions
                action = item.get("action")
                action_type = action.get("type")
                if action_type is None:
                    print(
                        f"Action type cannot be `None`: action={action}, action_type={action_type}"
                    )
                    return []

                # Extract action arguments (all fields except 'type')
                action_args = {k: v for k, v in action.items() if k != "type"}

                # print(f"{action_type}({action_args})")

                # Execute the computer action: the action type names a method
                # on the computer handler (e.g. click, type, scroll).
                computer_method = getattr(computer, action_type, None)
                if computer_method:
                    assert_callable_with(computer_method, **action_args)
                    await computer_method(**action_args)
                else:
                    raise ToolError(f"Unknown computer action: {action_type}")

                # Take screenshot after action (optionally after a settle delay)
                if self.screenshot_delay and self.screenshot_delay > 0:
                    await asyncio.sleep(self.screenshot_delay)
                screenshot_base64 = await computer.screenshot()
                await self._on_screenshot(screenshot_base64, "screenshot_after")

                # Handle safety checks — currently auto-acknowledged.
                pending_checks = item.get("pending_safety_checks", [])
                acknowledged_checks = []
                for check in pending_checks:
                    check_message = check.get("message", str(check))
                    acknowledged_checks.append(check)
                    # TODO: implement a callback for safety checks
                    # if acknowledge_safety_check_callback(check_message, allow_always=True):
                    #     acknowledged_checks.append(check)
                    # else:
                    #     raise ValueError(f"Safety check failed: {check_message}")

                # Create call output
                call_output = {
                    "type": "computer_call_output",
                    "call_id": item.get("call_id"),
                    "acknowledged_safety_checks": acknowledged_checks,
                    "output": {
                        "type": "input_image",
                        "image_url": f"data:image/png;base64,{screenshot_base64}",
                    },
                }

                # # Additional URL safety checks for browser environments
                # if await computer.get_environment() == "browser":
                #     current_url = await computer.get_current_url()
                #     call_output["output"]["current_url"] = current_url
                #     # TODO: implement a callback for URL safety checks
                #     # check_blocklisted_url(current_url)

                result = [call_output]
                await self._on_computer_call_end(item, result)
                return result

            if item_type == "function_call":
                await self._on_function_call_start(item)
                # Perform function call
                function = self._get_tool(item.get("name"))
                if not function:
                    raise ToolError(f"Function {item.get('name')} not found")

                args = json.loads(item.get("arguments"))

                # Handle BaseTool instances
                if isinstance(function, BaseTool):
                    # BaseTool.call() handles its own execution
                    result = function.call(args)
                else:
                    # Validate arguments before execution for regular callables
                    assert_callable_with(function, **args)

                    # Execute function - use asyncio.to_thread for non-async functions
                    if inspect.iscoroutinefunction(function):
                        result = await function(**args)
                    else:
                        result = await asyncio.to_thread(function, **args)

                # Create function call output (stringified result)
                call_output = {
                    "type": "function_call_output",
                    "call_id": item.get("call_id"),
                    "output": str(result),
                }

                result = [call_output]
                await self._on_function_call_end(item, result)
                return result
        except ToolError as e:
            # Tool-level failures become error output items so the loop can continue.
            return [make_tool_error_item(repr(e), call_id)]

        # Unknown item types produce no output.
        return []
630 |
631 | # ============================================================================
632 | # MAIN AGENT LOOP
633 | # ============================================================================
634 |
635 | async def run(
636 | self,
637 | messages: Messages,
638 | stream: bool = False,
639 | api_key: Optional[str] = None,
640 | api_base: Optional[str] = None,
641 | **additional_generation_kwargs,
642 | ) -> AsyncGenerator[Dict[str, Any], None]:
643 | """
644 | Run the agent with the given messages using Computer protocol handler pattern.
645 |
646 | Args:
647 | messages: List of message dictionaries
648 | stream: Whether to stream the response
649 | api_key: Optional API key override for the model provider
650 | api_base: Optional API base URL override for the model provider
651 | **additional_generation_kwargs: Additional arguments passed to the model provider
652 |
653 | Returns:
654 | AsyncGenerator that yields response chunks
655 | """
656 | if not self.agent_config_info:
657 | raise ValueError("Agent configuration not found")
658 |
659 | capabilities = self.get_capabilities()
660 | if "step" not in capabilities:
661 | raise ValueError(
662 | f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions"
663 | )
664 |
665 | await self._initialize_computers()
666 |
667 | # Merge kwargs and thread api credentials (run overrides constructor)
668 | merged_kwargs = {**self.kwargs, **additional_generation_kwargs}
669 | if (api_key is not None) or (self.api_key is not None):
670 | merged_kwargs["api_key"] = api_key if api_key is not None else self.api_key
671 | if (api_base is not None) or (self.api_base is not None):
672 | merged_kwargs["api_base"] = api_base if api_base is not None else self.api_base
673 |
674 | old_items = self._process_input(messages)
675 | new_items = []
676 |
677 | # Initialize run tracking
678 | run_kwargs = {
679 | "messages": messages,
680 | "stream": stream,
681 | "model": self.model,
682 | "agent_loop": self.agent_config_info.agent_class.__name__,
683 | **merged_kwargs,
684 | }
685 | await self._on_run_start(run_kwargs, old_items)
686 |
687 | while new_items[-1].get("role") != "assistant" if new_items else True:
688 | # Lifecycle hook: Check if we should continue based on callbacks (e.g., budget manager)
689 | should_continue = await self._on_run_continue(run_kwargs, old_items, new_items)
690 | if not should_continue:
691 | break
692 |
693 | # Lifecycle hook: Prepare messages for the LLM call
694 | # Use cases:
695 | # - PII anonymization
696 | # - Image retention policy
697 | combined_messages = old_items + new_items
698 | combined_messages = replace_failed_computer_calls_with_function_calls(combined_messages)
699 | preprocessed_messages = await self._on_llm_start(combined_messages)
700 |
701 | loop_kwargs = {
702 | "messages": preprocessed_messages,
703 | "model": self.model,
704 | "tools": self.tool_schemas,
705 | "stream": False,
706 | "computer_handler": self.computer_handler,
707 | "max_retries": self.max_retries,
708 | "use_prompt_caching": self.use_prompt_caching,
709 | **merged_kwargs,
710 | }
711 |
712 | # Run agent loop iteration
713 | result = await self.agent_loop.predict_step(
714 | **loop_kwargs,
715 | _on_api_start=self._on_api_start,
716 | _on_api_end=self._on_api_end,
717 | _on_usage=self._on_usage,
718 | _on_screenshot=self._on_screenshot,
719 | )
720 | result = get_json(result)
721 |
722 | # Lifecycle hook: Postprocess messages after the LLM call
723 | # Use cases:
724 | # - PII deanonymization (if you want tool calls to see PII)
725 | result["output"] = await self._on_llm_end(result.get("output", []))
726 | await self._on_responses(loop_kwargs, result)
727 |
728 | # Yield agent response
729 | yield result
730 |
731 | # Add agent response to new_items
732 | new_items += result.get("output")
733 |
734 | # Get output call ids
735 | output_call_ids = get_output_call_ids(result.get("output", []))
736 |
737 | # Handle computer actions
738 | for item in result.get("output"):
739 | partial_items = await self._handle_item(
740 | item, self.computer_handler, ignore_call_ids=output_call_ids
741 | )
742 | new_items += partial_items
743 |
744 | # Yield partial response
745 | yield {
746 | "output": partial_items,
747 | "usage": Usage(
748 | prompt_tokens=0,
749 | completion_tokens=0,
750 | total_tokens=0,
751 | ),
752 | }
753 |
754 | await self._on_run_end(loop_kwargs, old_items, new_items)
755 |
756 | async def predict_click(
757 | self, instruction: str, image_b64: Optional[str] = None
758 | ) -> Optional[Tuple[int, int]]:
759 | """
760 | Predict click coordinates based on image and instruction.
761 |
762 | Args:
763 | instruction: Instruction for where to click
764 | image_b64: Base64 encoded image (optional, will take screenshot if not provided)
765 |
766 | Returns:
767 | None or tuple with (x, y) coordinates
768 | """
769 | if not self.agent_config_info:
770 | raise ValueError("Agent configuration not found")
771 |
772 | capabilities = self.get_capabilities()
773 | if "click" not in capabilities:
774 | raise ValueError(
775 | f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions"
776 | )
777 | if hasattr(self.agent_loop, "predict_click"):
778 | if not image_b64:
779 | if not self.computer_handler:
780 | raise ValueError("Computer tool or image_b64 is required for predict_click")
781 | image_b64 = await self.computer_handler.screenshot()
782 | # Pass along api credentials if available
783 | click_kwargs: Dict[str, Any] = {}
784 | if self.api_key is not None:
785 | click_kwargs["api_key"] = self.api_key
786 | if self.api_base is not None:
787 | click_kwargs["api_base"] = self.api_base
788 | return await self.agent_loop.predict_click(
789 | model=self.model, image_b64=image_b64, instruction=instruction, **click_kwargs
790 | )
791 | return None
792 |
793 | def get_capabilities(self) -> List[AgentCapability]:
794 | """
795 | Get list of capabilities supported by the current agent config.
796 |
797 | Returns:
798 | List of capability strings (e.g., ["step", "click"])
799 | """
800 | if not self.agent_config_info:
801 | raise ValueError("Agent configuration not found")
802 |
803 | if hasattr(self.agent_loop, "get_capabilities"):
804 | return self.agent_loop.get_capabilities()
805 | return ["step"] # Default capability
806 |
807 | def open(self, port: Optional[int] = None):
808 | """
809 | Start the playground server and open it in the browser.
810 |
811 | This method starts a local HTTP server that exposes the /responses endpoint
812 | and automatically opens the CUA playground interface in the default browser.
813 |
814 | Args:
815 | port: Port to run the server on. If None, finds an available port automatically.
816 |
817 | Example:
818 | >>> agent = ComputerAgent(model="claude-sonnet-4")
819 | >>> agent.open() # Starts server and opens browser
820 | """
821 | from .playground import PlaygroundServer
822 |
823 | server = PlaygroundServer(agent_instance=self)
824 | server.start(port=port, open_browser=True)
825 |
```