This is page 16 of 20. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .all-contributorsrc
├── .cursorignore
├── .devcontainer
│   ├── devcontainer.json
│   ├── post-install.sh
│   └── README.md
├── .dockerignore
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── ci-lume.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-pylume.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       └── test-validation-script.yml
├── .gitignore
├── .vscode
│   ├── docs.code-workspace
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   └── py.code-workspace
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── composite-agents.md
│   ├── cua-hackathon.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .gitignore
│   ├── .prettierrc
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   └── meta.json
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── computer-sdk
│   │       │   ├── commands.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── meta.json
│   │       │   └── sandboxed-python.mdx
│   │       ├── index.mdx
│   │       ├── libraries
│   │       │   ├── agent
│   │       │   │   └── index.mdx
│   │       │   ├── computer
│   │       │   │   └── index.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── core
│   │       │   │   └── index.mdx
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   └── som
│   │       │       ├── configuration.mdx
│   │       │       └── index.mdx
│   │       ├── meta.json
│   │       ├── quickstart-cli.mdx
│   │       ├── quickstart-devs.mdx
│   │       └── telemetry.mdx
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   └── llms.txt
│   │   │       └── route.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── iou.tsx
│   │   │   └── mermaid.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   └── mdx-components.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── .prettierrc
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   └── uitars.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer-server
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   └── test_connection.py
│   │   ├── core
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── mcp-server
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   └── server.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── scripts
│   │   │       ├── install_mcp_server.sh
│   │   │       └── start_mcp_server.sh
│   │   ├── pylume
│   │   │   ├── __init__.py
│   │   │   ├── pylume
│   │   │   │   ├── __init__.py
│   │   │   │   ├── client.py
│   │   │   │   ├── exceptions.py
│   │   │   │   ├── lume
│   │   │   │   ├── models.py
│   │   │   │   ├── pylume.py
│   │   │   │   └── server.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   └── som
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           └── test_omniparser.py
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── biome.json
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Dockerfile
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── pylume_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── pdm.lock
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── samples
│   └── community
│       ├── global-online
│       │   └── README.md
│       └── hack-the-north
│           └── README.md
├── scripts
│   ├── build-uv.sh
│   ├── build.ps1
│   ├── build.sh
│   ├── cleanup.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   └── run-docker-dev.sh
└── tests
    ├── pytest.ini
    ├── shell_cmd.py
    ├── test_files.py
    ├── test_shell_bash.py
    ├── test_telemetry.py
    ├── test_venv.py
    └── test_watchdog.py
```

# Files

--------------------------------------------------------------------------------
/libs/python/agent/agent/responses.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | Functions for making various Responses API items from different types of responses.
  3 | Based on the OpenAI spec for Responses API items.
  4 | """
  5 | 
  6 | import base64
  7 | import json
  8 | import uuid
  9 | from typing import List, Dict, Any, Literal, Union, Optional
 10 | 
 11 | from openai.types.responses.response_computer_tool_call_param import (
 12 |     ResponseComputerToolCallParam, 
 13 |     ActionClick,
 14 |     ActionDoubleClick,
 15 |     ActionDrag,
 16 |     ActionDragPath,
 17 |     ActionKeypress,
 18 |     ActionMove,
 19 |     ActionScreenshot,
 20 |     ActionScroll,
 21 |     ActionType as ActionTypeAction,
 22 |     ActionWait,
 23 |     PendingSafetyCheck
 24 | )
 25 | 
 26 | from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
 27 | from openai.types.responses.response_output_text_param import ResponseOutputTextParam
 28 | from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
 29 | from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
 30 | from openai.types.responses.easy_input_message_param import EasyInputMessageParam
 31 | from openai.types.responses.response_input_image_param import ResponseInputImageParam
 32 | 
 33 | def random_id():
 34 |     return str(uuid.uuid4())
 35 | 
 36 | # User message items
 37 | def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
 38 |     return EasyInputMessageParam(
 39 |         content=[
 40 |             ResponseInputImageParam(
 41 |                 type="input_image",
 42 |                 image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}"
 43 |             ) # type: ignore
 44 |         ],
 45 |         role="user",
 46 |         type="message"
 47 |     )
 48 | 
 49 | # Text items
 50 | def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
 51 |     return ResponseReasoningItemParam(
 52 |         id=random_id(),
 53 |         summary=[
 54 |             Summary(text=reasoning, type="summary_text")
 55 |         ],
 56 |         type="reasoning"
 57 |     )
 58 | 
 59 | def make_output_text_item(content: str) -> ResponseOutputMessageParam:
 60 |     return ResponseOutputMessageParam(
 61 |         id=random_id(),
 62 |         content=[
 63 |             ResponseOutputTextParam(
 64 |                 text=content,
 65 |                 type="output_text",
 66 |                 annotations=[]
 67 |             )
 68 |         ],
 69 |         role="assistant",
 70 |         status="completed",
 71 |         type="message"
 72 |     )
 73 | 
 74 | # Function call items
 75 | def make_function_call_item(function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None) -> ResponseFunctionToolCallParam:
 76 |     return ResponseFunctionToolCallParam(
 77 |         id=random_id(),
 78 |         call_id=call_id if call_id else random_id(),
 79 |         name=function_name,
 80 |         arguments=json.dumps(arguments),
 81 |         status="completed",
 82 |         type="function_call"
 83 |     )
 84 | 
 85 | # Computer tool call items
 86 | def make_click_item(x: int, y: int, button: Literal["left", "right", "wheel", "back", "forward"] = "left", call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
 87 |     return ResponseComputerToolCallParam(
 88 |         id=random_id(),
 89 |         call_id=call_id if call_id else random_id(),
 90 |         action=ActionClick(
 91 |             button=button,
 92 |             type="click",
 93 |             x=x,
 94 |             y=y
 95 |         ),
 96 |         pending_safety_checks=[],
 97 |         status="completed",
 98 |         type="computer_call"
 99 |     )
100 | 
101 | def make_double_click_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
102 |     return ResponseComputerToolCallParam(
103 |         id=random_id(),
104 |         call_id=call_id if call_id else random_id(),
105 |         action=ActionDoubleClick(
106 |             type="double_click",
107 |             x=x,
108 |             y=y
109 |         ),
110 |         pending_safety_checks=[],
111 |         status="completed",
112 |         type="computer_call"
113 |     )
114 | 
115 | def make_drag_item(path: List[Dict[str, int]], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
116 |     drag_path = [ActionDragPath(x=point["x"], y=point["y"]) for point in path]
117 |     return ResponseComputerToolCallParam(
118 |         id=random_id(),
119 |         call_id=call_id if call_id else random_id(),
120 |         action=ActionDrag(
121 |             path=drag_path,
122 |             type="drag"
123 |         ),
124 |         pending_safety_checks=[],
125 |         status="completed",
126 |         type="computer_call"
127 |     )
128 | 
129 | def make_keypress_item(keys: List[str], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
130 |     return ResponseComputerToolCallParam(
131 |         id=random_id(),
132 |         call_id=call_id if call_id else random_id(),
133 |         action=ActionKeypress(
134 |             keys=keys,
135 |             type="keypress"
136 |         ),
137 |         pending_safety_checks=[],
138 |         status="completed",
139 |         type="computer_call"
140 |     )
141 | 
142 | def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
143 |     return ResponseComputerToolCallParam(
144 |         id=random_id(),
145 |         call_id=call_id if call_id else random_id(),
146 |         action=ActionMove(
147 |             type="move",
148 |             x=x,
149 |             y=y
150 |         ),
151 |         pending_safety_checks=[],
152 |         status="completed",
153 |         type="computer_call"
154 |     )
155 | 
156 | def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
157 |     return ResponseComputerToolCallParam(
158 |         id=random_id(),
159 |         call_id=call_id if call_id else random_id(),
160 |         action=ActionScreenshot(
161 |             type="screenshot"
162 |         ),
163 |         pending_safety_checks=[],
164 |         status="completed",
165 |         type="computer_call"
166 |     )
167 | 
168 | def make_scroll_item(x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
169 |     return ResponseComputerToolCallParam(
170 |         id=random_id(),
171 |         call_id=call_id if call_id else random_id(),
172 |         action=ActionScroll(
173 |             scroll_x=scroll_x,
174 |             scroll_y=scroll_y,
175 |             type="scroll",
176 |             x=x,
177 |             y=y
178 |         ),
179 |         pending_safety_checks=[],
180 |         status="completed",
181 |         type="computer_call"
182 |     )
183 | 
184 | def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
185 |     return ResponseComputerToolCallParam(
186 |         id=random_id(),
187 |         call_id=call_id if call_id else random_id(),
188 |         action=ActionTypeAction(
189 |             text=text,
190 |             type="type"
191 |         ),
192 |         pending_safety_checks=[],
193 |         status="completed",
194 |         type="computer_call"
195 |     )
196 | 
197 | def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
198 |     return ResponseComputerToolCallParam(
199 |         id=random_id(),
200 |         call_id=call_id if call_id else random_id(),
201 |         action=ActionWait(
202 |             type="wait"
203 |         ),
204 |         pending_safety_checks=[],
205 |         status="completed",
206 |         type="computer_call"
207 |     )
208 | 
209 | # Extra anthropic computer calls
210 | def make_left_mouse_down_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
211 |     return {
212 |         "id": random_id(),
213 |         "call_id": call_id if call_id else random_id(),
214 |         "action": {
215 |             "type": "left_mouse_down",
216 |             "x": x,
217 |             "y": y
218 |         },
219 |         "pending_safety_checks": [],
220 |         "status": "completed",
221 |         "type": "computer_call"
222 |     }
223 | 
224 | def make_left_mouse_up_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
225 |     return {
226 |         "id": random_id(),
227 |         "call_id": call_id if call_id else random_id(),
228 |         "action": {
229 |             "type": "left_mouse_up",
230 |             "x": x,
231 |             "y": y
232 |         },
233 |         "pending_safety_checks": [],
234 |         "status": "completed",
235 |         "type": "computer_call"
236 |     }
237 | 
238 | def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None) -> List[Dict[str, Any]]:
239 |     call_id = call_id if call_id else random_id()
240 |     return [
241 |         {
242 |             "type": "function_call",
243 |             "id": random_id(),
244 |             "call_id": call_id,
245 |             "name": tool_name,
246 |             "arguments": json.dumps(tool_kwargs),
247 |         },
248 |         {
249 |             "type": "function_call_output",
250 |             "call_id": call_id,
251 |             "output": json.dumps({"error": error_message}),
252 |         }
253 |     ]
254 | 
255 | def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
256 |     call_id = call_id if call_id else random_id()
257 |     return {
258 |         "type": "function_call_output",
259 |         "call_id": call_id,
260 |         "output": json.dumps({"error": error_message}),
261 |     }
262 | 
263 | def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
264 |     """
265 |     Replace computer_call items with function_call items if they share a call_id with a function_call_output.
266 |     This indicates the computer call failed and should be treated as a function call instead.
267 |     We do this because the computer_call_output items do not support text output.
268 |     
269 |     Args:
270 |         messages: List of message items to process
271 |     """
272 |     messages = messages.copy()
273 | 
274 |     # Find all call_ids that have function_call_output items
275 |     failed_call_ids = set()
276 |     for msg in messages:
277 |         if msg.get("type") == "function_call_output":
278 |             call_id = msg.get("call_id")
279 |             if call_id:
280 |                 failed_call_ids.add(call_id)
281 |     
282 |     # Replace computer_call items that have matching call_ids
283 |     for i, msg in enumerate(messages):
284 |         if (msg.get("type") == "computer_call" and 
285 |             msg.get("call_id") in failed_call_ids):
286 |             
287 |             # Extract action from computer_call
288 |             action = msg.get("action", {})
289 |             call_id = msg.get("call_id")
290 |             
291 |             # Create function_call replacement
292 |             messages[i] = {
293 |                 "type": "function_call",
294 |                 "id": msg.get("id", random_id()),
295 |                 "call_id": call_id,
296 |                 "name": "computer",
297 |                 "arguments": json.dumps(action),
298 |             }
299 |     
300 |     return messages
301 | 
302 | # Conversion functions between element descriptions and coordinates
303 | def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
304 |     """
305 |     Convert computer calls from element descriptions to x,y coordinates.
306 |     
307 |     Args:
308 |         responses_items: List of response items containing computer calls with element_description
309 |         desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
310 |         
311 |     Returns:
312 |         List of response items with element_description replaced by x,y coordinates
313 |     """
314 |     converted_items = []
315 |     
316 |     for item in responses_items:
317 |         if item.get("type") == "computer_call" and "action" in item:
318 |             action = item["action"].copy()
319 |             
320 |             # Handle single element_description
321 |             if "element_description" in action:
322 |                 desc = action["element_description"]
323 |                 if desc in desc2xy:
324 |                     x, y = desc2xy[desc]
325 |                     action["x"] = x
326 |                     action["y"] = y
327 |                     del action["element_description"]
328 |             
329 |             # Handle start_element_description and end_element_description for drag operations
330 |             elif "start_element_description" in action and "end_element_description" in action:
331 |                 start_desc = action["start_element_description"]
332 |                 end_desc = action["end_element_description"]
333 |                 
334 |                 if start_desc in desc2xy and end_desc in desc2xy:
335 |                     start_x, start_y = desc2xy[start_desc]
336 |                     end_x, end_y = desc2xy[end_desc]
337 |                     action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
338 |                     del action["start_element_description"]
339 |                     del action["end_element_description"]
340 |             
341 |             converted_item = item.copy()
342 |             converted_item["action"] = action
343 |             converted_items.append(converted_item)
344 |         else:
345 |             converted_items.append(item)
346 |     
347 |     return converted_items
348 | 
349 | 
350 | def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
351 |     """
352 |     Convert computer calls from x,y coordinates to element descriptions.
353 |     
354 |     Args:
355 |         responses_items: List of response items containing computer calls with x,y coordinates
356 |         desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
357 |         
358 |     Returns:
359 |         List of response items with x,y coordinates replaced by element_description
360 |     """
361 |     # Create reverse mapping from coordinates to descriptions
362 |     xy2desc = {coords: desc for desc, coords in desc2xy.items()}
363 |     
364 |     converted_items = []
365 |     
366 |     for item in responses_items:
367 |         if item.get("type") == "computer_call" and "action" in item:
368 |             action = item["action"].copy()
369 |             
370 |             # Handle single x,y coordinates
371 |             if "x" in action and "y" in action:
372 |                 coords = (action["x"], action["y"])
373 |                 if coords in xy2desc:
374 |                     action["element_description"] = xy2desc[coords]
375 |                     del action["x"]
376 |                     del action["y"]
377 |             
378 |             # Handle path for drag operations
379 |             elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
380 |                 start_point = action["path"][0]
381 |                 end_point = action["path"][1]
382 |                 
383 |                 if ("x" in start_point and "y" in start_point and 
384 |                     "x" in end_point and "y" in end_point):
385 |                     
386 |                     start_coords = (start_point["x"], start_point["y"])
387 |                     end_coords = (end_point["x"], end_point["y"])
388 |                     
389 |                     if start_coords in xy2desc and end_coords in xy2desc:
390 |                         action["start_element_description"] = xy2desc[start_coords]
391 |                         action["end_element_description"] = xy2desc[end_coords]
392 |                         del action["path"]
393 |             
394 |             converted_item = item.copy()
395 |             converted_item["action"] = action
396 |             converted_items.append(converted_item)
397 |         else:
398 |             converted_items.append(item)
399 |     
400 |     return converted_items
401 | 
402 | 
403 | def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
404 |     """
405 |     Extract all element descriptions from computer calls in responses items.
406 |     
407 |     Args:
408 |         responses_items: List of response items containing computer calls
409 |         
410 |     Returns:
411 |         List of unique element descriptions found in computer calls
412 |     """
413 |     descriptions = set()
414 |     
415 |     for item in responses_items:
416 |         if item.get("type") == "computer_call" and "action" in item:
417 |             action = item["action"]
418 |             
419 |             # Handle single element_description
420 |             if "element_description" in action:
421 |                 descriptions.add(action["element_description"])
422 |             
423 |             # Handle start_element_description and end_element_description for drag operations
424 |             if "start_element_description" in action:
425 |                 descriptions.add(action["start_element_description"])
426 |             
427 |             if "end_element_description" in action:
428 |                 descriptions.add(action["end_element_description"])
429 |     
430 |     return list(descriptions)
431 | 
432 | 
433 | # Conversion functions between responses_items and completion messages formats
434 | def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]], allow_images_in_tool_results: bool = True) -> List[Dict[str, Any]]:
435 |     """Convert responses_items message format to liteLLM completion format.
436 |     
437 |     Args:
438 |         messages: List of responses_items format messages
439 |         allow_images_in_tool_results: If True, include images in tool role messages.
440 |                                     If False, send tool message + separate user message with image.
441 |     """
442 |     completion_messages = []
443 |     
444 |     for message in messages:
445 |         msg_type = message.get("type")
446 |         role = message.get("role")
447 |         
448 |         # Handle user messages (both with and without explicit type)
449 |         if role == "user" or msg_type == "user":
450 |             content = message.get("content", "")
451 |             if isinstance(content, list):
452 |                 # Handle list content (images, text blocks)
453 |                 completion_content = []
454 |                 for item in content:
455 |                     if item.get("type") == "input_image":
456 |                         completion_content.append({
457 |                             "type": "image_url",
458 |                             "image_url": {
459 |                                 "url": item.get("image_url")
460 |                             }
461 |                         })
462 |                     elif item.get("type") == "input_text":
463 |                         completion_content.append({
464 |                             "type": "text",
465 |                             "text": item.get("text")
466 |                         })
467 |                     elif item.get("type") == "text":
468 |                         completion_content.append({
469 |                             "type": "text",
470 |                             "text": item.get("text")
471 |                         })
472 |                 
473 |                 completion_messages.append({
474 |                     "role": "user",
475 |                     "content": completion_content
476 |                 })
477 |             elif isinstance(content, str):
478 |                 # Handle string content
479 |                 completion_messages.append({
480 |                     "role": "user",
481 |                     "content": content
482 |                 })
483 |         
484 |         # Handle assistant messages
485 |         elif role == "assistant" or msg_type == "message":
486 |             content = message.get("content", [])
487 |             if isinstance(content, list):
488 |                 text_parts = []
489 |                 for item in content:
490 |                     if item.get("type") == "output_text":
491 |                         text_parts.append(item.get("text", ""))
492 |                     elif item.get("type") == "text":
493 |                         text_parts.append(item.get("text", ""))
494 |                 
495 |                 if text_parts:
496 |                     completion_messages.append({
497 |                         "role": "assistant",
498 |                         "content": "\n".join(text_parts)
499 |                     })
500 |         
501 |         # Handle reasoning items (convert to assistant message)
502 |         elif msg_type == "reasoning":
503 |             summary = message.get("summary", [])
504 |             text_parts = []
505 |             for item in summary:
506 |                 if item.get("type") == "summary_text":
507 |                     text_parts.append(item.get("text", ""))
508 |             
509 |             if text_parts:
510 |                 completion_messages.append({
511 |                     "role": "assistant",
512 |                     "content": "\n".join(text_parts)
513 |                 })
514 |         
515 |         # Handle function calls
516 |         elif msg_type == "function_call":
517 |             # Add tool call to last assistant message or create new one
518 |             if not completion_messages or completion_messages[-1]["role"] != "assistant":
519 |                 completion_messages.append({
520 |                     "role": "assistant",
521 |                     "content": "",
522 |                     "tool_calls": []
523 |                 })
524 |             
525 |             if "tool_calls" not in completion_messages[-1]:
526 |                 completion_messages[-1]["tool_calls"] = []
527 |             
528 |             completion_messages[-1]["tool_calls"].append({
529 |                 "id": message.get("call_id"),
530 |                 "type": "function",
531 |                 "function": {
532 |                     "name": message.get("name"),
533 |                     "arguments": message.get("arguments")
534 |                 }
535 |             })
536 |         
537 |         # Handle computer calls
538 |         elif msg_type == "computer_call":
539 |             # Add tool call to last assistant message or create new one
540 |             if not completion_messages or completion_messages[-1]["role"] != "assistant":
541 |                 completion_messages.append({
542 |                     "role": "assistant",
543 |                     "content": "",
544 |                     "tool_calls": []
545 |                 })
546 |             
547 |             if "tool_calls" not in completion_messages[-1]:
548 |                 completion_messages[-1]["tool_calls"] = []
549 |             
550 |             action = message.get("action", {})
551 |             completion_messages[-1]["tool_calls"].append({
552 |                 "id": message.get("call_id"),
553 |                 "type": "function",
554 |                 "function": {
555 |                     "name": "computer",
556 |                     "arguments": json.dumps(action)
557 |                 }
558 |             })
559 |         
560 |         # Handle function/computer call outputs
561 |         elif msg_type in ["function_call_output", "computer_call_output"]:
562 |             output = message.get("output")
563 |             call_id = message.get("call_id")
564 |             
565 |             if isinstance(output, dict) and output.get("type") == "input_image":
566 |                 if allow_images_in_tool_results:
567 |                     # Handle image output as tool response (may not work with all APIs)
568 |                     completion_messages.append({
569 |                         "role": "tool",
570 |                         "tool_call_id": call_id,
571 |                         "content": [{
572 |                             "type": "image_url",
573 |                             "image_url": {
574 |                                 "url": output.get("image_url")
575 |                             }
576 |                         }]
577 |                     })
578 |                 else:
579 |                     # Send tool message + separate user message with image (OpenAI compatible)
580 |                     completion_messages += [{
581 |                         "role": "tool",
582 |                         "tool_call_id": call_id,
583 |                         "content": "[Execution completed. See screenshot below]"
584 |                     }, {
585 |                         "role": "user",
586 |                         "content": [{
587 |                             "type": "image_url",
588 |                             "image_url": {
589 |                                 "url": output.get("image_url")
590 |                             }
591 |                         }]
592 |                     }]
593 |             else:
594 |                 # Handle text output as tool response
595 |                 completion_messages.append({
596 |                     "role": "tool",
597 |                     "tool_call_id": call_id,
598 |                     "content": str(output)
599 |                 })
600 |     
601 |     return completion_messages
602 | 
603 | 
604 | def convert_completion_messages_to_responses_items(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
605 |     """Convert completion messages format to responses_items message format."""
606 |     responses_items = []
607 |     skip_next = False
608 |     
609 |     for i, message in enumerate(completion_messages):
610 |         if skip_next:
611 |             skip_next = False
612 |             continue
613 | 
614 |         role = message.get("role")
615 |         content = message.get("content")
616 |         tool_calls = message.get("tool_calls", [])
617 |         
618 |         # Handle assistant messages with text content
619 |         if role == "assistant" and content and isinstance(content, str):
620 |             responses_items.append({
621 |                 "type": "message",
622 |                 "role": "assistant",
623 |                 "content": [{
624 |                     "type": "output_text",
625 |                     "text": content
626 |                 }]
627 |             })
628 |         
629 |         # Handle tool calls
630 |         if tool_calls:
631 |             for tool_call in tool_calls:
632 |                 if tool_call.get("type") == "function":
633 |                     function = tool_call.get("function", {})
634 |                     function_name = function.get("name")
635 |                     
636 |                     if function_name == "computer":
637 |                         # Parse computer action
638 |                         try:
639 |                             action = json.loads(function.get("arguments", "{}"))
640 |                             # Change key from "action" -> "type"
641 |                             if action.get("action"):
642 |                                 action["type"] = action["action"]
643 |                                 del action["action"]
644 |                             responses_items.append({
645 |                                 "type": "computer_call",
646 |                                 "call_id": tool_call.get("id"),
647 |                                 "action": action,
648 |                                 "status": "completed"
649 |                             })
650 |                         except json.JSONDecodeError:
651 |                             # Fallback to function call format
652 |                             responses_items.append({
653 |                                 "type": "function_call",
654 |                                 "call_id": tool_call.get("id"),
655 |                                 "name": function_name,
656 |                                 "arguments": function.get("arguments", "{}"),
657 |                                 "status": "completed"
658 |                             })
659 |                     else:
660 |                         # Regular function call
661 |                         responses_items.append({
662 |                             "type": "function_call",
663 |                             "call_id": tool_call.get("id"),
664 |                             "name": function_name,
665 |                             "arguments": function.get("arguments", "{}"),
666 |                             "status": "completed"
667 |                         })
668 |         
669 |         # Handle tool messages (function/computer call outputs)
670 |         elif role == "tool" and content:
671 |             tool_call_id = message.get("tool_call_id")
672 |             if isinstance(content, str):
673 |                 # Check if this is the "[Execution completed. See screenshot below]" pattern
674 |                 if content == "[Execution completed. See screenshot below]":
675 |                     # Look ahead for the next user message with image
676 |                     next_idx = i + 1
677 |                     if (next_idx < len(completion_messages) and 
678 |                         completion_messages[next_idx].get("role") == "user" and 
679 |                         isinstance(completion_messages[next_idx].get("content"), list)):
680 |                         # Found the pattern - extract image from next message
681 |                         next_content = completion_messages[next_idx]["content"]
682 |                         for item in next_content:
683 |                             if item.get("type") == "image_url":
684 |                                 responses_items.append({
685 |                                     "type": "computer_call_output",
686 |                                     "call_id": tool_call_id,
687 |                                     "output": {
688 |                                         "type": "input_image",
689 |                                         "image_url": item.get("image_url", {}).get("url")
690 |                                     }
691 |                                 })
692 |                                 # Skip the next user message since we processed it
693 |                                 skip_next = True
694 |                                 break
695 |                     else:
696 |                         # No matching user message, treat as regular text
697 |                         responses_items.append({
698 |                             "type": "computer_call_output",
699 |                             "call_id": tool_call_id,
700 |                             "output": content
701 |                         })
702 |                 else:
703 |                     # Determine if this is a computer call or function call output
704 |                     try:
705 |                         # Try to parse as structured output
706 |                         parsed_content = json.loads(content)
707 |                         if parsed_content.get("type") == "input_image":
708 |                             responses_items.append({
709 |                                 "type": "computer_call_output",
710 |                                 "call_id": tool_call_id,
711 |                                 "output": parsed_content
712 |                             })
713 |                         else:
714 |                             responses_items.append({
715 |                                 "type": "computer_call_output",
716 |                                 "call_id": tool_call_id,
717 |                                 "output": content
718 |                             })
719 |                     except json.JSONDecodeError:
720 |                         # Plain text output - could be function or computer call
721 |                         responses_items.append({
722 |                             "type": "function_call_output",
723 |                             "call_id": tool_call_id,
724 |                             "output": content
725 |                         })
726 |             elif isinstance(content, list):
727 |                 # Handle structured content (e.g., images)
728 |                 for item in content:
729 |                     if item.get("type") == "image_url":
730 |                         responses_items.append({
731 |                             "type": "computer_call_output",
732 |                             "call_id": tool_call_id,
733 |                             "output": {
734 |                                 "type": "input_image",
735 |                                 "image_url": item.get("image_url", {}).get("url")
736 |                             }
737 |                         })
738 |                     elif item.get("type") == "text":
739 |                         responses_items.append({
740 |                             "type": "function_call_output",
741 |                             "call_id": tool_call_id,
742 |                             "output": item.get("text")
743 |                         })
744 |         
745 |         # Handle actual user messages
746 |         elif role == "user" and content:
747 |             if isinstance(content, list):
748 |                 # Handle structured user content (e.g., text + images)
749 |                 user_content = []
750 |                 for item in content:
751 |                     if item.get("type") == "image_url":
752 |                         user_content.append({
753 |                             "type": "input_image",
754 |                             "image_url": item.get("image_url", {}).get("url")
755 |                         })
756 |                     elif item.get("type") == "text":
757 |                         user_content.append({
758 |                             "type": "input_text",
759 |                             "text": item.get("text")
760 |                         })
761 |                 
762 |                 if user_content:
763 |                     responses_items.append({
764 |                         "role": "user",
765 |                         "type": "message",
766 |                         "content": user_content
767 |                     })
768 |             elif isinstance(content, str):
769 |                 # Handle simple text user message
770 |                 responses_items.append({
771 |                     "role": "user",
772 |                     "content": content
773 |                 })
774 |     
775 |     return responses_items
776 | 
```
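
The conversion logic above maps chat-completions tool results into Responses-style items: `image_url` parts become `computer_call_output` entries carrying an `input_image` payload, while plain text becomes `function_call_output`. A minimal standalone sketch of that mapping (illustrative function name, not part of the repo), mirroring the `image_url`/`text` branches near the end of the block above:

```python
# Illustrative sketch only: convert the structured content of a "tool" message
# into Responses-style output items, the same way the block above does.
def tool_content_to_responses_items(tool_call_id, content_parts):
    items = []
    for part in content_parts:
        if part.get("type") == "image_url":
            # Screenshots become computer_call_output with an input_image payload
            items.append({
                "type": "computer_call_output",
                "call_id": tool_call_id,
                "output": {
                    "type": "input_image",
                    "image_url": part.get("image_url", {}).get("url"),
                },
            })
        elif part.get("type") == "text":
            # Plain text results are treated as function call output
            items.append({
                "type": "function_call_output",
                "call_id": tool_call_id,
                "output": part.get("text"),
            })
    return items

# Example: a screenshot returned for tool call "call_123"
items = tool_content_to_responses_items(
    "call_123",
    [{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}],
)
```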

--------------------------------------------------------------------------------
/libs/lume/src/VM/VM.swift:
--------------------------------------------------------------------------------

```swift
  1 | import Foundation
  2 | 
  3 | // MARK: - Support Types
  4 | 
  5 | /// Base context for virtual machine directory and configuration
  6 | struct VMDirContext {
  7 |     let dir: VMDirectory
  8 |     var config: VMConfig
  9 |     let home: Home
 10 |     let storage: String?
 11 | 
 12 |     func saveConfig() throws {
 13 |         try dir.saveConfig(config)
 14 |     }
 15 | 
 16 |     var name: String { dir.name }
 17 |     var initialized: Bool { dir.initialized() }
 18 |     var diskPath: Path { dir.diskPath }
 19 |     var nvramPath: Path { dir.nvramPath }
 20 | 
 21 |     func setDisk(_ size: UInt64) throws {
 22 |         try dir.setDisk(size)
 23 |     }
 24 | 
 25 |     func finalize(to name: String) throws {
 26 |         let vmDir = try home.getVMDirectory(name)
 27 |         try FileManager.default.moveItem(at: dir.dir.url, to: vmDir.dir.url)
 28 |     }
 29 | }
 30 | 
 31 | // MARK: - Base VM Class
 32 | 
 33 | /// Base class for virtual machine implementations
 34 | @MainActor
 35 | class VM {
 36 |     // MARK: - Properties
 37 | 
 38 |     var vmDirContext: VMDirContext
 39 | 
 40 |     @MainActor
 41 |     private var virtualizationService: VMVirtualizationService?
 42 |     private let vncService: VNCService
 43 |     internal let virtualizationServiceFactory:
 44 |         (VMVirtualizationServiceContext) throws -> VMVirtualizationService
 45 |     private let vncServiceFactory: (VMDirectory) -> VNCService
 46 | 
 47 |     // MARK: - Initialization
 48 | 
 49 |     init(
 50 |         vmDirContext: VMDirContext,
 51 |         virtualizationServiceFactory: @escaping (VMVirtualizationServiceContext) throws ->
 52 |             VMVirtualizationService = { try DarwinVirtualizationService(configuration: $0) },
 53 |         vncServiceFactory: @escaping (VMDirectory) -> VNCService = {
 54 |             DefaultVNCService(vmDirectory: $0)
 55 |         }
 56 |     ) {
 57 |         self.vmDirContext = vmDirContext
 58 |         self.virtualizationServiceFactory = virtualizationServiceFactory
 59 |         self.vncServiceFactory = vncServiceFactory
 60 | 
 61 |         // Initialize VNC service
 62 |         self.vncService = vncServiceFactory(vmDirContext.dir)
 63 |     }
 64 | 
 65 |     // MARK: - VM State Management
 66 | 
 67 |     private var isRunning: Bool {
 68 |         // First check if we have a MAC address
 69 |         guard let macAddress = vmDirContext.config.macAddress else {
 70 |             Logger.info(
 71 |                 "Cannot check if VM is running: macAddress is nil",
 72 |                 metadata: ["name": vmDirContext.name])
 73 |             return false
 74 |         }
 75 | 
 76 |         // Then check if we have an IP address
 77 |         guard let ipAddress = DHCPLeaseParser.getIPAddress(forMAC: macAddress) else {
 78 |             return false
 79 |         }
 80 | 
 81 |         // Then check if it's reachable
 82 |         return NetworkUtils.isReachable(ipAddress: ipAddress)
 83 |     }
 84 | 
 85 |     var details: VMDetails {
 86 |         let isRunning: Bool = self.isRunning
 87 |         let vncUrl = isRunning ? getVNCUrl() : nil
 88 | 
 89 |         // Safely get disk size with fallback
 90 |         let diskSizeValue: DiskSize
 91 |         do {
 92 |             diskSizeValue = try getDiskSize()
 93 |         } catch {
 94 |             Logger.error(
 95 |                 "Failed to get disk size",
 96 |                 metadata: ["name": vmDirContext.name, "error": "\(error)"])
 97 |             // Provide a fallback value to avoid crashing
 98 |             diskSizeValue = DiskSize(allocated: 0, total: vmDirContext.config.diskSize ?? 0)
 99 |         }
100 | 
101 |         // Safely access MAC address
102 |         let macAddress = vmDirContext.config.macAddress
103 |         let ipAddress: String? =
104 |             isRunning && macAddress != nil ? DHCPLeaseParser.getIPAddress(forMAC: macAddress!) : nil
105 | 
106 |         return VMDetails(
107 |             name: vmDirContext.name,
108 |             os: getOSType(),
109 |             cpuCount: vmDirContext.config.cpuCount ?? 0,
110 |             memorySize: vmDirContext.config.memorySize ?? 0,
111 |             diskSize: diskSizeValue,
112 |             display: vmDirContext.config.display.string,
113 |             status: isRunning ? "running" : "stopped",
114 |             vncUrl: vncUrl,
115 |             ipAddress: ipAddress,
116 |             locationName: vmDirContext.storage ?? "default"
117 |         )
118 |     }
119 | 
120 |     // MARK: - VM Lifecycle Management
121 | 
122 |     func run(
123 |         noDisplay: Bool, sharedDirectories: [SharedDirectory], mount: Path?, vncPort: Int = 0,
124 |         recoveryMode: Bool = false, usbMassStoragePaths: [Path]? = nil
125 |     ) async throws {
126 |         Logger.info(
127 |             "VM.run method called",
128 |             metadata: [
129 |                 "name": vmDirContext.name,
130 |                 "noDisplay": "\(noDisplay)",
131 |                 "recoveryMode": "\(recoveryMode)",
132 |             ])
133 | 
134 |         guard vmDirContext.initialized else {
135 |             Logger.error("VM not initialized", metadata: ["name": vmDirContext.name])
136 |             throw VMError.notInitialized(vmDirContext.name)
137 |         }
138 | 
139 |         guard let cpuCount = vmDirContext.config.cpuCount,
140 |             let memorySize = vmDirContext.config.memorySize
141 |         else {
142 |             Logger.error("VM missing cpuCount or memorySize", metadata: ["name": vmDirContext.name])
143 |             throw VMError.notInitialized(vmDirContext.name)
144 |         }
145 | 
146 |         // Try to acquire lock on config file
147 |         Logger.info(
148 |             "Attempting to acquire lock on config file",
149 |             metadata: [
150 |                 "path": vmDirContext.dir.configPath.path,
151 |                 "name": vmDirContext.name,
152 |             ])
153 |         var fileHandle = try FileHandle(forWritingTo: vmDirContext.dir.configPath.url)
154 | 
155 |         if flock(fileHandle.fileDescriptor, LOCK_EX | LOCK_NB) != 0 {
156 |             try? fileHandle.close()
157 |             Logger.error(
158 |                 "VM already running (failed to acquire lock)", metadata: ["name": vmDirContext.name]
159 |             )
160 | 
161 |             // Try to forcibly clear the lock before giving up
162 |             Logger.info("Attempting emergency lock cleanup", metadata: ["name": vmDirContext.name])
163 |             unlockConfigFile()
164 | 
165 |             // Try one more time to acquire the lock
166 |             if let retryHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url),
167 |                 flock(retryHandle.fileDescriptor, LOCK_EX | LOCK_NB) == 0
168 |             {
169 |                 Logger.info("Emergency lock cleanup worked", metadata: ["name": vmDirContext.name])
170 |                 // Continue with a fresh file handle
171 |                 try? retryHandle.close()
172 |                 // Get a completely new file handle to be safe
173 |                 guard let newHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url)
174 |                 else {
175 |                     throw VMError.internalError("Failed to open file handle after lock cleanup")
176 |                 }
177 |                 // Update our main file handle
178 |                 fileHandle = newHandle
179 |             } else {
180 |                 // If we still can't get the lock, give up
181 |                 Logger.error(
182 |                     "Could not acquire lock even after emergency cleanup",
183 |                     metadata: ["name": vmDirContext.name])
184 |                 throw VMError.alreadyRunning(vmDirContext.name)
185 |             }
186 |         }
187 |         Logger.info("Successfully acquired lock", metadata: ["name": vmDirContext.name])
188 | 
189 |         Logger.info(
190 |             "Running VM with configuration",
191 |             metadata: [
192 |                 "name": vmDirContext.name,
193 |                 "cpuCount": "\(cpuCount)",
194 |                 "memorySize": "\(memorySize)",
195 |                 "diskSize": "\(vmDirContext.config.diskSize ?? 0)",
196 |                 "sharedDirectories": sharedDirectories.map { $0.string }.joined(separator: ", "),
197 |                 "recoveryMode": "\(recoveryMode)",
198 |             ])
199 | 
200 |         // Create and configure the VM
201 |         do {
202 |             Logger.info(
203 |                 "Creating virtualization service context", metadata: ["name": vmDirContext.name])
204 |             let config = try createVMVirtualizationServiceContext(
205 |                 cpuCount: cpuCount,
206 |                 memorySize: memorySize,
207 |                 display: vmDirContext.config.display.string,
208 |                 sharedDirectories: sharedDirectories,
209 |                 mount: mount,
210 |                 recoveryMode: recoveryMode,
211 |                 usbMassStoragePaths: usbMassStoragePaths
212 |             )
213 |             Logger.info(
214 |                 "Successfully created virtualization service context",
215 |                 metadata: ["name": vmDirContext.name])
216 | 
217 |             Logger.info(
218 |                 "Initializing virtualization service", metadata: ["name": vmDirContext.name])
219 |             virtualizationService = try virtualizationServiceFactory(config)
220 |             Logger.info(
221 |                 "Successfully initialized virtualization service",
222 |                 metadata: ["name": vmDirContext.name])
223 | 
224 |             Logger.info(
225 |                 "Setting up VNC",
226 |                 metadata: [
227 |                     "name": vmDirContext.name,
228 |                     "noDisplay": "\(noDisplay)",
229 |                     "port": "\(vncPort)",
230 |                 ])
231 |             let vncInfo = try await setupSession(
232 |                 noDisplay: noDisplay, port: vncPort, sharedDirectories: sharedDirectories)
233 |             Logger.info(
234 |                 "VNC setup successful", metadata: ["name": vmDirContext.name, "vncInfo": vncInfo])
235 | 
236 |             // Start the VM
237 |             guard let service = virtualizationService else {
238 |                 Logger.error("Virtualization service is nil", metadata: ["name": vmDirContext.name])
239 |                 throw VMError.internalError("Virtualization service not initialized")
240 |             }
241 |             Logger.info(
242 |                 "Starting VM via virtualization service", metadata: ["name": vmDirContext.name])
243 |             try await service.start()
244 |             Logger.info("VM started successfully", metadata: ["name": vmDirContext.name])
245 | 
246 |             while true {
247 |                 try await Task.sleep(nanoseconds: UInt64(1e9))
248 |             }
249 |         } catch {
250 |             Logger.error(
251 |                 "Failed in VM.run",
252 |                 metadata: [
253 |                     "name": vmDirContext.name,
254 |                     "error": error.localizedDescription,
255 |                     "errorType": "\(type(of: error))",
256 |                 ])
257 |             virtualizationService = nil
258 |             vncService.stop()
259 | 
260 |             // Release lock
261 |             Logger.info("Releasing file lock after error", metadata: ["name": vmDirContext.name])
262 |             flock(fileHandle.fileDescriptor, LOCK_UN)
263 |             try? fileHandle.close()
264 | 
265 |             // Additionally, perform our aggressive unlock to ensure no locks remain
266 |             Logger.info(
267 |                 "Performing additional lock cleanup after error",
268 |                 metadata: ["name": vmDirContext.name])
269 |             unlockConfigFile()
270 | 
271 |             throw error
272 |         }
273 |     }
274 | 
275 |     @MainActor
276 |     func stop() async throws {
277 |         guard vmDirContext.initialized else {
278 |             throw VMError.notInitialized(vmDirContext.name)
279 |         }
280 | 
281 |         Logger.info("Attempting to stop VM", metadata: ["name": vmDirContext.name])
282 | 
283 |         // If we have a virtualization service, try to stop it cleanly first
284 |         if let service = virtualizationService {
285 |             do {
286 |                 Logger.info(
287 |                     "Stopping VM via virtualization service", metadata: ["name": vmDirContext.name])
288 |                 try await service.stop()
289 |                 virtualizationService = nil
290 |                 vncService.stop()
291 |                 Logger.info(
292 |                     "VM stopped successfully via virtualization service",
293 |                     metadata: ["name": vmDirContext.name])
294 | 
295 |                 // Try to ensure any existing locks are released
296 |                 Logger.info(
297 |                     "Attempting to clear any locks on config file",
298 |                     metadata: ["name": vmDirContext.name])
299 |                 unlockConfigFile()
300 | 
301 |                 return
302 |             } catch let error {
303 |                 Logger.error(
304 |                     "Failed to stop VM via virtualization service",
305 |                     metadata: [
306 |                         "name": vmDirContext.name,
307 |                         "error": error.localizedDescription,
308 |                     ])
309 |                 // Fall through to process termination
310 |             }
311 |         }
312 | 
313 |         // Try to open config file to get file descriptor
314 |         Logger.info(
315 |             "Attempting to access config file lock",
316 |             metadata: [
317 |                 "path": vmDirContext.dir.configPath.path,
318 |                 "name": vmDirContext.name,
319 |             ])
320 |         let fileHandle = try? FileHandle(forReadingFrom: vmDirContext.dir.configPath.url)
321 |         guard let fileHandle = fileHandle else {
322 |             Logger.info(
323 |                 "Failed to open config file - VM may not be running",
324 |                 metadata: ["name": vmDirContext.name])
325 | 
326 |             // Even though we couldn't open the file, try to force unlock anyway
327 |             unlockConfigFile()
328 | 
329 |             throw VMError.notRunning(vmDirContext.name)
330 |         }
331 | 
332 |         // Get the PID of the process holding the lock using lsof command
333 |         Logger.info(
334 |             "Finding process holding lock on config file", metadata: ["name": vmDirContext.name])
335 |         let task = Process()
336 |         task.executableURL = URL(fileURLWithPath: "/usr/sbin/lsof")
337 |         task.arguments = ["-F", "p", vmDirContext.dir.configPath.path]
338 | 
339 |         let outputPipe = Pipe()
340 |         task.standardOutput = outputPipe
341 | 
342 |         try task.run()
343 |         task.waitUntilExit()
344 | 
345 |         let outputData = try outputPipe.fileHandleForReading.readToEnd() ?? Data()
346 |         guard let outputString = String(data: outputData, encoding: .utf8),
347 |             let pidString = outputString.split(separator: "\n").first?.dropFirst(),  // Drop the 'p' prefix
348 |             let pid = pid_t(pidString)
349 |         else {
350 |             try? fileHandle.close()
351 |             Logger.info(
352 |                 "Failed to find process holding lock - VM may not be running",
353 |                 metadata: ["name": vmDirContext.name])
354 | 
355 |             // Even though we couldn't find the process, try to force unlock
356 |             unlockConfigFile()
357 | 
358 |             throw VMError.notRunning(vmDirContext.name)
359 |         }
360 | 
361 |         Logger.info(
362 |             "Found process \(pid) holding lock on config file",
363 |             metadata: ["name": vmDirContext.name])
364 | 
365 |         // First try graceful shutdown with SIGINT
366 |         if kill(pid, SIGINT) == 0 {
367 |             Logger.info("Sent SIGINT to VM process \(pid)", metadata: ["name": vmDirContext.name])
368 |         }
369 | 
370 |         // Wait for process to stop with timeout
371 |         var attempts = 0
372 |         while attempts < 10 {
373 |             Logger.info(
374 |                 "Waiting for process \(pid) to terminate (attempt \(attempts + 1)/10)",
375 |                 metadata: ["name": vmDirContext.name])
376 |             try await Task.sleep(nanoseconds: 1_000_000_000)
377 | 
378 |             // Check if process still exists
379 |             if kill(pid, 0) != 0 {
380 |                 // Process is gone, do final cleanup
381 |                 Logger.info("Process \(pid) has terminated", metadata: ["name": vmDirContext.name])
382 |                 virtualizationService = nil
383 |                 vncService.stop()
384 |                 try? fileHandle.close()
385 | 
386 |                 // Force unlock the config file
387 |                 unlockConfigFile()
388 | 
389 |                 Logger.info(
390 |                     "VM stopped successfully via process termination",
391 |                     metadata: ["name": vmDirContext.name])
392 |                 return
393 |             }
394 |             attempts += 1
395 |         }
396 | 
397 |         // If graceful shutdown failed, force kill the process
398 |         Logger.info(
399 |             "Graceful shutdown failed, forcing termination of process \(pid)",
400 |             metadata: ["name": vmDirContext.name])
401 |         if kill(pid, SIGKILL) == 0 {
402 |             Logger.info("Sent SIGKILL to process \(pid)", metadata: ["name": vmDirContext.name])
403 | 
404 |             // Wait a moment for the process to be fully killed
405 |             try await Task.sleep(nanoseconds: 2_000_000_000)
406 | 
407 |             // Do final cleanup
408 |             virtualizationService = nil
409 |             vncService.stop()
410 |             try? fileHandle.close()
411 | 
412 |             // Force unlock the config file
413 |             unlockConfigFile()
414 | 
415 |             Logger.info("VM forcefully stopped", metadata: ["name": vmDirContext.name])
416 |             return
417 |         }
418 | 
419 |         // If we get here, something went very wrong
420 |         try? fileHandle.close()
421 |         Logger.error(
422 |             "Failed to stop VM - could not terminate process \(pid)",
423 |             metadata: ["name": vmDirContext.name])
424 | 
425 |         // As a last resort, try to force unlock
426 |         unlockConfigFile()
427 | 
428 |         throw VMError.internalError("Failed to stop VM process")
429 |     }
430 | 
431 |     // Helper method to forcibly clear any locks on the config file
432 |     private func unlockConfigFile() {
433 |         Logger.info(
434 |             "Forcibly clearing locks on config file",
435 |             metadata: [
436 |                 "path": vmDirContext.dir.configPath.path,
437 |                 "name": vmDirContext.name,
438 |             ])
439 | 
440 |         // First attempt: standard unlock methods
441 |         if let fileHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url) {
442 |             // Use F_GETLK and F_SETLK to check and clear locks
443 |             var lockInfo = flock()
444 |             lockInfo.l_type = Int16(F_UNLCK)
445 |             lockInfo.l_whence = Int16(SEEK_SET)
446 |             lockInfo.l_start = 0
447 |             lockInfo.l_len = 0
448 | 
449 |             // Try to unlock the file using fcntl
450 |             _ = fcntl(fileHandle.fileDescriptor, F_SETLK, &lockInfo)
451 | 
452 |             // Also try the regular flock method
453 |             flock(fileHandle.fileDescriptor, LOCK_UN)
454 | 
455 |             try? fileHandle.close()
456 |             Logger.info("Standard unlock attempts performed", metadata: ["name": vmDirContext.name])
457 |         }
458 | 
459 |         // Second attempt: try to acquire and immediately release a fresh lock
460 |         if let tempHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url) {
461 |             if flock(tempHandle.fileDescriptor, LOCK_EX | LOCK_NB) == 0 {
462 |                 Logger.info(
463 |                     "Successfully acquired and released lock to reset state",
464 |                     metadata: ["name": vmDirContext.name])
465 |                 flock(tempHandle.fileDescriptor, LOCK_UN)
466 |             } else {
467 |                 Logger.info(
468 |                     "Could not acquire lock for resetting - may still be locked",
469 |                     metadata: ["name": vmDirContext.name])
470 |             }
471 |             try? tempHandle.close()
472 |         }
473 | 
474 |         // Third attempt (most aggressive): copy the config file, remove the original, and restore
475 |         Logger.info(
476 |             "Trying aggressive method: backup and restore config file",
477 |             metadata: ["name": vmDirContext.name])
478 |         // Only proceed if the config file exists
479 |         let fileManager = FileManager.default
480 |         let configPath = vmDirContext.dir.configPath.path
481 |         let backupPath = configPath + ".backup"
482 | 
483 |         if fileManager.fileExists(atPath: configPath) {
484 |             // Create a backup of the config file
485 |             if let configData = try? Data(contentsOf: URL(fileURLWithPath: configPath)) {
486 |                 // Make backup
487 |                 try? configData.write(to: URL(fileURLWithPath: backupPath))
488 | 
489 |                 // Remove the original file to clear all locks
490 |                 try? fileManager.removeItem(atPath: configPath)
491 |                 Logger.info(
492 |                     "Removed original config file to clear locks",
493 |                     metadata: ["name": vmDirContext.name])
494 | 
495 |                 // Wait a moment for OS to fully release resources
496 |                 Thread.sleep(forTimeInterval: 0.1)
497 | 
498 |                 // Restore from backup
499 |                 try? configData.write(to: URL(fileURLWithPath: configPath))
500 |                 Logger.info(
501 |                     "Restored config file from backup", metadata: ["name": vmDirContext.name])
502 |             } else {
503 |                 Logger.error(
504 |                     "Could not read config file content for backup",
505 |                     metadata: ["name": vmDirContext.name])
506 |             }
507 |         } else {
508 |             Logger.info(
509 |                 "Config file does not exist, cannot perform aggressive unlock",
510 |                 metadata: ["name": vmDirContext.name])
511 |         }
512 | 
513 |         // Final check
514 |         if let finalHandle = try? FileHandle(forWritingTo: vmDirContext.dir.configPath.url) {
515 |             let lockResult = flock(finalHandle.fileDescriptor, LOCK_EX | LOCK_NB)
516 |             if lockResult == 0 {
517 |                 Logger.info(
518 |                     "Lock successfully cleared - verified by acquiring test lock",
519 |                     metadata: ["name": vmDirContext.name])
520 |                 flock(finalHandle.fileDescriptor, LOCK_UN)
521 |             } else {
522 |                 Logger.info(
523 |                     "Lock still present after all clearing attempts",
524 |                     metadata: ["name": vmDirContext.name, "severity": "warning"])
525 |             }
526 |             try? finalHandle.close()
527 |         }
528 |     }
529 | 
530 |     // MARK: - Resource Management
531 | 
532 |     func updateVMConfig(vmConfig: VMConfig) throws {
533 |         vmDirContext.config = vmConfig
534 |         try vmDirContext.saveConfig()
535 |     }
536 | 
537 |     private func getDiskSize() throws -> DiskSize {
538 |         let resourceValues = try vmDirContext.diskPath.url.resourceValues(forKeys: [
539 |             .totalFileAllocatedSizeKey,
540 |             .totalFileSizeKey,
541 |         ])
542 | 
543 |         guard let allocated = resourceValues.totalFileAllocatedSize,
544 |             let total = resourceValues.totalFileSize
545 |         else {
546 |             throw VMConfigError.invalidDiskSize
547 |         }
548 | 
549 |         return DiskSize(allocated: UInt64(allocated), total: UInt64(total))
550 |     }
551 | 
552 |     func resizeDisk(_ newSize: UInt64) throws {
553 |         let currentSize = try getDiskSize()
554 | 
555 |         guard newSize >= currentSize.total else {
556 |             throw VMError.resizeTooSmall(current: currentSize.total, requested: newSize)
557 |         }
558 | 
559 |         try setDiskSize(newSize)
560 |     }
561 | 
562 |     func setCpuCount(_ newCpuCount: Int) throws {
563 |         guard !isRunning else {
564 |             throw VMError.alreadyRunning(vmDirContext.name)
565 |         }
566 |         vmDirContext.config.setCpuCount(newCpuCount)
567 |         try vmDirContext.saveConfig()
568 |     }
569 | 
570 |     func setMemorySize(_ newMemorySize: UInt64) throws {
571 |         guard !isRunning else {
572 |             throw VMError.alreadyRunning(vmDirContext.name)
573 |         }
574 |         vmDirContext.config.setMemorySize(newMemorySize)
575 |         try vmDirContext.saveConfig()
576 |     }
577 | 
578 |     func setDiskSize(_ newDiskSize: UInt64) throws {
579 |         try vmDirContext.setDisk(newDiskSize)
580 |         vmDirContext.config.setDiskSize(newDiskSize)
581 |         try vmDirContext.saveConfig()
582 |     }
583 | 
584 |     func setDisplay(_ newDisplay: String) throws {
585 |         guard !isRunning else {
586 |             throw VMError.alreadyRunning(vmDirContext.name)
587 |         }
588 |         guard let display: VMDisplayResolution = VMDisplayResolution(string: newDisplay) else {
589 |             throw VMError.invalidDisplayResolution(newDisplay)
590 |         }
591 |         vmDirContext.config.setDisplay(display)
592 |         try vmDirContext.saveConfig()
593 |     }
594 | 
595 |     func setHardwareModel(_ newHardwareModel: Data) throws {
596 |         guard !isRunning else {
597 |             throw VMError.alreadyRunning(vmDirContext.name)
598 |         }
599 |         vmDirContext.config.setHardwareModel(newHardwareModel)
600 |         try vmDirContext.saveConfig()
601 |     }
602 | 
603 |     func setMachineIdentifier(_ newMachineIdentifier: Data) throws {
604 |         guard !isRunning else {
605 |             throw VMError.alreadyRunning(vmDirContext.name)
606 |         }
607 |         vmDirContext.config.setMachineIdentifier(newMachineIdentifier)
608 |         try vmDirContext.saveConfig()
609 |     }
610 | 
611 |     func setMacAddress(_ newMacAddress: String) throws {
612 |         guard !isRunning else {
613 |             throw VMError.alreadyRunning(vmDirContext.name)
614 |         }
615 |         vmDirContext.config.setMacAddress(newMacAddress)
616 |         try vmDirContext.saveConfig()
617 |     }
618 | 
619 |     // MARK: - VNC Management
620 | 
621 |     func getVNCUrl() -> String? {
622 |         return vncService.url
623 |     }
624 | 
625 |     /// Sets up the VNC service and returns the VNC URL
626 |     private func startVNCService(port: Int = 0) async throws -> String {
627 |         guard let service = virtualizationService else {
628 |             throw VMError.internalError("Virtualization service not initialized")
629 |         }
630 | 
631 |         try await vncService.start(port: port, virtualMachine: service.getVirtualMachine())
632 | 
633 |         guard let url = vncService.url else {
634 |             throw VMError.vncNotConfigured
635 |         }
636 | 
637 |         return url
638 |     }
639 | 
640 |     /// Saves the session information including shared directories to disk
641 |     private func saveSessionData(url: String, sharedDirectories: [SharedDirectory]) {
642 |         do {
643 |             let session = VNCSession(
644 |                 url: url, sharedDirectories: sharedDirectories.isEmpty ? nil : sharedDirectories)
645 |             try vmDirContext.dir.saveSession(session)
646 |             Logger.info(
647 |                 "Saved VNC session with shared directories",
648 |                 metadata: [
649 |                     "count": "\(sharedDirectories.count)",
650 |                     "dirs": "\(sharedDirectories.map { $0.hostPath }.joined(separator: ", "))",
651 |                     "sessionsPath": "\(vmDirContext.dir.sessionsPath.path)",
652 |                 ])
653 |         } catch {
654 |             Logger.error("Failed to save VNC session", metadata: ["error": "\(error)"])
655 |         }
656 |     }
657 | 
658 |     /// Main session setup method that handles VNC and persists session data
659 |     private func setupSession(
660 |         noDisplay: Bool, port: Int = 0, sharedDirectories: [SharedDirectory] = []
661 |     ) async throws -> String {
662 |         // Start the VNC service and get the URL
663 |         let url = try await startVNCService(port: port)
664 | 
665 |         // Save the session data
666 |         saveSessionData(url: url, sharedDirectories: sharedDirectories)
667 | 
668 |         // Open the VNC client if needed
669 |         if !noDisplay {
670 |             Logger.info("Starting VNC session", metadata: ["name": vmDirContext.name])
671 |             try await vncService.openClient(url: url)
672 |         }
673 | 
674 |         return url
675 |     }
676 | 
677 |     // MARK: - Platform-specific Methods
678 | 
679 |     func getOSType() -> String {
680 |         fatalError("Must be implemented by subclass")
681 |     }
682 | 
683 |     func createVMVirtualizationServiceContext(
684 |         cpuCount: Int,
685 |         memorySize: UInt64,
686 |         display: String,
687 |         sharedDirectories: [SharedDirectory] = [],
688 |         mount: Path? = nil,
689 |         recoveryMode: Bool = false,
690 |         usbMassStoragePaths: [Path]? = nil
691 |     ) throws -> VMVirtualizationServiceContext {
692 |         // This is a diagnostic log to track actual file paths on disk for debugging
693 |         try validateDiskState()
694 | 
695 |         return VMVirtualizationServiceContext(
696 |             cpuCount: cpuCount,
697 |             memorySize: memorySize,
698 |             display: display,
699 |             sharedDirectories: sharedDirectories,
700 |             mount: mount,
701 |             hardwareModel: vmDirContext.config.hardwareModel,
702 |             machineIdentifier: vmDirContext.config.machineIdentifier,
703 |             macAddress: vmDirContext.config.macAddress!,
704 |             diskPath: vmDirContext.diskPath,
705 |             nvramPath: vmDirContext.nvramPath,
706 |             recoveryMode: recoveryMode,
707 |             usbMassStoragePaths: usbMassStoragePaths
708 |         )
709 |     }
710 | 
711 |     /// Validates the disk state to help diagnose storage attachment issues
712 |     private func validateDiskState() throws {
713 |         // Check disk image state
714 |         let diskPath = vmDirContext.diskPath.path
715 |         let diskExists = FileManager.default.fileExists(atPath: diskPath)
716 |         var diskSize: UInt64 = 0
717 |         var diskPermissions = ""
718 | 
719 |         if diskExists {
720 |             if let attrs = try? FileManager.default.attributesOfItem(atPath: diskPath) {
721 |                 diskSize = attrs[.size] as? UInt64 ?? 0
722 |                 let posixPerms = attrs[.posixPermissions] as? Int ?? 0
723 |                 diskPermissions = String(format: "%o", posixPerms)
724 |             }
725 |         }
726 | 
727 |         // Check disk container directory permissions
728 |         let diskDir = (diskPath as NSString).deletingLastPathComponent
729 |         let dirPerms =
730 |             try? FileManager.default.attributesOfItem(atPath: diskDir)[.posixPermissions] as? Int
731 |             ?? 0
732 |         let dirPermsString = dirPerms != nil ? String(format: "%o", dirPerms!) : "unknown"
733 | 
734 |         // Log detailed diagnostics
735 |         Logger.info(
736 |             "Validating VM disk state",
737 |             metadata: [
738 |                 "diskPath": diskPath,
739 |                 "diskExists": "\(diskExists)",
740 |                 "diskSize":
741 |                     "\(ByteCountFormatter.string(fromByteCount: Int64(diskSize), countStyle: .file))",
742 |                 "diskPermissions": diskPermissions,
743 |                 "dirPermissions": dirPermsString,
744 |                 "locationName": vmDirContext.storage ?? "default",
745 |             ])
746 | 
747 |         if !diskExists {
748 |             Logger.error("VM disk image does not exist", metadata: ["diskPath": diskPath])
749 |         } else if diskSize == 0 {
750 |             Logger.error("VM disk image exists but has zero size", metadata: ["diskPath": diskPath])
751 |         }
752 |     }
753 | 
754 |     func setup(
755 |         ipswPath: String,
756 |         cpuCount: Int,
757 |         memorySize: UInt64,
758 |         diskSize: UInt64,
759 |         display: String
760 |     ) async throws {
761 |         fatalError("Must be implemented by subclass")
762 |     }
763 | 
764 |     // MARK: - Finalization
765 | 
766 |     /// Post-installation step to move the VM directory to the home directory
767 |     func finalize(to name: String, home: Home, storage: String? = nil) throws {
768 |         let vmDir = try home.getVMDirectory(name, storage: storage)
769 |         try FileManager.default.moveItem(at: vmDirContext.dir.dir.url, to: vmDir.dir.url)
770 |     }
771 | 
772 |     // Method to run VM with additional USB mass storage devices
773 |     func runWithUSBStorage(
774 |         noDisplay: Bool, sharedDirectories: [SharedDirectory], mount: Path?, vncPort: Int = 0,
775 |         recoveryMode: Bool = false, usbImagePaths: [Path]
776 |     ) async throws {
777 |         guard vmDirContext.initialized else {
778 |             throw VMError.notInitialized(vmDirContext.name)
779 |         }
780 | 
781 |         guard let cpuCount = vmDirContext.config.cpuCount,
782 |             let memorySize = vmDirContext.config.memorySize
783 |         else {
784 |             throw VMError.notInitialized(vmDirContext.name)
785 |         }
786 | 
787 |         // Try to acquire lock on config file
788 |         let fileHandle = try FileHandle(forWritingTo: vmDirContext.dir.configPath.url)
789 |         guard flock(fileHandle.fileDescriptor, LOCK_EX | LOCK_NB) == 0 else {
790 |             try? fileHandle.close()
791 |             throw VMError.alreadyRunning(vmDirContext.name)
792 |         }
793 | 
794 |         Logger.info(
795 |             "Running VM with USB storage devices",
796 |             metadata: [
797 |                 "cpuCount": "\(cpuCount)",
798 |                 "memorySize": "\(memorySize)",
799 |                 "diskSize": "\(vmDirContext.config.diskSize ?? 0)",
800 |                 "usbImageCount": "\(usbImagePaths.count)",
801 |                 "recoveryMode": "\(recoveryMode)",
802 |             ])
803 | 
804 |         // Create and configure the VM
805 |         do {
806 |             let config = try createVMVirtualizationServiceContext(
807 |                 cpuCount: cpuCount,
808 |                 memorySize: memorySize,
809 |                 display: vmDirContext.config.display.string,
810 |                 sharedDirectories: sharedDirectories,
811 |                 mount: mount,
812 |                 recoveryMode: recoveryMode,
813 |                 usbMassStoragePaths: usbImagePaths
814 |             )
815 |             virtualizationService = try virtualizationServiceFactory(config)
816 | 
817 |             let vncInfo = try await setupSession(
818 |                 noDisplay: noDisplay, port: vncPort, sharedDirectories: sharedDirectories)
819 |             Logger.info("VNC info", metadata: ["vncInfo": vncInfo])
820 | 
821 |             // Start the VM
822 |             guard let service = virtualizationService else {
823 |                 throw VMError.internalError("Virtualization service not initialized")
824 |             }
825 |             try await service.start()
826 | 
827 |             while true {
828 |                 try await Task.sleep(nanoseconds: UInt64(1e9))
829 |             }
830 |         } catch {
831 |             Logger.error(
832 |                 "Failed to create/start VM with USB storage",
833 |                 metadata: [
834 |                     "error": "\(error)",
835 |                     "errorType": "\(type(of: error))",
836 |                 ])
837 |             virtualizationService = nil
838 |             vncService.stop()
839 |             // Release lock
840 |             flock(fileHandle.fileDescriptor, LOCK_UN)
841 |             try? fileHandle.close()
842 |             throw error
843 |         }
844 |     }
845 | }
846 | 
```
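
`VM.run` guards against double-starts by taking a non-blocking exclusive `flock` on the VM's config file, and `unlockConfigFile` falls back to progressively more aggressive cleanup when a stale lock lingers. A minimal Python sketch of the same POSIX lock pattern, with assumed paths and names that are not part of the repo:

```python
# Sketch of the non-blocking exclusive flock pattern used in VM.run (assumed names).
import fcntl
import os

def try_acquire_vm_lock(config_path: str) -> int:
    fd = os.open(config_path, os.O_RDWR)
    try:
        # LOCK_EX | LOCK_NB: fail immediately if another process already holds the lock
        fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        return fd  # keep the descriptor open for as long as the VM runs
    except BlockingIOError:
        os.close(fd)
        raise RuntimeError("VM already running: config file is locked")

def release_vm_lock(fd: int) -> None:
    fcntl.flock(fd, fcntl.LOCK_UN)
    os.close(fd)
```

flock locks are released automatically when the descriptor is closed, which is why the Swift code holds its file handle open while the VM runs and only closes it on error or shutdown.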

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/uitars.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | UITARS agent loop implementation using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B
  3 | Paper: https://arxiv.org/abs/2501.12326
  4 | Code: https://github.com/bytedance/UI-TARS
  5 | """
  6 | 
  7 | import asyncio
  8 | from ctypes import cast
  9 | import json
 10 | import base64
 11 | import math
 12 | import re
 13 | import ast
 14 | from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
 15 | from io import BytesIO
 16 | from PIL import Image
 17 | import litellm
 18 | from litellm.types.utils import ModelResponse
 19 | from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
 20 | from litellm.responses.utils import Usage
 21 | from openai.types.responses.response_computer_tool_call_param import ActionType, ResponseComputerToolCallParam
 22 | from openai.types.responses.response_input_param import ComputerCallOutput
 23 | from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
 24 | from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
 25 | 
 26 | from ..decorators import register_agent
 27 | from ..types import Messages, AgentResponse, Tools, AgentCapability
 28 | from ..responses import (
 29 |     make_reasoning_item, 
 30 |     make_output_text_item,
 31 |     make_click_item,
 32 |     make_double_click_item,
 33 |     make_drag_item,
 34 |     make_keypress_item,
 35 |     make_scroll_item,
 36 |     make_type_item,
 37 |     make_wait_item,
 38 |     make_input_image_item
 39 | )
 40 | 
 41 | # Constants from reference code
 42 | IMAGE_FACTOR = 28
 43 | MIN_PIXELS = 100 * 28 * 28
 44 | MAX_PIXELS = 16384 * 28 * 28
 45 | MAX_RATIO = 200
 46 | 
 47 | FINISH_WORD = "finished"
 48 | WAIT_WORD = "wait"
 49 | ENV_FAIL_WORD = "error_env"
 50 | CALL_USER = "call_user"
 51 | 
 52 | # Action space prompt for UITARS
 53 | UITARS_ACTION_SPACE = """
 54 | click(start_box='<|box_start|>(x1,y1)<|box_end|>')
 55 | left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
 56 | right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
 57 | drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
 58 | hotkey(key='')
 59 | type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
 60 | scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
 61 | wait() #Sleep for 5s and take a screenshot to check for any changes.
 62 | finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
 63 | """
 64 | 
 65 | UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. 
 66 | 
 67 | ## Output Format
 68 | ```
 69 | Thought: ...
 70 | Action: ...
 71 | ```
 72 | 
 73 | ## Action Space
 74 | {action_space}
 75 | 
 76 | ## Note
 77 | - Use {language} in `Thought` part.
 78 | - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
 79 | 
 80 | ## User Instruction
 81 | {instruction}
 82 | """
 83 | 
 84 | GROUNDING_UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. 
 85 | 
 86 | ## Output Format
 87 | 
 88 | Action: ...
 89 | 
 90 | 
 91 | ## Action Space
 92 | click(point='<|box_start|>(x1,y1)<|box_end|>')
 93 | 
 94 | ## User Instruction
 95 | {instruction}"""
 96 | 
 97 | def round_by_factor(number: float, factor: int) -> int:
 98 |     """Returns the closest integer to 'number' that is divisible by 'factor'."""
 99 |     return round(number / factor) * factor
100 | 
101 | 
102 | def ceil_by_factor(number: float, factor: int) -> int:
103 |     """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
104 |     return math.ceil(number / factor) * factor
105 | 
106 | 
107 | def floor_by_factor(number: float, factor: int) -> int:
108 |     """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
109 |     return math.floor(number / factor) * factor
110 | 
111 | 
112 | def smart_resize(
113 |     height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
114 | ) -> tuple[int, int]:
115 |     """
116 |     Rescales the image so that the following conditions are met:
117 |     1. Both dimensions (height and width) are divisible by 'factor'.
118 |     2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
119 |     3. The aspect ratio of the image is maintained as closely as possible.
120 |     """
121 |     if max(height, width) / min(height, width) > MAX_RATIO:
122 |         raise ValueError(
123 |             f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
124 |         )
125 |     h_bar = max(factor, round_by_factor(height, factor))
126 |     w_bar = max(factor, round_by_factor(width, factor))
127 |     if h_bar * w_bar > max_pixels:
128 |         beta = math.sqrt((height * width) / max_pixels)
129 |         h_bar = floor_by_factor(height / beta, factor)
130 |         w_bar = floor_by_factor(width / beta, factor)
131 |     elif h_bar * w_bar < min_pixels:
132 |         beta = math.sqrt(min_pixels / (height * width))
133 |         h_bar = ceil_by_factor(height * beta, factor)
134 |         w_bar = ceil_by_factor(width * beta, factor)
135 |     return h_bar, w_bar
136 | 
137 | 
138 | def escape_single_quotes(text):
139 |     """Escape single quotes in text for safe string formatting."""
140 |     pattern = r"(?<!\\)'"
141 |     return re.sub(pattern, r"\\'", text)
142 | 
143 | 
144 | def parse_action(action_str):
145 |     """Parse action string into structured format."""
146 |     try:
147 |         node = ast.parse(action_str, mode='eval')
148 |         if not isinstance(node, ast.Expression):
149 |             raise ValueError("Not an expression")
150 |         
151 |         call = node.body
152 |         if not isinstance(call, ast.Call):
153 |             raise ValueError("Not a function call")
154 |         
155 |         # Get function name
156 |         if isinstance(call.func, ast.Name):
157 |             func_name = call.func.id
158 |         elif isinstance(call.func, ast.Attribute):
159 |             func_name = call.func.attr
160 |         else:
161 |             func_name = None
162 |         
163 |         # Get keyword arguments
164 |         kwargs = {}
165 |         for kw in call.keywords:
166 |             key = kw.arg
167 |             if isinstance(kw.value, ast.Constant):
168 |                 value = kw.value.value
169 |             elif isinstance(kw.value, ast.Str):  # Compatibility with older Python
170 |                 value = kw.value.s
171 |             else:
172 |                 value = None
173 |             kwargs[key] = value
174 |         
175 |         return {
176 |             'function': func_name,
177 |             'args': kwargs
178 |         }
179 |     
180 |     except Exception as e:
181 |         print(f"Failed to parse action '{action_str}': {e}")
182 |         return None
183 | 
184 | 
185 | def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
186 |     """Parse UITARS model response into structured actions."""
187 |     text = text.strip()
188 |     
189 |     # Extract thought
190 |     thought = None
191 |     if text.startswith("Thought:"):
192 |         thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
193 |         if thought_match:
194 |             thought = thought_match.group(1).strip()
195 |     
196 |     # Extract action
197 |     if "Action:" not in text:
198 |         raise ValueError("No Action found in response")
199 |     
200 |     action_str = text.split("Action:")[-1].strip()
201 | 
202 |     # Handle special case for type actions
203 |     if "type(content" in action_str:
204 |         def escape_quotes(match):
205 |             return match.group(1)
206 |         
207 |         pattern = r"type\(content='(.*?)'\)"
208 |         content = re.sub(pattern, escape_quotes, action_str)
209 |         action_str = escape_single_quotes(content)
210 |         action_str = "type(content='" + action_str + "')"
211 |         
212 |     
213 |     # Parse the action
214 |     parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
215 |     if parsed_action is None:
216 |         raise ValueError(f"Action can't parse: {action_str}")
217 |     
218 |     action_type = parsed_action["function"]
219 |     params = parsed_action["args"]
220 |     
221 |     # Process parameters
222 |     action_inputs = {}
223 |     for param_name, param in params.items():
224 |         if param == "":
225 |             continue
226 |         param = str(param).lstrip()
227 |         action_inputs[param_name.strip()] = param
228 |         
229 |         # Handle coordinate parameters
230 |         if "start_box" in param_name or "end_box" in param_name:
231 |             # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
232 |             # First, remove special tokens
233 |             clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
234 |             # Then remove parentheses and split
235 |             numbers = clean_param.replace("(", "").replace(")", "").split(",")
236 |             
237 |             try:
238 |                 float_numbers = [float(num.strip()) / 1000 for num in numbers]  # Normalize to 0-1 range
239 |                 
240 |                 if len(float_numbers) == 2:
241 |                     # Single point, duplicate for box format
242 |                     float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
243 |                 
244 |                 action_inputs[param_name.strip()] = str(float_numbers)
245 |             except ValueError as e:
246 |                 # If parsing fails, keep the original parameter value
247 |                 print(f"Warning: Could not parse coordinates '{param}': {e}")
248 |                 action_inputs[param_name.strip()] = param
249 |     
250 |     return [{
251 |         "thought": thought,
252 |         "action_type": action_type,
253 |         "action_inputs": action_inputs,
254 |         "text": text
255 |     }]
256 | 
257 | 
258 | def convert_to_computer_actions(parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
259 |     """Convert parsed UITARS responses to computer actions."""
260 |     computer_actions = []
261 |     
262 |     for response in parsed_responses:
263 |         action_type = response.get("action_type")
264 |         action_inputs = response.get("action_inputs", {})
265 |         
266 |         if action_type == "finished":
267 |             finished_text = action_inputs.get("content", "Task completed successfully.")
268 |             computer_actions.append(make_output_text_item(finished_text))
269 |             break
270 |         
271 |         elif action_type == "wait":
272 |             computer_actions.append(make_wait_item())
273 |         
274 |         elif action_type == "call_user":
275 |             computer_actions.append(make_output_text_item("I need assistance from the user to proceed with this task."))
276 |         
277 |         elif action_type in ["click", "left_single"]:
278 |             start_box = action_inputs.get("start_box")
279 |             if start_box:
280 |                 coords = eval(start_box)
281 |                 x = int((coords[0] + coords[2]) / 2 * image_width)
282 |                 y = int((coords[1] + coords[3]) / 2 * image_height)
283 |                 
284 |                 computer_actions.append(make_click_item(x, y, "left"))
285 |         
286 |         elif action_type == "double_click":
287 |             start_box = action_inputs.get("start_box")
288 |             if start_box:
289 |                 coords = eval(start_box)
290 |                 x = int((coords[0] + coords[2]) / 2 * image_width)
291 |                 y = int((coords[1] + coords[3]) / 2 * image_height)
292 |                 
293 |                 computer_actions.append(make_double_click_item(x, y))
294 |         
295 |         elif action_type == "right_click":
296 |             start_box = action_inputs.get("start_box")
297 |             if start_box:
298 |                 coords = eval(start_box)
299 |                 x = int((coords[0] + coords[2]) / 2 * image_width)
300 |                 y = int((coords[1] + coords[3]) / 2 * image_height)
301 |                 
302 |                 computer_actions.append(make_click_item(x, y, "right"))
303 |         
304 |         elif action_type == "type":
305 |             content = action_inputs.get("content", "")
306 |             computer_actions.append(make_type_item(content))
307 |         
308 |         elif action_type == "hotkey":
309 |             key = action_inputs.get("key", "")
310 |             keys = key.split()
311 |             computer_actions.append(make_keypress_item(keys))
312 |         
313 |         elif action_type == "press":
314 |             key = action_inputs.get("key", "")
315 |             computer_actions.append(make_keypress_item([key]))
316 |         
317 |         elif action_type == "scroll":
318 |             start_box = action_inputs.get("start_box")
319 |             direction = action_inputs.get("direction", "down")
320 |             
321 |             if start_box:
322 |                 coords = eval(start_box)
323 |                 x = int((coords[0] + coords[2]) / 2 * image_width)
324 |                 y = int((coords[1] + coords[3]) / 2 * image_height)
325 |             else:
326 |                 x, y = image_width // 2, image_height // 2
327 |             
328 |             scroll_y = 5 if "up" in direction.lower() else -5
329 |             computer_actions.append(make_scroll_item(x, y, 0, scroll_y))
330 |         
331 |         elif action_type == "drag":
332 |             start_box = action_inputs.get("start_box")
333 |             end_box = action_inputs.get("end_box")
334 |             
335 |             if start_box and end_box:
336 |                 start_coords = eval(start_box)
337 |                 end_coords = eval(end_box)
338 |                 
339 |                 start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
340 |                 start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
341 |                 end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
342 |                 end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)
343 |                 
344 |                 path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
345 |                 computer_actions.append(make_drag_item(path))
346 |     
347 |     return computer_actions
348 | 
349 | 
350 | def pil_to_base64(image: Image.Image) -> str:
351 |     """Convert PIL image to base64 string."""
352 |     buffer = BytesIO()
353 |     image.save(buffer, format="PNG")
354 |     return base64.b64encode(buffer.getvalue()).decode("utf-8")
355 | 
356 | 
357 | def process_image_for_uitars(image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS) -> tuple[Image.Image, int, int]:
358 |     """Process image for UITARS model input."""
359 |     # Decode base64 image
360 |     if image_data.startswith('data:image'):
361 |         image_data = image_data.split(',')[1]
362 |     
363 |     image_bytes = base64.b64decode(image_data)
364 |     image = Image.open(BytesIO(image_bytes))
365 |     
366 |     original_width, original_height = image.size
367 |     
368 |     # Resize image according to UITARS requirements
369 |     if image.width * image.height > max_pixels:
370 |         resize_factor = math.sqrt(max_pixels / (image.width * image.height))
371 |         width = int(image.width * resize_factor)
372 |         height = int(image.height * resize_factor)
373 |         image = image.resize((width, height))
374 |     
375 |     if image.width * image.height < min_pixels:
376 |         resize_factor = math.sqrt(min_pixels / (image.width * image.height))
377 |         width = math.ceil(image.width * resize_factor)
378 |         height = math.ceil(image.height * resize_factor)
379 |         image = image.resize((width, height))
380 |     
381 |     if image.mode != "RGB":
382 |         image = image.convert("RGB")
383 |     
384 |     return image, original_width, original_height
385 | 
386 | 
387 | def sanitize_message(msg: Any) -> Any:
388 |     """Return a copy of the message with image_url ommited within content parts"""
389 |     if isinstance(msg, dict):
390 |         result = {}
391 |         for key, value in msg.items():
392 |             if key == "content" and isinstance(value, list):
393 |                 result[key] = [
394 |                     {k: v for k, v in item.items() if k != "image_url"} if isinstance(item, dict) else item
395 |                     for item in value
396 |                 ]
397 |             else:
398 |                 result[key] = value
399 |         return result
400 |     elif isinstance(msg, list):
401 |         return [sanitize_message(item) for item in msg]
402 |     else:
403 |         return msg
404 | 
405 | 
406 | def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
407 |     """
408 |     Convert UITARS internal message format back to LiteLLM format.
409 |     
410 |     This function processes reasoning, computer_call, and computer_call_output messages
411 |     and converts them to the appropriate LiteLLM assistant message format.
412 |     
413 |     Args:
414 |         messages: List of UITARS internal messages
415 |         
416 |     Returns:
417 |         List of LiteLLM formatted messages
418 |     """
419 |     litellm_messages = []
420 |     current_assistant_content = []
421 |     
422 |     for message in messages:
423 |         if isinstance(message, dict):
424 |             message_type = message.get("type")
425 |             
426 |             if message_type == "reasoning":
427 |                 # Extract reasoning text from summary
428 |                 summary = message.get("summary", [])
429 |                 if summary and isinstance(summary, list):
430 |                     for summary_item in summary:
431 |                         if isinstance(summary_item, dict) and summary_item.get("type") == "summary_text":
432 |                             reasoning_text = summary_item.get("text", "")
433 |                             if reasoning_text:
434 |                                 current_assistant_content.append(f"Thought: {reasoning_text}")
435 |             
436 |             elif message_type == "computer_call":
437 |                 # Convert computer action to UITARS action format
438 |                 action = message.get("action", {})
439 |                 action_type = action.get("type")
440 |                 
441 |                 if action_type == "click":
442 |                     x, y = action.get("x", 0), action.get("y", 0)
443 |                     button = action.get("button", "left")
444 |                     if button == "left":
445 |                         action_text = f"Action: click(start_box='({x},{y})')"
446 |                     elif button == "right":
447 |                         action_text = f"Action: right_single(start_box='({x},{y})')"
448 |                     else:
449 |                         action_text = f"Action: click(start_box='({x},{y})')"
450 |                 
451 |                 elif action_type == "double_click":
452 |                     x, y = action.get("x", 0), action.get("y", 0)
453 |                     action_text = f"Action: left_double(start_box='({x},{y})')"
454 |                 
455 |                 elif action_type == "drag":
456 |                     path = action.get("path") or [{"x": 0, "y": 0}]
457 |                     start_x, start_y, end_x, end_y = path[0].get("x", 0), path[0].get("y", 0), path[-1].get("x", 0), path[-1].get("y", 0)
458 |                     action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"
459 |                 
460 |                 elif action_type == "key":
461 |                     key = action.get("key", "")
462 |                     action_text = f"Action: hotkey(key='{key}')"
463 |                 
464 |                 elif action_type == "type":
465 |                     text = action.get("text", "")
466 |                     # Escape single quotes in the text
467 |                     escaped_text = escape_single_quotes(text)
468 |                     action_text = f"Action: type(content='{escaped_text}')"
469 |                 
470 |                 elif action_type == "scroll":
471 |                     x, y = action.get("x", 0), action.get("y", 0)
472 |                     direction = action.get("direction", "down")
473 |                     action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"
474 |                 
475 |                 elif action_type == "wait":
476 |                     action_text = "Action: wait()"
477 |                 
478 |                 else:
479 |                     # Fallback for unknown action types
480 |                     action_text = f"Action: {action_type}({action})"
481 |                 
482 |                 current_assistant_content.append(action_text)
483 |                 
484 |                 # Finalize the current assistant message after each computer_call
485 |                 if current_assistant_content:
486 |                     litellm_messages.append({
487 |                         "role": "assistant",
488 |                         "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
489 |                     })
490 |                     current_assistant_content = []
491 |             
492 |             elif message_type == "computer_call_output":
493 |                 # Add screenshot from computer call output
494 |                 output = message.get("output", {})
495 |                 if isinstance(output, dict) and output.get("type") == "input_image":
496 |                     image_url = output.get("image_url", "")
497 |                     if image_url:
498 |                         litellm_messages.append({
499 |                             "role": "user",
500 |                             "content": [{"type": "image_url", "image_url": {"url": image_url}}]
501 |                         })
502 |             
503 |             elif message.get("role") == "user":
504 |                 # # Handle user messages
505 |                 # content = message.get("content", "")
506 |                 # if isinstance(content, str):
507 |                 #     litellm_messages.append({
508 |                 #         "role": "user",
509 |                 #         "content": content
510 |                 #     })
511 |                 # elif isinstance(content, list):
512 |                 #     litellm_messages.append({
513 |                 #         "role": "user",
514 |                 #         "content": content
515 |                 #     })
516 |                 pass
517 |     
518 |     # Add any remaining assistant content
519 |     if current_assistant_content:
520 |         litellm_messages.append({
521 |             "role": "assistant",
522 |             "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
523 |         })
524 |     
525 |     return litellm_messages
526 | 
527 | @register_agent(models=r"(?i).*ui-?tars.*")
528 | class UITARSConfig:
529 |     """
530 |     UITARS agent configuration using liteLLM for the ByteDance-Seed/UI-TARS-1.5-7B model.
531 |     
532 |     Supports UITARS vision-language models for computer control.
533 |     """
534 |     
535 |     async def predict_step(
536 |         self,
537 |         messages: List[Dict[str, Any]],
538 |         model: str,
539 |         tools: Optional[List[Dict[str, Any]]] = None,
540 |         max_retries: Optional[int] = None,
541 |         stream: bool = False,
542 |         computer_handler=None,
543 |         use_prompt_caching: Optional[bool] = False,
544 |         _on_api_start=None,
545 |         _on_api_end=None,
546 |         _on_usage=None,
547 |         _on_screenshot=None,
548 |         **kwargs
549 |     ) -> Dict[str, Any]:
550 |         """
551 |         Predict the next step based on input messages.
552 |         
553 |         Args:
554 |             messages: Input messages following Responses format
555 |             model: Model name to use
556 |             tools: Optional list of tool schemas
557 |             max_retries: Maximum number of retries
558 |             stream: Whether to stream responses
559 |             computer_handler: Computer handler instance
560 |             _on_api_start: Callback for API start
561 |             _on_api_end: Callback for API end
562 |             _on_usage: Callback for usage tracking
563 |             _on_screenshot: Callback for screenshot events
564 |             **kwargs: Additional arguments
565 |             
566 |         Returns:
567 |             Dictionary with "output" (list of output items) and "usage" information
568 |         """
569 |         tools = tools or []
570 |         
571 |         # Create response items
572 |         response_items = []
573 |         
574 |         # Find computer tool for screen dimensions
575 |         computer_tool = None
576 |         for tool_schema in tools:
577 |             if tool_schema["type"] == "computer":
578 |                 computer_tool = tool_schema["computer"]
579 |                 break
580 |         
581 |         # Get screen dimensions
582 |         screen_width, screen_height = 1024, 768
583 |         if computer_tool:
584 |             try:
585 |                 screen_width, screen_height = await computer_tool.get_dimensions()
586 |             except Exception:
587 |                 pass
588 |         
589 |         # Process messages to extract instruction and image
590 |         instruction = ""
591 |         image_data = None
592 |         
593 |         # Convert messages to list if string
594 |         if isinstance(messages, str):
595 |             messages = [{"role": "user", "content": messages}]
596 |         
597 |         # Extract instruction and latest screenshot
598 |         for message in reversed(messages):
599 |             if isinstance(message, dict):
600 |                 content = message.get("content", "")
601 |                 
602 |                 # Handle different content formats
603 |                 if isinstance(content, str):
604 |                     if not instruction and message.get("role") == "user":
605 |                         instruction = content
606 |                 elif isinstance(content, list):
607 |                     for item in content:
608 |                         if isinstance(item, dict):
609 |                             if item.get("type") == "text" and not instruction:
610 |                                 instruction = item.get("text", "")
611 |                             elif item.get("type") == "image_url" and not image_data:
612 |                                 image_url = item.get("image_url", {})
613 |                                 if isinstance(image_url, dict):
614 |                                     image_data = image_url.get("url", "")
615 |                                 else:
616 |                                     image_data = image_url
617 |             
618 |             # Also check for computer_call_output with screenshots
619 |             if message.get("type") == "computer_call_output" and not image_data:
620 |                 output = message.get("output", {})
621 |                 if isinstance(output, dict) and output.get("type") == "input_image":
622 |                     image_data = output.get("image_url", "")
623 |             
624 |             if instruction and image_data:
625 |                 break
626 |         
627 |         if not instruction:
628 |             instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
629 |         
630 |         # Create prompt
631 |         user_prompt = UITARS_PROMPT_TEMPLATE.format(
632 |             instruction=instruction,
633 |             action_space=UITARS_ACTION_SPACE,
634 |             language="English"
635 |         )
636 |         
637 |         # Convert conversation history to LiteLLM format
638 |         history_messages = convert_uitars_messages_to_litellm(messages)
639 |         
640 |         # Prepare messages for liteLLM
641 |         litellm_messages = [
642 |             {
643 |                 "role": "system",
644 |                 "content": "You are a helpful assistant."
645 |             }
646 |         ]
647 | 
648 |         # Add current user instruction with screenshot
649 |         current_user_message = {
650 |             "role": "user", 
651 |             "content": [
652 |                 {"type": "text", "text": user_prompt},
653 |             ]
654 |         }
655 |         litellm_messages.append(current_user_message)
656 |         
657 |         # Process image for UITARS
658 |         if not image_data:
659 |             # Take screenshot if none found in messages
660 |             if computer_handler:
661 |                 image_data = await computer_handler.screenshot()
662 |                 if _on_screenshot: await _on_screenshot(image_data, "screenshot_before")
663 | 
664 |                 # Add screenshot to output items so it can be retained in history
665 |                 response_items.append(make_input_image_item(image_data))
666 |             else:
667 |                 raise ValueError("No screenshot found in messages and no computer_handler provided")
668 |         processed_image, original_width, original_height = process_image_for_uitars(image_data)
669 |         encoded_image = pil_to_base64(processed_image)
670 |         
671 |         # Add conversation history
672 |         if history_messages:
673 |             litellm_messages.extend(history_messages)
674 |         else:
675 |             litellm_messages.append({
676 |                 "role": "user",
677 |                 "content": [
678 |                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
679 |                 ]
680 |             })
681 | 
682 |         # Prepare API call kwargs
683 |         api_kwargs = {
684 |             "model": model,
685 |             "messages": litellm_messages,
686 |             "max_tokens": kwargs.get("max_tokens", 500),
687 |             "temperature": kwargs.get("temperature", 0.0),
688 |             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
689 |             "num_retries": max_retries,
690 |             **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
691 |         }
692 |         
693 |         # Call API start hook
694 |         if _on_api_start:
695 |             await _on_api_start(api_kwargs)
696 |         
697 |         # Call liteLLM with UITARS model
698 |         response = await litellm.acompletion(**api_kwargs)
699 |         
700 |         # Call API end hook
701 |         if _on_api_end:
702 |             await _on_api_end(api_kwargs, response)
703 |         
704 |         # Extract response content
705 |         response_content = response.choices[0].message.content.strip() # type: ignore
706 |         
707 |         # Parse UITARS response
708 |         parsed_responses = parse_uitars_response(response_content, original_width, original_height)
709 |         
710 |         # Convert to computer actions
711 |         computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
712 |         
713 |         # Add computer actions to response items
714 |         thought = parsed_responses[0].get("thought", "") if parsed_responses else ""
715 |         if thought:
716 |             response_items.append(make_reasoning_item(thought))
717 |         response_items.extend(computer_actions)
718 |         
719 |         # Extract usage information
720 |         response_usage = {
721 |             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
722 |             "response_cost": response._hidden_params.get("response_cost", 0.0),
723 |         }
724 |         if _on_usage:
725 |             await _on_usage(response_usage)
726 | 
727 |         # Create agent response
728 |         agent_response = {
729 |             "output": response_items,
730 |             "usage": response_usage
731 |         }
732 |         
733 |         return agent_response
734 |     
735 |     async def predict_click(
736 |         self,
737 |         model: str,
738 |         image_b64: str,
739 |         instruction: str
740 |     ) -> Optional[Tuple[int, int]]:
741 |         """
742 |         Predict click coordinates based on image and instruction.
743 |         
744 |         UITARS supports click prediction through its action parsing.
745 |         
746 |         Args:
747 |             model: Model name to use
748 |             image_b64: Base64 encoded image
749 |             instruction: Instruction for where to click
750 |             
751 |         Returns:
752 |             Tuple with (x, y) coordinates or None
753 |         """
754 |         try:
755 |             # Create prompt using grounding template
756 |             user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
757 |                 instruction=instruction
758 |             )
759 |             
760 |             # Process image for UITARS
761 |             processed_image, original_width, original_height = process_image_for_uitars(image_b64)
762 |             encoded_image = pil_to_base64(processed_image)
763 |             
764 |             # Prepare messages for liteLLM
765 |             litellm_messages = [
766 |                 {
767 |                     "role": "system",
768 |                     "content": "You are a helpful assistant."
769 |                 },
770 |                 {
771 |                     "role": "user",
772 |                     "content": [
773 |                         {"type": "text", "text": user_prompt},
774 |                         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
775 |                     ]
776 |                 }
777 |             ]
778 |             
779 |             # Prepare API call kwargs
780 |             api_kwargs = {
781 |                 "model": model,
782 |                 "messages": litellm_messages,
783 |                 "max_tokens": 2056,
784 |                 "temperature": 0.0,
785 |                 "do_sample": False
786 |             }
787 |             
788 |             # Call liteLLM with UITARS model
789 |             response = await litellm.acompletion(**api_kwargs)
790 |             
791 |             # Extract response content
792 |             response_content = response.choices[0].message.content.strip() # type: ignore
793 |             
794 |             print(response_content)
795 | 
796 |             # Parse the response to extract click coordinates
797 |             # Look for click action with coordinates (with special tokens)
798 |             click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
799 |             match = re.search(click_pattern, response_content)
800 |             
801 |             # Fallback: Look for simpler format without special tokens
802 |             if not match:
803 |                 # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
804 |                 fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
805 |                 match = re.search(fallback_pattern, response_content)
806 |             
807 |             if match:
808 |                 x, y = int(match.group(1)), int(match.group(2))
809 |                 # Scale coordinates back to original image dimensions
810 |                 scale_x = original_width / processed_image.width
811 |                 scale_y = original_height / processed_image.height
812 |                 
813 |                 scaled_x = int(x * scale_x)
814 |                 scaled_y = int(y * scale_y)
815 |                 
816 |                 return (scaled_x, scaled_y)
817 |             
818 |             return None
819 |             
820 |         except Exception as e:
821 |             # Log error and return None
822 |             print(f"Error in predict_click: {e}")
823 |             return None
824 |     
825 |     def get_capabilities(self) -> List[AgentCapability]:
826 |         """
827 |         Get list of capabilities supported by this agent config.
828 |         
829 |         Returns:
830 |             List of capability strings
831 |         """
832 |         return ["step", "click"]
```
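
A minimal usage sketch for the grounding path above (assuming `UITARSConfig` is imported from this module; the model string and screenshot path are placeholders for whatever liteLLM-routable UI-TARS endpoint you have configured):

```python
# Hypothetical usage of UITARSConfig.predict_click; model string and screenshot path are placeholders.
import asyncio
import base64

async def main():
    config = UITARSConfig()
    with open("screenshot.png", "rb") as f:  # placeholder path
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    coords = await config.predict_click(
        model="openai/ui-tars-1.5-7b",  # placeholder liteLLM model string
        image_b64=image_b64,
        instruction="Click the Submit button",
    )
    print(coords)  # (x, y) in original screenshot pixels, or None if no click was parsed

asyncio.run(main())
```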

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/glm45v.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | GLM-4.5V agent loop implementation using liteLLM for GLM-4.5V model.
  3 | Supports vision-language models for computer control with bounding box parsing.
  4 | """
  5 | 
  6 | import asyncio
  7 | import json
  8 | import base64
  9 | import re
 10 | from typing import Dict, List, Any, Optional, Tuple
 11 | from io import BytesIO
 12 | from PIL import Image
 13 | import litellm
 14 | from litellm.types.utils import ModelResponse
 15 | from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
 16 | 
 17 | from ..decorators import register_agent
 18 | from ..types import Messages, AgentResponse, Tools, AgentCapability
 19 | from ..loops.base import AsyncAgentConfig
 20 | from ..responses import (
 21 |     convert_responses_items_to_completion_messages,
 22 |     convert_completion_messages_to_responses_items,
 23 |     make_reasoning_item,
 24 |     make_output_text_item,
 25 |     make_click_item,
 26 |     make_double_click_item,
 27 |     make_drag_item,
 28 |     make_keypress_item,
 29 |     make_scroll_item,
 30 |     make_type_item,
 31 |     make_wait_item,
 32 |     make_input_image_item
 33 | )
 34 | 
 35 | # GLM-4.5V specific constants
 36 | GLM_ACTION_SPACE = """
 37 | ### {left,right,middle}_click
 38 | 
 39 | Call rule: `{left,right,middle}_click(start_box='[x,y]', element_info='')`
 40 | {
 41 |     'name': ['left_click', 'right_click', 'middle_click'],
 42 |     'description': 'Perform a left/right/middle mouse click at the specified coordinates on the screen.',
 43 |     'parameters': {
 44 |         'type': 'object',
 45 |         'properties': {
 46 |             'start_box': {
 47 |                 'type': 'array',
 48 |                 'items': {
 49 |                     'type': 'integer'
 50 |                 },
 51 |                 'description': 'Coordinates [x,y] where to perform the click, normalized to 0-999 range.'
 52 |             },
 53 |             'element_info': {
 54 |                 'type': 'string',
 55 |                 'description': 'Optional text description of the UI element being clicked.'
 56 |             }
 57 |         },
 58 |         'required': ['start_box']
 59 |     }
 60 | }
 61 | 
 62 | ### hover
 63 | 
 64 | Call rule: `hover(start_box='[x,y]', element_info='')`
 65 | {
 66 |     'name': 'hover',
 67 |     'description': 'Move the mouse pointer to the specified coordinates without performing any click action.',
 68 |     'parameters': {
 69 |         'type': 'object',
 70 |         'properties': {
 71 |             'start_box': {
 72 |                 'type': 'array',
 73 |                 'items': {
 74 |                     'type': 'integer'
 75 |                 },
 76 |                 'description': 'Coordinates [x,y] where to move the mouse pointer, normalized to 0-999 range.'
 77 |             },
 78 |             'element_info': {
 79 |                 'type': 'string',
 80 |                 'description': 'Optional text description of the UI element being hovered over.'
 81 |             }
 82 |         },
 83 |         'required': ['start_box']
 84 |     }
 85 | }
 86 | 
 87 | ### left_double_click
 88 | 
 89 | Call rule: `left_double_click(start_box='[x,y]', element_info='')`
 90 | {
 91 |     'name': 'left_double_click',
 92 |     'description': 'Perform a left mouse double-click at the specified coordinates on the screen.',
 93 |     'parameters': {
 94 |         'type': 'object',
 95 |         'properties': {
 96 |             'start_box': {
 97 |                 'type': 'array',
 98 |                 'items': {
 99 |                     'type': 'integer'
100 |                 },
101 |                 'description': 'Coordinates [x,y] where to perform the double-click, normalized to 0-999 range.'
102 |             },
103 |             'element_info': {
104 |                 'type': 'string',
105 |                 'description': 'Optional text description of the UI element being double-clicked.'
106 |             }
107 |         },
108 |         'required': ['start_box']
109 |     }
110 | }
111 | 
112 | ### left_drag
113 | 
114 | Call rule: `left_drag(start_box='[x1,y1]', end_box='[x2,y2]', element_info='')`
115 | {
116 |     'name': 'left_drag',
117 |     'description': 'Drag the mouse from starting coordinates to ending coordinates while holding the left mouse button.',
118 |     'parameters': {
119 |         'type': 'object',
120 |         'properties': {
121 |             'start_box': {
122 |                 'type': 'array',
123 |                 'items': {
124 |                     'type': 'integer'
125 |                 },
126 |                 'description': 'Starting coordinates [x1,y1] for the drag operation, normalized to 0-999 range.'
127 |             },
128 |             'end_box': {
129 |                 'type': 'array',
130 |                 'items': {
131 |                     'type': 'integer'
132 |                 },
133 |                 'description': 'Ending coordinates [x2,y2] for the drag operation, normalized to 0-999 range.'
134 |             },
135 |             'element_info': {
136 |                 'type': 'string',
137 |                 'description': 'Optional text description of the UI element being dragged.'
138 |             }
139 |         },
140 |         'required': ['start_box', 'end_box']
141 |     }
142 | }
143 | 
144 | ### key
145 | 
146 | Call rule: `key(keys='')`
147 | {
148 |     'name': 'key',
149 |     'description': 'Simulate pressing a single key or combination of keys on the keyboard.',
150 |     'parameters': {
151 |         'type': 'object',
152 |         'properties': {
153 |             'keys': {
154 |                 'type': 'string',
155 |                 'description': 'The key or key combination to press. Use '+' to separate keys in combinations (e.g., 'ctrl+c', 'alt+tab').'
156 |             }
157 |         },
158 |         'required': ['keys']
159 |     }
160 | }
161 | 
162 | ### type
163 | 
164 | Call rule: `type(content='')`
165 | {
166 |     'name': 'type',
167 |     'description': 'Type text content into the currently focused text input field. This action only performs typing and does not handle field activation or clearing.',
168 |     'parameters': {
169 |         'type': 'object',
170 |         'properties': {
171 |             'content': {
172 |                 'type': 'string',
173 |                 'description': 'The text content to be typed into the active text field.'
174 |             }
175 |         },
176 |         'required': ['content']
177 |     }
178 | }
179 | 
180 | ### scroll
181 | 
182 | Call rule: `scroll(start_box='[x,y]', direction='', step=5, element_info='')`
183 | {
184 |     'name': 'scroll',
185 |     'description': 'Scroll an element at the specified coordinates in the specified direction by a given number of wheel steps.',
186 |     'parameters': {
187 |         'type': 'object',
188 |         'properties': {
189 |             'start_box': {
190 |                 'type': 'array',
191 |                 'items': {
192 |                     'type': 'integer'
193 |                 },
194 |                 'description': 'Coordinates [x,y] of the element or area to scroll, normalized to 0-999 range.'
195 |             },
196 |             'direction': {
197 |                 'type': 'string',
198 |                 'enum': ['down', 'up'],
199 |                 'description': 'The direction to scroll: 'down' or 'up'.'
200 |             },
201 |             'step': {
202 |                 'type': 'integer',
203 |                 'default': 5,
204 |                 'description': 'Number of wheel steps to scroll, default is 5.'
205 |             },
206 |             'element_info': {
207 |                 'type': 'string',
208 |                 'description': 'Optional text description of the UI element being scrolled.'
209 |             }
210 |         },
211 |         'required': ['start_box', 'direction']
212 |     }
213 | }
214 | 
215 | ### WAIT
216 | 
217 | Call rule: `WAIT()`
218 | {
219 |     'name': 'WAIT',
220 |     'description': 'Wait for 5 seconds before proceeding to the next action.',
221 |     'parameters': {
222 |         'type': 'object',
223 |         'properties': {},
224 |         'required': []
225 |     }
226 | }
227 | 
228 | ### DONE
229 | 
230 | Call rule: `DONE()`
231 | {
232 |     'name': 'DONE',
233 |     'description': 'Indicate that the current task has been completed successfully and no further actions are needed.',
234 |     'parameters': {
235 |         'type': 'object',
236 |         'properties': {},
237 |         'required': []
238 |     }
239 | }
240 | 
241 | ### FAIL
242 | 
243 | Call rule: `FAIL()`
244 | {
245 |     'name': 'FAIL',
246 |     'description': 'Indicate that the current task cannot be completed or is impossible to accomplish.',
247 |     'parameters': {
248 |         'type': 'object',
249 |         'properties': {},
250 |         'required': []
251 |     }
252 | }"""
253 | 
254 | def encode_image_to_base64(image_path: str) -> str:
255 |     """Encode image file to base64 string with data URI."""
256 |     with open(image_path, "rb") as image_file:
257 |         encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
258 |         return f"data:image/png;base64,{encoded_string}"
259 | 
260 | def parse_glm_response(response: str) -> Dict[str, Any]:
261 |     """
262 |     Parse GLM-4.5V response to extract action and memory.
263 |     
264 |     The special tokens <|begin_of_box|> and <|end_of_box|> mark bounding boxes.
265 |     Coordinates are normalized values in the 0-999 range.
266 |     """
267 |     # Extract action from between special tokens
268 |     pattern = r"<\|begin_of_box\|>(.*?)<\|end_of_box\|>"
269 |     match = re.search(pattern, response)
270 |     if match:
271 |         action = match.group(1).strip()
272 |     else:
273 |         # Fallback: look for function call patterns
274 |         action_pattern = r"[\w_]+\([^)]*\)"
275 |         matches = re.findall(action_pattern, response)
276 |         action = matches[0] if matches else None
277 |     
278 |     # Extract memory section
279 |     memory_pattern = r"Memory:(.*?)$"
280 |     memory_match = re.search(memory_pattern, response, re.DOTALL)
281 |     memory = memory_match.group(1).strip() if memory_match else "[]"
282 |     
283 |     # Extract action text (everything before Memory:)
284 |     action_text_pattern = r'^(.*?)Memory:'
285 |     action_text_match = re.search(action_text_pattern, response, re.DOTALL)
286 |     action_text = action_text_match.group(1).strip() if action_text_match else response
287 |     
288 |     # Clean up action text by removing special tokens
289 |     if action_text:
290 |         action_text = action_text.replace("<|begin_of_box|>", "").replace("<|end_of_box|>", "")
291 |     
292 |     return {
293 |         "action": action,
294 |         "action_text": action_text,
295 |         "memory": memory
296 |     }
297 | 
298 | def get_last_image_from_messages(messages: Messages) -> Optional[str]:
299 |     """Extract the last image from messages for processing."""
300 |     for message in reversed(messages):
301 |         if isinstance(message, dict):
302 |             if message.get("type") == "computer_call_output":
303 |                 output = message.get("output", {})
304 |                 if isinstance(output, dict) and output.get("type") == "input_image":
305 |                     image_url = output.get("image_url", "")
306 |                     if isinstance(image_url, str) and image_url.startswith("data:image/"):
307 |                         # Extract base64 part
308 |                         return image_url.split(",", 1)[1]
309 |             elif message.get("role") == "user":
310 |                 content = message.get("content", [])
311 |                 if isinstance(content, list):
312 |                     for item in reversed(content):
313 |                         if isinstance(item, dict) and item.get("type") == "image_url":
314 |                             image_url_obj = item.get("image_url", {})
315 |                             if isinstance(image_url_obj, dict):
316 |                                 image_url = image_url_obj.get("url", "")
317 |                                 if isinstance(image_url, str) and image_url.startswith("data:image/"):
318 |                                     return image_url.split(",", 1)[1]
319 |     return None
320 | 
321 | def convert_responses_items_to_glm45v_pc_prompt(messages: Messages, task: str, memory: str = "") -> List[Dict[str, Any]]:
322 |     """Convert responses items to GLM-4.5V PC prompt format with historical actions.
323 |     
324 |     Args:
325 |         messages: List of message items from the conversation
326 |         task: The task description
327 |         memory: Current memory state
328 |         
329 |     Returns:
330 |         List of content items for the prompt (text and image_url items)
331 |     """
332 |     action_space = GLM_ACTION_SPACE
333 |     
334 |     # Template head
335 |     head_text = f"""You are a GUI Agent, and your primary task is to respond accurately to user requests or questions. In addition to directly answering the user's queries, you can also use tools or perform GUI operations directly until you fulfill the user's request or provide a correct answer. You should carefully read and understand the images and questions provided by the user, and engage in thinking and reflection when appropriate. The coordinates involved are all represented in thousandths (0-999).
336 | 
337 | # Task:
338 | {task}
339 | 
340 | # Task Platform
341 | Ubuntu
342 | 
343 | # Action Space
344 | {action_space}
345 | 
346 | # Historical Actions and Current Memory
347 | History:"""
348 |     
349 |     # Template tail
350 |     tail_text = f"""
351 | Memory:
352 | {memory}
353 | # Output Format
354 | Plain text explanation with action(param='...')
355 | Memory:
356 | [{{"key": "value"}}, ...]
357 | 
358 | # Some Additional Notes
359 | - I'll give you the most recent 4 history screenshots (shrunk to 50%*50%) along with the historical action steps.
360 | - You should put the key information you *have to remember* in a separate memory part and I'll give it to you in the next round. The content in this part should be a dict list. If you no longer need some given information, you should remove it from the memory. Even if you don't need to remember anything, you should also output an empty list.
361 | - My computer's password is "password", feel free to use it when you need sudo rights.
362 | - For the thunderbird account "[email protected]", the password is "gTCI";=@y7|QJ0nDa_kN3Sb&>".
363 | 
364 | Current Screenshot:
365 | """
366 |     
367 |     # Build history from messages
368 |     history = []
369 |     history_images = []
370 |     
371 |     # Group messages into steps
372 |     current_step = []
373 |     step_num = 0
374 |     
375 |     for message in messages:
376 |         msg_type = message.get("type")
377 |         
378 |         if msg_type == "reasoning":
379 |             current_step.append(message)
380 |         elif msg_type == "message" and message.get("role") == "assistant":
381 |             current_step.append(message)
382 |         elif msg_type == "computer_call":
383 |             current_step.append(message)
384 |         elif msg_type == "computer_call_output":
385 |             current_step.append(message)
386 |             # End of step - process it
387 |             if current_step:
388 |                 step_num += 1
389 |                 
390 |                 # Extract bot thought from message content
391 |                 bot_thought = ""
392 |                 for item in current_step:
393 |                     if item.get("type") == "message" and item.get("role") == "assistant":
394 |                         content = item.get("content", [])
395 |                         for content_item in content:
396 |                             if content_item.get("type") == "output_text":
397 |                                 bot_thought = content_item.get("text", "")
398 |                                 break
399 |                         break
400 |                 
401 |                 # Extract action from computer_call
402 |                 action_text = ""
403 |                 for item in current_step:
404 |                     if item.get("type") == "computer_call":
405 |                         action = item.get("action", {})
406 |                         action_type = action.get("type", "")
407 |                         
408 |                         if action_type == "click":
409 |                             x, y = action.get("x", 0), action.get("y", 0)
410 |                             # Convert to 0-999 range (assuming screen dimensions)
411 |                             # For now, use direct coordinates - this may need adjustment
412 |                             action_text = f"left_click(start_box='[{x},{y}]')"
413 |                         elif action_type == "double_click":
414 |                             x, y = action.get("x", 0), action.get("y", 0)
415 |                             action_text = f"left_double_click(start_box='[{x},{y}]')"
416 |                         elif action_type == "right_click":
417 |                             x, y = action.get("x", 0), action.get("y", 0)
418 |                             action_text = f"right_click(start_box='[{x},{y}]')"
419 |                         elif action_type == "drag":
420 |                             # Handle drag with path
421 |                             path = action.get("path", [])
422 |                             if len(path) >= 2:
423 |                                 start = path[0]
424 |                                 end = path[-1]
425 |                                 action_text = f"left_drag(start_box='[{start.get('x', 0)},{start.get('y', 0)}]', end_box='[{end.get('x', 0)},{end.get('y', 0)}]')"
426 |                         elif action_type == "keypress":
427 |                             key = action.get("key", "")
428 |                             action_text = f"key(keys='{key}')"
429 |                         elif action_type == "type":
430 |                             text = action.get("text", "")
431 |                             action_text = f"type(content='{text}')"
432 |                         elif action_type == "scroll":
433 |                             x, y = action.get("x", 0), action.get("y", 0)
434 |                             direction = action.get("direction", "down")
435 |                             action_text = f"scroll(start_box='[{x},{y}]', direction='{direction}')"
436 |                         elif action_type == "wait":
437 |                             action_text = "WAIT()"
438 |                         break
439 |                 
440 |                 # Extract screenshot from computer_call_output
441 |                 screenshot_url = None
442 |                 for item in current_step:
443 |                     if item.get("type") == "computer_call_output":
444 |                         output = item.get("output", {})
445 |                         if output.get("type") == "input_image":
446 |                             screenshot_url = output.get("image_url", "")
447 |                             break
448 |                 
449 |                 # Store step info
450 |                 step_info = {
451 |                     "step_num": step_num,
452 |                     "bot_thought": bot_thought,
453 |                     "action_text": action_text,
454 |                     "screenshot_url": screenshot_url
455 |                 }
456 |                 history.append(step_info)
457 |                 
458 |                 # Store screenshot for last 4 steps
459 |                 if screenshot_url:
460 |                     history_images.append(screenshot_url)
461 |                 
462 |                 current_step = []
463 |     
464 |     # Build content array with head, history, and tail
465 |     content = []
466 |     current_text = head_text
467 |     
468 |     total_history_steps = len(history)
469 |     history_image_count = min(4, len(history_images))  # Last 4 images
470 |     
471 |     for step_idx, step_info in enumerate(history):
472 |         step_num = step_info["step_num"]
473 |         bot_thought = step_info["bot_thought"]
474 |         action_text = step_info["action_text"]
475 |         
476 |         if step_idx < total_history_steps - history_image_count:
477 |             # For steps beyond the last 4, use text placeholder
478 |             current_text += f"\nstep {step_num}: Screenshot:(Omitted in context.) Thought: {bot_thought}\nAction: {action_text}"
479 |         else:
480 |             # For the last 4 steps, insert images
481 |             current_text += f"\nstep {step_num}: Screenshot:"
482 |             content.append({"type": "text", "text": current_text})
483 |             
484 |             # Add image
485 |             img_idx = step_idx - (total_history_steps - history_image_count)
486 |             if img_idx < len(history_images):
487 |                 content.append({"type": "image_url", "image_url": {"url": history_images[img_idx]}})
488 |             
489 |             current_text = f" Thought: {bot_thought}\nAction: {action_text}"
490 |     
491 |     # Add tail
492 |     current_text += tail_text
493 |     content.append({"type": "text", "text": current_text})
494 |     
495 |     return content
496 | 
497 | def model_dump(obj) -> Dict[str, Any]:
498 |     if isinstance(obj, dict):
499 |         return {k: model_dump(v) for k, v in obj.items()}
500 |     elif hasattr(obj, "model_dump"):
501 |         return obj.model_dump()
502 |     else:
503 |         return obj
504 | 
505 | def convert_glm_completion_to_responses_items(response: ModelResponse, image_width: int, image_height: int) -> List[Dict[str, Any]]:
506 |     """
507 |     Convert GLM-4.5V completion response to responses items format.
508 |     
509 |     Args:
510 |         response: LiteLLM ModelResponse from GLM-4.5V
511 |         image_width: Original image width for coordinate scaling
512 |         image_height: Original image height for coordinate scaling
513 |         
514 |     Returns:
515 |         List of response items in the proper format
516 |     """
517 |     import uuid
518 |     
519 |     response_items = []
520 |     
521 |     if not response.choices or not response.choices[0].message:
522 |         return response_items
523 |     
524 |     message = response.choices[0].message
525 |     content = message.content or ""
526 |     reasoning_content = getattr(message, 'reasoning_content', None)
527 |     
528 |     # Add reasoning item if present
529 |     if reasoning_content:
530 |         reasoning_item = model_dump(make_reasoning_item(reasoning_content))
531 |         response_items.append(reasoning_item)
532 |     
533 |     # Parse the content to extract action and text
534 |     parsed_response = parse_glm_response(content)
535 |     action = parsed_response.get("action", "")
536 |     action_text = parsed_response.get("action_text", "")
537 |     
538 |     # Add message item with text content (excluding action and memory)
539 |     if action_text:
540 |         # Remove action from action_text if it's there
541 |         clean_text = action_text
542 |         if action and action in clean_text:
543 |             clean_text = clean_text.replace(action, "").strip()
544 |         
545 |         # Remove memory section
546 |         memory_pattern = r"Memory:\s*\[.*?\]\s*$"
547 |         clean_text = re.sub(memory_pattern, "", clean_text, flags=re.DOTALL).strip()
548 |         
549 |         if clean_text:
550 |             message_item = model_dump(make_output_text_item(clean_text))
551 |             response_items.append(message_item)
552 |     
553 |     # Convert action to computer call if present
554 |     if action:
555 |         call_id = f"call_{uuid.uuid4().hex[:8]}"
556 |         
557 |         # Parse different action types and create appropriate computer calls
558 |         if action.startswith("left_click"):
559 |             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
560 |             if coord_match:
561 |                 x, y = int(coord_match.group(1)), int(coord_match.group(2))
562 |                 # Convert from 0-999 to actual pixel coordinates
563 |                 actual_x = int((x / 999.0) * image_width)
564 |                 actual_y = int((y / 999.0) * image_height)
565 |                 computer_call = model_dump(make_click_item(actual_x, actual_y))
566 |                 computer_call["call_id"] = call_id
567 |                 computer_call["status"] = "completed"
568 |                 response_items.append(computer_call)
569 |         
570 |         elif action.startswith("right_click"):
571 |             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
572 |             if coord_match:
573 |                 x, y = int(coord_match.group(1)), int(coord_match.group(2))
574 |                 actual_x = int((x / 999.0) * image_width)
575 |                 actual_y = int((y / 999.0) * image_height)
576 |                 computer_call = model_dump(make_click_item(actual_x, actual_y, button="right"))
577 |                 computer_call["call_id"] = call_id
578 |                 computer_call["status"] = "completed"
579 |                 response_items.append(computer_call)
580 |         
581 |         elif action.startswith("left_double_click"):
582 |             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
583 |             if coord_match:
584 |                 x, y = int(coord_match.group(1)), int(coord_match.group(2))
585 |                 actual_x = int((x / 999.0) * image_width)
586 |                 actual_y = int((y / 999.0) * image_height)
587 |                 computer_call = model_dump(make_double_click_item(actual_x, actual_y))
588 |                 computer_call["call_id"] = call_id
589 |                 computer_call["status"] = "completed"
590 |                 response_items.append(computer_call)
591 |         
592 |         elif action.startswith("left_drag"):
593 |             start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
594 |             end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action)
595 |             if start_match and end_match:
596 |                 x1, y1 = int(start_match.group(1)), int(start_match.group(2))
597 |                 x2, y2 = int(end_match.group(1)), int(end_match.group(2))
598 |                 actual_x1 = int((x1 / 999.0) * image_width)
599 |                 actual_y1 = int((y1 / 999.0) * image_height)
600 |                 actual_x2 = int((x2 / 999.0) * image_width)
601 |                 actual_y2 = int((y2 / 999.0) * image_height)
602 |                 # Create path for drag operation
603 |                 drag_path = [{"x": actual_x1, "y": actual_y1}, {"x": actual_x2, "y": actual_y2}]
604 |                 computer_call = model_dump(make_drag_item(drag_path))
605 |                 computer_call["call_id"] = call_id
606 |                 computer_call["status"] = "completed"
607 |                 response_items.append(computer_call)
608 |         
609 |         elif action.startswith("key"):
610 |             key_match = re.search(r"keys='([^']+)'", action)
611 |             if key_match:
612 |                 keys = key_match.group(1)
613 |                 # Split keys by '+' for key combinations, or use as single key
614 |                 key_list = keys.split('+') if '+' in keys else [keys]
615 |                 computer_call = model_dump(make_keypress_item(key_list))
616 |                 computer_call["call_id"] = call_id
617 |                 computer_call["status"] = "completed"
618 |                 response_items.append(computer_call)
619 |         
620 |         elif action.startswith("type"):
621 |             content_match = re.search(r"content='([^']*)'", action)
622 |             if content_match:
623 |                 content = content_match.group(1)
624 |                 computer_call = model_dump(make_type_item(content))
625 |                 computer_call["call_id"] = call_id
626 |                 computer_call["status"] = "completed"
627 |                 response_items.append(computer_call)
628 |         
629 |         elif action.startswith("scroll"):
630 |             coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action)
631 |             direction_match = re.search(r"direction='([^']+)'", action)
632 |             if coord_match and direction_match:
633 |                 x, y = int(coord_match.group(1)), int(coord_match.group(2))
634 |                 direction = direction_match.group(1)
635 |                 actual_x = int((x / 999.0) * image_width)
636 |                 actual_y = int((y / 999.0) * image_height)
637 |                 # Convert direction to scroll amounts
638 |                 scroll_x, scroll_y = 0, 0
639 |                 if direction == "up":
640 |                     scroll_y = -5
641 |                 elif direction == "down":
642 |                     scroll_y = 5
643 |                 elif direction == "left":
644 |                     scroll_x = -5
645 |                 elif direction == "right":
646 |                     scroll_x = 5
647 |                 computer_call = model_dump(make_scroll_item(actual_x, actual_y, scroll_x, scroll_y))
648 |                 computer_call["call_id"] = call_id
649 |                 computer_call["status"] = "completed"
650 |                 response_items.append(computer_call)
651 |         
652 |         elif action == "WAIT()":
653 |             computer_call = model_dump(make_wait_item())
654 |             computer_call["call_id"] = call_id
655 |             computer_call["status"] = "completed"
656 |             response_items.append(computer_call)
657 |     
658 |     return response_items
659 | 
660 | @register_agent(models=r"(?i).*GLM-4\.5V.*")
661 | class Glm4vConfig(AsyncAgentConfig):
662 |     """GLM-4.5V agent configuration using liteLLM."""
663 | 
664 |     async def predict_step(
665 |         self,
666 |         messages: List[Dict[str, Any]],
667 |         model: str,
668 |         tools: Optional[List[Dict[str, Any]]] = None,
669 |         max_retries: Optional[int] = None,
670 |         stream: bool = False,
671 |         computer_handler=None,
672 |         use_prompt_caching: Optional[bool] = False,
673 |         _on_api_start=None,
674 |         _on_api_end=None,
675 |         _on_usage=None,
676 |         _on_screenshot=None,
677 |         **kwargs
678 |     ) -> Dict[str, Any]:
679 |         """
680 |         Predict the next step using GLM-4.5V model.
681 |         
682 |         Args:
683 |             messages: Input messages following Responses format
684 |             model: Model name to use
685 |             tools: Optional list of tool schemas
686 |             max_retries: Maximum number of retries for API calls
687 |             stream: Whether to stream the response
688 |             computer_handler: Computer handler for taking screenshots
689 |             use_prompt_caching: Whether to use prompt caching
690 |             _on_api_start: Callback for API start
691 |             _on_api_end: Callback for API end
692 |             _on_usage: Callback for usage tracking
693 |             _on_screenshot: Callback for screenshot events
694 |             
695 |         Returns:
696 |             Dict with "output" and "usage" keys
697 |         """
698 |         # Get the user instruction from the last user message
699 |         user_instruction = ""
700 |         for message in reversed(messages):
701 |             if isinstance(message, dict) and message.get("role") == "user":
702 |                 content = message.get("content", "")
703 |                 if isinstance(content, str):
704 |                     user_instruction = content
705 |                 elif isinstance(content, list):
706 |                     for item in content:
707 |                         if isinstance(item, dict) and item.get("type") == "text":
708 |                             user_instruction = item.get("text", "")
709 |                             break
710 |                 break
711 |         
712 |         # Get the last image for processing
713 |         last_image_b64 = get_last_image_from_messages(messages)
714 |         if not last_image_b64 and computer_handler:
715 |             # Take a screenshot if no image available
716 |             screenshot_b64 = await computer_handler.screenshot()
717 |             if screenshot_b64:
718 |                 last_image_b64 = screenshot_b64
719 |                 if _on_screenshot:
720 |                     await _on_screenshot(screenshot_b64)
721 |         
722 |         if not last_image_b64:
723 |             raise ValueError("No image available for GLM-4.5V processing")
724 |         
725 |         # Convert responses items to GLM-4.5V PC prompt format with historical actions
726 |         prompt_content = convert_responses_items_to_glm45v_pc_prompt(
727 |             messages=messages,
728 |             task=user_instruction,
729 |             memory="[]"  # Initialize with empty memory for now
730 |         )
731 |         
732 |         # Add the current screenshot to the end
733 |         prompt_content.append({
734 |             "type": "image_url",
735 |             "image_url": {"url": f"data:image/png;base64,{last_image_b64}"}
736 |         })
737 |         
738 |         # Prepare messages for liteLLM
739 |         litellm_messages = [
740 |             {
741 |                 "role": "system",
742 |                 "content": "You are a helpful GUI agent assistant."
743 |             },
744 |             {
745 |                 "role": "user", 
746 |                 "content": prompt_content
747 |             }
748 |         ]
749 |         
750 |         # Prepare API call kwargs
751 |         api_kwargs = {
752 |             "model": model,
753 |             "messages": litellm_messages,
754 |             # "max_tokens": 2048,
755 |             # "temperature": 0.001,
756 |             # "extra_body": {
757 |             #     "skip_special_tokens": False,
758 |             # }
759 |         }
760 |         
761 |         # Add API callbacks
762 |         if _on_api_start:
763 |             await _on_api_start(api_kwargs)
764 |         
765 |         # Call liteLLM
766 |         response = await litellm.acompletion(**api_kwargs)
767 |         
768 |         if _on_api_end:
769 |             await _on_api_end(api_kwargs, response)
770 |         
771 |         # Get image dimensions for coordinate scaling
772 |         image_width, image_height = 1920, 1080  # Default dimensions
773 |         
774 |         # Try to get actual dimensions from the image
775 |         try:
776 |             image_data = base64.b64decode(last_image_b64)
777 |             image = Image.open(BytesIO(image_data))
778 |             image_width, image_height = image.size
779 |         except Exception:
780 |             pass  # Use default dimensions
781 |         
782 |         # Convert GLM completion response to responses items
783 |         response_items = convert_glm_completion_to_responses_items(response, image_width, image_height)
784 |         
785 |         # Extract usage information
786 |         response_usage = {
787 |             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
788 |             "response_cost": response._hidden_params.get("response_cost", 0.0),
789 |         }
790 |         if _on_usage:
791 |             await _on_usage(response_usage)
792 |         
793 |         # Create agent response
794 |         agent_response = {
795 |             "output": response_items,
796 |             "usage": response_usage
797 |         }
798 |         
799 |         return agent_response
800 | 
801 |     async def predict_click(
802 |         self,
803 |         model: str,
804 |         image_b64: str,
805 |         instruction: str,
806 |         **kwargs
807 |     ) -> Optional[Tuple[int, int]]:
808 |         """
809 |         Predict click coordinates using GLM-4.5V model.
810 |         
811 |         Args:
812 |             model: Model name to use
813 |             image_b64: Base64 encoded image
814 |             instruction: Instruction for where to click
815 |             
816 |         Returns:
817 |             Tuple with (x, y) coordinates or None
818 |         """
819 |         try:
820 |             # Create a simple click instruction prompt
821 |             click_prompt = f"""You are a GUI agent. Look at the screenshot and identify where to click for: {instruction}
822 | 
823 | Respond with a single click action in this format:
824 | left_click(start_box='[x,y]')
825 | 
826 | Where x,y are coordinates normalized to 0-999 range."""
827 |             
828 |             # Prepare messages for liteLLM
829 |             litellm_messages = [
830 |                 {
831 |                     "role": "system",
832 |                     "content": "You are a helpful GUI agent assistant."
833 |                 },
834 |                 {
835 |                     "role": "user",
836 |                     "content": [
837 |                         {"type": "text", "text": click_prompt},
838 |                         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
839 |                     ]
840 |                 }
841 |             ]
842 |             
843 |             # Prepare API call kwargs
844 |             api_kwargs = {
845 |                 "model": model,
846 |                 "messages": litellm_messages,
847 |                 "max_tokens": 2056,
848 |                 "temperature": 0.001,
849 |                 "extra_body": {
850 |                     "skip_special_tokens": False,
851 |                 }
852 |             }
853 |             
854 |             # Call liteLLM
855 |             response = await litellm.acompletion(**api_kwargs)
856 |             
857 |             # Extract response content
858 |             response_content = response.choices[0].message.content.strip()
859 |             print(response)
860 |             
861 |             # Parse response for click coordinates
862 |             # Look for coordinates in the response, handling special tokens
863 |             coord_pattern = r"<\|begin_of_box\|>.*?left_click\(start_box='?\[(\d+),(\d+)\]'?\).*?<\|end_of_box\|>"
864 |             match = re.search(coord_pattern, response_content)
865 |             
866 |             if not match:
867 |                 # Fallback: look for coordinates without special tokens
868 |                 coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
869 |                 match = re.search(coord_pattern, response_content)
870 | 
871 |             if match:
872 |                 x, y = int(match.group(1)), int(match.group(2))
873 |                 
874 |                 # Get actual image dimensions for scaling
875 |                 try:
876 |                     image_data = base64.b64decode(image_b64)
877 |                     image = Image.open(BytesIO(image_data))
878 |                     image_width, image_height = image.size
879 |                 except Exception:
880 |                     # Use default dimensions
881 |                     image_width, image_height = 1920, 1080
882 |                 
883 |                 # Convert from 0-999 normalized coordinates to actual pixel coordinates
884 |                 actual_x = int((x / 999.0) * image_width)
885 |                 actual_y = int((y / 999.0) * image_height)
886 |                 
887 |                 return (actual_x, actual_y)
888 |             
889 |             return None
890 |             
891 |         except Exception as e:
892 |             # Log error and return None
893 |             print(f"Error in predict_click: {e}")
894 |             return None
895 | 
896 |     def get_capabilities(self) -> List[AgentCapability]:
897 |         """
898 |         Get list of capabilities supported by this agent config.
899 |         
900 |         Returns:
901 |             List of capability strings
902 |         """
903 |         return ["step", "click"]
904 | 
```
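
For reference, here is a minimal, self-contained sketch of the click-parsing and coordinate-scaling logic used by `predict_click` above. The `parse_glm_click` helper and the sample response string are illustrative only and not part of the repository; they simply mirror the regexes and the 0-999 normalization shown in the file.

```python
# Illustrative sketch (not repository code): parse a GLM-4.5V click response
# and rescale its 0-999 normalized coordinates to pixel space.
import re
from typing import Optional, Tuple


def parse_glm_click(response_text: str, image_width: int, image_height: int) -> Optional[Tuple[int, int]]:
    # Prefer coordinates wrapped in GLM's special box tokens, then fall back
    # to the bare left_click(...) form, as predict_click does above.
    boxed = r"<\|begin_of_box\|>.*?left_click\(start_box='?\[(\d+),(\d+)\]'?\).*?<\|end_of_box\|>"
    plain = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
    match = re.search(boxed, response_text) or re.search(plain, response_text)
    if not match:
        return None
    x, y = int(match.group(1)), int(match.group(2))
    # Scale from the 0-999 normalized range to actual pixel coordinates.
    return int((x / 999.0) * image_width), int((y / 999.0) * image_height)


# Example: a boxed response against a 1920x1080 screenshot.
sample = "<|begin_of_box|>left_click(start_box='[500,500]')<|end_of_box|>"
print(parse_glm_click(sample, 1920, 1080))  # (960, 540)
```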

--------------------------------------------------------------------------------
/libs/python/agent/agent/ui/gradio/ui_components.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | UI Components for the Gradio interface
  3 | """
  4 | 
  5 | import os
  6 | import asyncio
  7 | import logging
  8 | import json
  9 | import platform
 10 | from pathlib import Path
 11 | from typing import Dict, List, Optional, Any, cast
 12 | import gradio as gr
 13 | from gradio.components.chatbot import MetadataDict
 14 | 
 15 | from .app import (
 16 |     load_settings, save_settings, create_agent, get_model_string, 
 17 |     get_ollama_models, global_agent, global_computer
 18 | )
 19 | 
 20 | # Global messages array to maintain conversation history
 21 | global_messages = []
 22 | 
 23 | 
 24 | def create_gradio_ui() -> gr.Blocks:
 25 |     """Create a Gradio UI for the Computer-Use Agent."""
 26 |     
 27 |     # Load settings
 28 |     saved_settings = load_settings()
 29 |     
 30 |     # Check for API keys
 31 |     openai_api_key = os.environ.get("OPENAI_API_KEY", "")
 32 |     anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
 33 |     cua_api_key = os.environ.get("CUA_API_KEY", "")
 34 |     
 35 |     # Model choices
 36 |     openai_models = ["OpenAI: Computer-Use Preview"]
 37 |     anthropic_models = [
 38 |         "Anthropic: Claude 4 Opus (20250514)",
 39 |         "Anthropic: Claude 4 Sonnet (20250514)",
 40 |         "Anthropic: Claude 3.7 Sonnet (20250219)",
 41 |         "Anthropic: Claude 3.5 Sonnet (20241022)",
 42 |     ]
 43 |     omni_models = [
 44 |         "OMNI: OpenAI GPT-4o",
 45 |         "OMNI: OpenAI GPT-4o mini",
 46 |         "OMNI: Claude 3.7 Sonnet (20250219)", 
 47 |         "OMNI: Claude 3.5 Sonnet (20241022)"
 48 |     ]
 49 |     
 50 |     # Check if API keys are available
 51 |     has_openai_key = bool(openai_api_key)
 52 |     has_anthropic_key = bool(anthropic_api_key)
 53 |     has_cua_key = bool(cua_api_key)
 54 | 
 55 |     # Get Ollama models for OMNI
 56 |     ollama_models = get_ollama_models()
 57 |     if ollama_models:
 58 |         omni_models += ollama_models
 59 | 
 60 |     # Detect platform
 61 |     is_mac = platform.system().lower() == "darwin"
 62 |     
 63 |     # Format model choices
 64 |     provider_to_models = {
 65 |         "OPENAI": openai_models,
 66 |         "ANTHROPIC": anthropic_models,
 67 |         "OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
 68 |         "UITARS": ([
 69 |             "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
 70 |         ] if is_mac else []) + ["Custom model (OpenAI compatible API)"],
 71 |     }
 72 | 
 73 |     # Apply saved settings
 74 |     initial_loop = saved_settings.get("agent_loop", "OMNI")
 75 |     available_models_for_loop = provider_to_models.get(initial_loop, [])
 76 |     saved_model_choice = saved_settings.get("model_choice")
 77 |     if saved_model_choice and saved_model_choice in available_models_for_loop:
 78 |         initial_model = saved_model_choice
 79 |     else:
 80 |         if initial_loop == "OPENAI":
 81 |             initial_model = openai_models[0] if openai_models else "No models available"
 82 |         elif initial_loop == "ANTHROPIC":
 83 |             initial_model = anthropic_models[0] if anthropic_models else "No models available"
 84 |         else:  # OMNI
 85 |             initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
 86 | 
 87 |     initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
 88 |     initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
 89 |     initial_save_trajectory = saved_settings.get("save_trajectory", True)
 90 |     initial_recent_images = saved_settings.get("recent_images", 3)
 91 | 
 92 |     # Example prompts
 93 |     example_messages = [
 94 |         "Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
 95 |         "Open a PDF in Preview, add annotations, and save it as a compressed version",
 96 |         "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
 97 |         "Configure SSH keys and set up a connection to a remote server",
 98 |     ]
 99 |     
100 |     def generate_python_code(agent_loop_choice, model_name, tasks, recent_images=3, save_trajectory=True, computer_os="linux", computer_provider="cloud", container_name="", cua_cloud_api_key="", max_budget=None):
101 |         """Generate Python code for the current configuration and tasks."""
102 |         tasks_str = ""
103 |         for task in tasks:
104 |             if task and task.strip():
105 |                 tasks_str += f'            "{task}",\n'
106 |         
107 |         model_string = get_model_string(model_name, agent_loop_choice)
108 |         
109 |         computer_args = []
110 |         if computer_os != "macos":
111 |             computer_args.append(f'os_type="{computer_os}"')
112 |         if computer_provider != "lume":
113 |             computer_args.append(f'provider_type="{computer_provider}"')
114 |         if container_name:
115 |             computer_args.append(f'name="{container_name}"')
116 |         if cua_cloud_api_key:
117 |             computer_args.append(f'api_key="{cua_cloud_api_key}"')
118 |         
119 |         computer_args_str = ", ".join(computer_args)
120 |         if computer_args_str:
121 |             computer_args_str = f"({computer_args_str})"
122 |         else:
123 |             computer_args_str = "()"
124 |         
125 |         code = f'''import asyncio
126 | from computer import Computer
127 | from agent import ComputerAgent
128 | 
129 | async def main():
130 |     async with Computer{computer_args_str} as computer:
131 |         agent = ComputerAgent(
132 |             model="{model_string}",
133 |             tools=[computer],
134 |             only_n_most_recent_images={recent_images},'''
135 |         
136 |         if save_trajectory:
137 |             code += '''
138 |             trajectory_dir="trajectories",'''
139 |         
140 |         if max_budget:
141 |             code += f'''
142 |             max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},'''
143 |             
144 |         code += '''
145 |         )
146 |         '''
147 |         
148 |         if tasks_str:
149 |             code += f'''
150 |         # Prompts for the computer-use agent
151 |         tasks = [
152 | {tasks_str.rstrip()}
153 |         ]
154 | 
155 |         for task in tasks:
156 |             print(f"Executing task: {{task}}")
157 |             messages = [{{"role": "user", "content": task}}]
158 |             async for result in agent.run(messages):
159 |                 for item in result["output"]:
160 |                     if item["type"] == "message":
161 |                         print(item["content"][0]["text"])'''
162 |         else:
163 |             code += f'''
164 |         # Execute a single task
165 |         task = "Search for information about CUA on GitHub"
166 |         print(f"Executing task: {{task}}")
167 |         messages = [{{"role": "user", "content": task}}]
168 |         async for result in agent.run(messages):
169 |             for item in result["output"]:
170 |                 if item["type"] == "message":
171 |                     print(item["content"][0]["text"])'''
172 | 
173 |         code += '''
174 | 
175 | if __name__ == "__main__":
176 |     asyncio.run(main())'''
177 |         
178 |         return code
179 | 
180 |     # Create the Gradio interface
181 |     with gr.Blocks(title="Computer-Use Agent") as demo:
182 |         with gr.Row():
183 |             # Left column for settings
184 |             with gr.Column(scale=1):
185 |                 # Logo
186 |                 gr.HTML(
187 |                     """
188 |                     <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
189 |                         <img alt="CUA Logo" style="width: 80px;"
190 |                              src="https://github.com/trycua/cua/blob/main/img/logo_white.png?raw=true" />
191 |                     </div>
192 |                     """
193 |                 )
194 | 
195 |                 # Python code accordion
196 |                 with gr.Accordion("Python Code", open=False):
197 |                     code_display = gr.Code(
198 |                         language="python",
199 |                         value=generate_python_code(initial_loop, "gpt-4o", []),
200 |                         interactive=False,
201 |                     )
202 |                     
203 |                 with gr.Accordion("Computer Configuration", open=True):
204 |                     is_windows = platform.system().lower() == "windows"
205 |                     is_mac = platform.system().lower() == "darwin"
206 |                     
207 |                     providers = ["cloud", "localhost", "docker"]
208 |                     if is_mac:
209 |                         providers += ["lume"]
210 |                     if is_windows:
211 |                         providers += ["winsandbox"]
212 | 
213 |                     # Remove unavailable options
214 |                     # MacOS is unavailable if Lume is not available
215 |                     # Windows is unavailable if Winsandbox is not available
216 |                     # Linux is always available
217 |                     # This should be removed once we support macOS and Windows on the cloud provider
218 |                     computer_choices = ["macos", "linux", "windows"]
219 |                     if not is_mac or "lume" not in providers:
220 |                         computer_choices.remove("macos")
221 |                     if not is_windows or "winsandbox" not in providers:
222 |                         computer_choices.remove("windows")
223 | 
224 |                     computer_os = gr.Radio(
225 |                         choices=computer_choices,
226 |                         label="Operating System",
227 |                         value=computer_choices[0],
228 |                         info="Select the operating system for the computer",
229 |                     )
230 |                     
231 |                     computer_provider = gr.Radio(
232 |                         choices=providers,
233 |                         label="Provider",
234 |                         value="lume" if is_mac else "cloud",
235 |                         info="Select the computer provider",
236 |                     )
237 |                     
238 |                     container_name = gr.Textbox(
239 |                         label="Container Name",
240 |                         placeholder="Enter container name (optional)",
241 |                         value=os.environ.get("CUA_CONTAINER_NAME", ""),
242 |                         info="Optional name for the container",
243 |                     )
244 |                     
245 |                     cua_cloud_api_key = gr.Textbox(
246 |                         label="CUA Cloud API Key",
247 |                         placeholder="Enter your CUA Cloud API key",
248 |                         value=os.environ.get("CUA_API_KEY", ""),
249 |                         type="password",
250 |                         info="Required for cloud provider",
251 |                         visible=(not has_cua_key)
252 |                     )
253 |                     
254 |                 with gr.Accordion("Agent Configuration", open=True):
255 |                     agent_loop = gr.Dropdown(
256 |                         choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
257 |                         label="Agent Loop",
258 |                         value=initial_loop,
259 |                         info="Select the agent loop provider",
260 |                     )
261 | 
262 |                     # Model selection dropdowns
263 |                     with gr.Group() as model_selection_group:
264 |                         openai_model_choice = gr.Dropdown(
265 |                             choices=openai_models,
266 |                             label="OpenAI Model",
267 |                             value=openai_models[0] if openai_models else "No models available",
268 |                             info="Select OpenAI model",
269 |                             interactive=True,
270 |                             visible=(initial_loop == "OPENAI")
271 |                         )
272 |                         
273 |                         anthropic_model_choice = gr.Dropdown(
274 |                             choices=anthropic_models,
275 |                             label="Anthropic Model",
276 |                             value=anthropic_models[0] if anthropic_models else "No models available",
277 |                             info="Select Anthropic model",
278 |                             interactive=True,
279 |                             visible=(initial_loop == "ANTHROPIC")
280 |                         )
281 |                         
282 |                         omni_model_choice = gr.Dropdown(
283 |                             choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
284 |                             label="OMNI Model",
285 |                             value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
286 |                             info="Select OMNI model or choose a custom model option",
287 |                             interactive=True,
288 |                             visible=(initial_loop == "OMNI")
289 |                         )
290 |                         
291 |                         uitars_model_choice = gr.Dropdown(
292 |                             choices=provider_to_models.get("UITARS", ["No models available"]),
293 |                             label="UITARS Model",
294 |                             value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
295 |                             info="Select UITARS model",
296 |                             interactive=True,
297 |                             visible=(initial_loop == "UITARS")
298 |                         )
299 |                         
300 |                         model_choice = gr.Textbox(visible=False)
301 | 
302 |                     # API key inputs
303 |                     with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
304 |                         openai_api_key_input = gr.Textbox(
305 |                             label="OpenAI API Key",
306 |                             placeholder="Enter your OpenAI API key",
307 |                             value=os.environ.get("OPENAI_API_KEY", ""),
308 |                             interactive=True,
309 |                             type="password",
310 |                             info="Required for OpenAI models"
311 |                         )
312 |                     
313 |                     with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
314 |                         anthropic_api_key_input = gr.Textbox(
315 |                             label="Anthropic API Key",
316 |                             placeholder="Enter your Anthropic API key",
317 |                             value=os.environ.get("ANTHROPIC_API_KEY", ""),
318 |                             interactive=True,
319 |                             type="password",
320 |                             info="Required for Anthropic models"
321 |                         )
322 |                         
323 |                     # API key handlers
324 |                     def set_openai_api_key(key):
325 |                         if key and key.strip():
326 |                             os.environ["OPENAI_API_KEY"] = key.strip()
327 |                             print("DEBUG - Set OpenAI API key environment variable")
328 |                         return key
329 |                     
330 |                     def set_anthropic_api_key(key):
331 |                         if key and key.strip():
332 |                             os.environ["ANTHROPIC_API_KEY"] = key.strip()
333 |                             print("DEBUG - Set Anthropic API key environment variable")
334 |                         return key
335 |                     
336 |                     openai_api_key_input.change(
337 |                         fn=set_openai_api_key,
338 |                         inputs=[openai_api_key_input],
339 |                         outputs=[openai_api_key_input],
340 |                         queue=False
341 |                     )
342 |                     
343 |                     anthropic_api_key_input.change(
344 |                         fn=set_anthropic_api_key,
345 |                         inputs=[anthropic_api_key_input],
346 |                         outputs=[anthropic_api_key_input],
347 |                         queue=False
348 |                     )
349 | 
350 |                     # UI update function
351 |                     def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
352 |                         loop = loop or agent_loop.value
353 |                         
354 |                         model_value = None
355 |                         if loop == "OPENAI" and openai_model:
356 |                             model_value = openai_model
357 |                         elif loop == "ANTHROPIC" and anthropic_model:
358 |                             model_value = anthropic_model
359 |                         elif loop == "OMNI" and omni_model:
360 |                             model_value = omni_model
361 |                         elif loop == "UITARS" and uitars_model:
362 |                             model_value = uitars_model
363 |                         
364 |                         openai_visible = (loop == "OPENAI")
365 |                         anthropic_visible = (loop == "ANTHROPIC")
366 |                         omni_visible = (loop == "OMNI")
367 |                         uitars_visible = (loop == "UITARS")
368 |                         
369 |                         show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
370 |                         show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))
371 |                         
372 |                         is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
373 |                         is_custom_ollama = model_value == "Custom model (ollama)"
374 |                         is_any_custom = is_custom_openai_api or is_custom_ollama
375 |                         
376 |                         model_choice_value = model_value if model_value else ""
377 |                         
378 |                         return [
379 |                             gr.update(visible=openai_visible),
380 |                             gr.update(visible=anthropic_visible),
381 |                             gr.update(visible=omni_visible),
382 |                             gr.update(visible=uitars_visible),
383 |                             gr.update(visible=show_openai_key),
384 |                             gr.update(visible=show_anthropic_key),
385 |                             gr.update(visible=is_any_custom),
386 |                             gr.update(visible=is_custom_openai_api),
387 |                             gr.update(visible=is_custom_openai_api),
388 |                             gr.update(value=model_choice_value)
389 |                         ]
390 |                         
391 |                     # Custom model inputs
392 |                     custom_model = gr.Textbox(
393 |                         label="Custom Model Name",
394 |                         placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
395 |                         value=initial_custom_model,
396 |                         visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
397 |                         interactive=True,
398 |                     )
399 | 
400 |                     provider_base_url = gr.Textbox(
401 |                         label="Provider Base URL",
402 |                         placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
403 |                         value=initial_provider_base_url,
404 |                         visible=(initial_model == "Custom model (OpenAI compatible API)"),
405 |                         interactive=True,
406 |                     )
407 | 
408 |                     provider_api_key = gr.Textbox(
409 |                         label="Provider API Key",
410 |                         placeholder="Enter provider API key (if required)",
411 |                         value="",
412 |                         visible=(initial_model == "Custom model (OpenAI compatible API)"),
413 |                         interactive=True,
414 |                         type="password",
415 |                     )
416 |                     
417 |                     # Provider visibility update function
418 |                     def update_provider_visibility(provider):
419 |                         """Update visibility of container name and API key based on selected provider."""
420 |                         is_localhost = provider == "localhost"
421 |                         return [
422 |                             gr.update(visible=not is_localhost),  # container_name
423 |                             gr.update(visible=not is_localhost and not has_cua_key)  # cua_cloud_api_key
424 |                         ]
425 |                     
426 |                     # Connect provider change event
427 |                     computer_provider.change(
428 |                         fn=update_provider_visibility,
429 |                         inputs=[computer_provider],
430 |                         outputs=[container_name, cua_cloud_api_key],
431 |                         queue=False
432 |                     )
433 |                     
434 |                     # Connect UI update events
435 |                     for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
436 |                         dropdown.change(
437 |                             fn=update_ui,
438 |                             inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
439 |                             outputs=[
440 |                                 openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice, 
441 |                                 openai_key_group, anthropic_key_group,
442 |                                 custom_model, provider_base_url, provider_api_key,
443 |                                 model_choice
444 |                             ],
445 |                             queue=False
446 |                         )
447 | 
448 |                     save_trajectory = gr.Checkbox(
449 |                         label="Save Trajectory",
450 |                         value=initial_save_trajectory,
451 |                         info="Save the agent's trajectory for debugging",
452 |                         interactive=True,
453 |                     )
454 | 
455 |                     recent_images = gr.Slider(
456 |                         label="Recent Images",
457 |                         minimum=1,
458 |                         maximum=10,
459 |                         value=initial_recent_images,
460 |                         step=1,
461 |                         info="Number of recent images to keep in context",
462 |                         interactive=True,
463 |                     )
464 |                     
465 |                     max_budget = gr.Number(
466 |                         label="Max Budget ($)",
467 |                         value=lambda: None,
468 |                         minimum=-1,
469 |                         maximum=100.0,
470 |                         step=0.1,
471 |                         info="Optional budget limit for trajectory (0 = no limit)",
472 |                         interactive=True,
473 |                     )
474 | 
475 |             # Right column for chat interface
476 |             with gr.Column(scale=2):
477 |                 gr.Markdown(
478 |                     "Ask me to perform tasks in a virtual environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
479 |                 )
480 | 
481 |                 chatbot_history = gr.Chatbot(type="messages")
482 |                 msg = gr.Textbox(
483 |                     placeholder="Ask me to perform tasks in a virtual environment"
484 |                 )
485 |                 clear = gr.Button("Clear")
486 |                 cancel_button = gr.Button("Cancel", variant="stop")
487 | 
488 |                 # Add examples
489 |                 example_group = gr.Examples(examples=example_messages, inputs=msg)
490 | 
491 |                 # Chat submission function
492 |                 def chat_submit(message, history):
493 |                     history.append(gr.ChatMessage(role="user", content=message))
494 |                     return "", history
495 | 
496 |                 # Cancel function
497 |                 async def cancel_agent_task(history):
498 |                     global global_agent
499 |                     if global_agent:
500 |                         print("DEBUG - Cancelling agent task")
501 |                         history.append(gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "❌ Cancelled"}))
502 |                     else:
503 |                         history.append(gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": "ℹ️ Info"}))
504 |                     return history
505 |                 
506 |                 # Process response function
507 |                 async def process_response(
508 |                     history,
509 |                     openai_model_value,
510 |                     anthropic_model_value,
511 |                     omni_model_value,
512 |                     uitars_model_value,
513 |                     custom_model_value,
514 |                     agent_loop_choice,
515 |                     save_traj,
516 |                     recent_imgs,
517 |                     custom_url_value=None,
518 |                     custom_api_key=None,
519 |                     openai_key_input=None,
520 |                     anthropic_key_input=None,
521 |                     computer_os="linux",
522 |                     computer_provider="cloud",
523 |                     container_name="",
524 |                     cua_cloud_api_key="",
525 |                     max_budget_value=None,
526 |                 ):
527 |                     if not history:
528 |                         yield history
529 |                         return
530 | 
531 |                     # Get the last user message
532 |                     last_user_message = history[-1]["content"]
533 | 
534 |                     # Get the appropriate model value based on the agent loop
535 |                     if agent_loop_choice == "OPENAI":
536 |                         model_choice_value = openai_model_value
537 |                     elif agent_loop_choice == "ANTHROPIC":
538 |                         model_choice_value = anthropic_model_value
539 |                     elif agent_loop_choice == "OMNI":
540 |                         model_choice_value = omni_model_value
541 |                     elif agent_loop_choice == "UITARS":
542 |                         model_choice_value = uitars_model_value
543 |                     else:
544 |                         model_choice_value = "No models available"
545 |                     
546 |                     # Determine if this is a custom model selection
547 |                     is_custom_model_selected = model_choice_value in ["Custom model (OpenAI compatible API)", "Custom model (ollama)"]
548 |                     
549 |                     # Determine the model name string to analyze
550 |                     if is_custom_model_selected:
551 |                         model_string_to_analyze = custom_model_value
552 |                     else:
553 |                         model_string_to_analyze = model_choice_value
554 | 
555 |                     try:
556 |                         # Get the model string
557 |                         model_string = get_model_string(model_string_to_analyze, agent_loop_choice)
558 | 
559 |                         # Set API keys if provided
560 |                         if openai_key_input:
561 |                             os.environ["OPENAI_API_KEY"] = openai_key_input
562 |                         if anthropic_key_input:
563 |                             os.environ["ANTHROPIC_API_KEY"] = anthropic_key_input
564 |                         if cua_cloud_api_key:
565 |                             os.environ["CUA_API_KEY"] = cua_cloud_api_key
566 | 
567 |                         # Save settings
568 |                         current_settings = {
569 |                             "agent_loop": agent_loop_choice,
570 |                             "model_choice": model_choice_value,
571 |                             "custom_model": custom_model_value,
572 |                             "provider_base_url": custom_url_value,
573 |                             "save_trajectory": save_traj,
574 |                             "recent_images": recent_imgs,
575 |                             "computer_os": computer_os,
576 |                             "computer_provider": computer_provider,
577 |                             "container_name": container_name,
578 |                         }
579 |                         save_settings(current_settings)
580 | 
581 |                         global global_agent  # create agent in the module-level binding so the Cancel handler can see it
582 |                         global_agent = create_agent(
583 |                             model_string=model_string,
584 |                             save_trajectory=save_traj,
585 |                             only_n_most_recent_images=recent_imgs,
586 |                             custom_model_name=custom_model_value if is_custom_model_selected else None,
587 |                             computer_os=computer_os,
588 |                             computer_provider=computer_provider,
589 |                             computer_name=container_name,
590 |                             computer_api_key=cua_cloud_api_key,
591 |                             verbosity=logging.DEBUG,
592 |                             max_trajectory_budget=max_budget_value if max_budget_value and max_budget_value > 0 else None,
593 |                         )
594 | 
595 |                         if global_agent is None:
596 |                             history.append(
597 |                                 gr.ChatMessage(
598 |                                     role="assistant",
599 |                                     content="Failed to create agent. Check API keys and configuration.",
600 |                                 )
601 |                             )
602 |                             yield history
603 |                             return
604 | 
605 |                         # Add user message to global history
606 |                         global global_messages
607 |                         global_messages.append({"role": "user", "content": last_user_message})
608 |                         
609 |                         # Stream responses from the agent
610 |                         async for result in global_agent.run(global_messages):
611 |                             global_messages += result.get("output", [])
612 |                             # print(f"DEBUG - Agent response ------- START")
613 |                             # from pprint import pprint
614 |                             # pprint(result)
615 |                             # print(f"DEBUG - Agent response ------- END")
616 |                             
617 |                             # Process the result output
618 |                             for item in result.get("output", []):
619 |                                 if item.get("type") == "message":
620 |                                     content = item.get("content", [])
621 |                                     for content_part in content:
622 |                                         if content_part.get("text"):
623 |                                             history.append(gr.ChatMessage(
624 |                                                 role=item.get("role", "assistant"),
625 |                                                 content=content_part.get("text", ""),
626 |                                                 metadata=content_part.get("metadata", {})
627 |                                             ))
628 |                                 elif item.get("type") == "computer_call":
629 |                                     action = item.get("action", {})
630 |                                     action_type = action.get("type", "")
631 |                                     if action_type:
632 |                                         action_title = f"🛠️ Performing {action_type}"
633 |                                         if action.get("x") is not None and action.get("y") is not None:
634 |                                             action_title += f" at ({action['x']}, {action['y']})"
635 |                                         history.append(gr.ChatMessage(
636 |                                             role="assistant",
637 |                                             content=f"```json\n{json.dumps(action)}\n```",
638 |                                             metadata={"title": action_title}
639 |                                         ))
640 |                                 elif item.get("type") == "function_call":
641 |                                     function_name = item.get("name", "")
642 |                                     arguments = item.get("arguments", "{}")
643 |                                     history.append(gr.ChatMessage(
644 |                                         role="assistant",
645 |                                         content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
646 |                                         metadata={"title": f"Function Call: {function_name}"}
647 |                                     ))
648 |                                 elif item.get("type") == "function_call_output":
649 |                                     output = item.get("output", "")
650 |                                     history.append(gr.ChatMessage(
651 |                                         role="assistant",
652 |                                         content=f"📤 Function output:\n```\n{output}\n```",
653 |                                         metadata={"title": "Function Output"}
654 |                                     ))
655 |                                 elif item.get("type") == "computer_call_output":
656 |                                     output = item.get("output", {}).get("image_url", "")
657 |                                     image_markdown = f"![Computer output]({output})"
658 |                                     history.append(gr.ChatMessage(
659 |                                         role="assistant",
660 |                                         content=image_markdown,
661 |                                         metadata={"title": "🖥️ Computer Output"}
662 |                                     ))
663 |                             
664 |                             yield history
665 |                             
666 |                     except Exception as e:
667 |                         import traceback
668 |                         traceback.print_exc()
669 |                         history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
670 |                         yield history
671 |                         
672 |                 # Connect the submit button
673 |                 submit_event = msg.submit(
674 |                     fn=chat_submit,
675 |                     inputs=[msg, chatbot_history],
676 |                     outputs=[msg, chatbot_history],
677 |                     queue=False,
678 |                 ).then(
679 |                     fn=process_response,
680 |                     inputs=[
681 |                         chatbot_history,
682 |                         openai_model_choice,
683 |                         anthropic_model_choice,
684 |                         omni_model_choice,
685 |                         uitars_model_choice,
686 |                         custom_model,
687 |                         agent_loop,
688 |                         save_trajectory,
689 |                         recent_images,
690 |                         provider_base_url,
691 |                         provider_api_key,
692 |                         openai_api_key_input,
693 |                         anthropic_api_key_input,
694 |                         computer_os,
695 |                         computer_provider,
696 |                         container_name,
697 |                         cua_cloud_api_key,
698 |                         max_budget,
699 |                     ],
700 |                     outputs=[chatbot_history],
701 |                     queue=True,
702 |                 )
703 | 
704 |                 # Clear button functionality
705 |                 def clear_chat():
706 |                     global global_messages
707 |                     global_messages.clear()
708 |                     return None
709 |                 
710 |                 clear.click(clear_chat, None, chatbot_history, queue=False)
711 |                 
712 |                 # Connect cancel button
713 |                 cancel_button.click(
714 |                     cancel_agent_task,
715 |                     [chatbot_history],
716 |                     [chatbot_history],
717 |                     queue=False
718 |                 )
719 | 
720 |                 # Code display update function
721 |                 def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget_val):
722 |                     messages = []
723 |                     if chat_history:
724 |                         for msg in chat_history:
725 |                             if isinstance(msg, dict) and msg.get("role") == "user":
726 |                                 messages.append(msg.get("content", ""))
727 |                     
728 |                     return generate_python_code(
729 |                         agent_loop, 
730 |                         model_choice_val or custom_model_val or "gpt-4o", 
731 |                         messages, 
732 |                         recent_images_val,
733 |                         save_trajectory_val,
734 |                         computer_os,
735 |                         computer_provider,
736 |                         container_name,
737 |                         cua_cloud_api_key,
738 |                         max_budget_val
739 |                     )
740 |                 
741 |                 # Update code display when configuration changes
742 |                 for component in [agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget]:
743 |                     component.change(
744 |                         update_code_display,
745 |                         inputs=[agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget],
746 |                         outputs=[code_display]
747 |                     )
748 | 
749 |     return demo
750 | 
```
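
`create_gradio_ui()` returns a standard `gr.Blocks` app, so it can be served like any other Gradio demo. Below is a hypothetical launcher sketch: the import path follows the file location shown above, but the actual entry point in the repository may differ, and the host/port values are placeholders.

```python
# Hypothetical launcher sketch (not repository code).
from agent.ui.gradio.ui_components import create_gradio_ui

if __name__ == "__main__":
    demo = create_gradio_ui()
    # queue() is needed because process_response streams results via an async generator.
    demo.queue()
    demo.launch(server_name="0.0.0.0", server_port=7860)
```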