#
tokens: 40112/50000 2/501 files (page 19/21)
lines: on (toggle) GitHub
raw markdown copy reset
This is page 19 of 21. To view the full context, open http://codebase.md/trycua/cua?lines=true&page={x}, replacing {x} with a page number from 1 to 21.

# Directory Structure

```
├── .all-contributorsrc
├── .cursorignore
├── .devcontainer
│   ├── devcontainer.json
│   ├── post-install.sh
│   └── README.md
├── .dockerignore
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── ci-lume.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-pylume.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       └── test-validation-script.yml
├── .gitignore
├── .vscode
│   ├── docs.code-workspace
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   └── py.code-workspace
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── composite-agents.md
│   ├── cua-hackathon.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .gitignore
│   ├── .prettierrc
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   └── meta.json
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── computer-sdk
│   │       │   ├── cloud-vm-management.mdx
│   │       │   ├── commands.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── meta.json
│   │       │   └── sandboxed-python.mdx
│   │       ├── index.mdx
│   │       ├── libraries
│   │       │   ├── agent
│   │       │   │   └── index.mdx
│   │       │   ├── computer
│   │       │   │   └── index.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── core
│   │       │   │   └── index.mdx
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   └── som
│   │       │       ├── configuration.mdx
│   │       │       └── index.mdx
│   │       ├── meta.json
│   │       ├── quickstart-cli.mdx
│   │       ├── quickstart-devs.mdx
│   │       └── telemetry.mdx
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   └── llms.txt
│   │   │       └── route.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── iou.tsx
│   │   │   └── mermaid.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   └── mdx-components.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── cloud_api_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── .prettierrc
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── gemini.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   └── uitars.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── types.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer-server
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   └── test_connection.py
│   │   ├── core
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── mcp-server
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   └── server.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── scripts
│   │   │       ├── install_mcp_server.sh
│   │   │       └── start_mcp_server.sh
│   │   ├── pylume
│   │   │   ├── __init__.py
│   │   │   ├── pylume
│   │   │   │   ├── __init__.py
│   │   │   │   ├── client.py
│   │   │   │   ├── exceptions.py
│   │   │   │   ├── lume
│   │   │   │   ├── models.py
│   │   │   │   ├── pylume.py
│   │   │   │   └── server.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   └── som
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           └── test_omniparser.py
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── biome.json
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Dockerfile
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── pylume_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── pdm.lock
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── samples
│   └── community
│       ├── global-online
│       │   └── README.md
│       └── hack-the-north
│           └── README.md
├── scripts
│   ├── build-uv.sh
│   ├── build.ps1
│   ├── build.sh
│   ├── cleanup.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   └── run-docker-dev.sh
└── tests
    ├── pytest.ini
    ├── shell_cmd.py
    ├── test_files.py
    ├── test_shell_bash.py
    ├── test_telemetry.py
    ├── test_venv.py
    └── test_watchdog.py
```

# Files

--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/handlers/macos.py:
--------------------------------------------------------------------------------

```python
   1 | import pyautogui
   2 | pyautogui.FAILSAFE = False
   3 | from pynput.mouse import Button, Controller as MouseController
   4 | from pynput.keyboard import Key, Controller as KeyboardController
   5 | import time
   6 | import base64
   7 | from io import BytesIO
   8 | from typing import Optional, Dict, Any, List, Tuple
   9 | from ctypes import byref, c_void_p, POINTER
  10 | from AppKit import NSWorkspace  # type: ignore
  11 | import AppKit
  12 | from Quartz.CoreGraphics import *  # type: ignore
  13 | from Quartz.CoreGraphics import CGPoint, CGSize  # type: ignore
  14 | import Foundation
  15 | from ApplicationServices import (
  16 |     AXUIElementCreateSystemWide,  # type: ignore
  17 |     AXUIElementCreateApplication,  # type: ignore
  18 |     AXUIElementCopyAttributeValue,  # type: ignore
  19 |     AXUIElementCopyAttributeValues,  # type: ignore
  20 |     kAXFocusedWindowAttribute,  # type: ignore
  21 |     kAXWindowsAttribute,  # type: ignore
  22 |     kAXMainWindowAttribute,  # type: ignore
  23 |     kAXChildrenAttribute,  # type: ignore
  24 |     kAXRoleAttribute,  # type: ignore
  25 |     kAXTitleAttribute,  # type: ignore
  26 |     kAXValueAttribute,  # type: ignore
  27 |     kAXDescriptionAttribute,  # type: ignore
  28 |     kAXEnabledAttribute,  # type: ignore
  29 |     kAXPositionAttribute,  # type: ignore
  30 |     kAXSizeAttribute,  # type: ignore
  31 |     kAXErrorSuccess,  # type: ignore
  32 |     AXValueGetType,  # type: ignore
  33 |     kAXValueCGSizeType,  # type: ignore
  34 |     kAXValueCGPointType,  # type: ignore
  35 |     kAXValueCFRangeType,  # type: ignore
  36 |     AXUIElementGetTypeID,  # type: ignore
  37 |     AXValueGetValue,  # type: ignore
  38 |     kAXVisibleChildrenAttribute,  # type: ignore
  39 |     kAXRoleDescriptionAttribute,  # type: ignore
  40 |     kAXFocusedApplicationAttribute,  # type: ignore
  41 |     kAXFocusedUIElementAttribute,  # type: ignore
  42 |     kAXSelectedTextAttribute,  # type: ignore
  43 |     kAXSelectedTextRangeAttribute,  # type: ignore
  44 | )
  45 | import objc
  46 | import re
  47 | import json
  48 | import copy
  49 | import asyncio
  50 | from .base import BaseAccessibilityHandler, BaseAutomationHandler
  51 | import logging
  52 | 
  53 | logger = logging.getLogger(__name__)
  54 | 
  55 | # Constants for accessibility API
  56 | kAXErrorSuccess = 0
  57 | kAXRoleAttribute = "AXRole"
  58 | kAXTitleAttribute = "AXTitle"
  59 | kAXValueAttribute = "AXValue"
  60 | kAXWindowsAttribute = "AXWindows"
  61 | kAXFocusedAttribute = "AXFocused"
  62 | kAXPositionAttribute = "AXPosition"
  63 | kAXSizeAttribute = "AXSize"
  64 | kAXChildrenAttribute = "AXChildren"
  65 | kAXMenuBarAttribute = "AXMenuBar"
  66 | kAXMenuBarItemAttribute = "AXMenuBarItem"
  67 | 
  68 | # Constants for window properties
  69 | kCGWindowLayer = "kCGWindowLayer"  # Z-order information (lower values are higher in the stack)
  70 | kCGWindowAlpha = "kCGWindowAlpha"  # Window opacity
  71 | 
  72 | # Constants for application activation options
  73 | NSApplicationActivationOptions = {
  74 |     "regular": 0,  # Default activation
  75 |     "bringing_all_windows_forward": 1 << 0,  # NSApplicationActivateAllWindows
  76 |     "ignoring_other_apps": 1 << 1  # NSApplicationActivateIgnoringOtherApps
  77 | }
  78 | 
  79 | def CFAttributeToPyObject(attrValue):
  80 |     """Convert Core Foundation attribute values to Python objects.
  81 |     
  82 |     Args:
  83 |         attrValue: Core Foundation attribute value to convert
  84 |         
  85 |     Returns:
  86 |         Converted Python object or None if conversion fails
  87 |     """
  88 |     def list_helper(list_value):
  89 |         """Helper function to convert CF arrays to Python lists.
  90 |         
  91 |         Args:
  92 |             list_value: Core Foundation array to convert
  93 |             
  94 |         Returns:
  95 |             Python list containing converted items
  96 |         """
  97 |         list_builder = []
  98 |         for item in list_value:
  99 |             list_builder.append(CFAttributeToPyObject(item))
 100 |         return list_builder
 101 | 
 102 |     def number_helper(number_value):
 103 |         """Helper function to convert CF numbers to Python numbers.
 104 |         
 105 |         Args:
 106 |             number_value: Core Foundation number to convert
 107 |             
 108 |         Returns:
 109 |             Python int or float, or None if conversion fails
 110 |         """
 111 |         success, int_value = Foundation.CFNumberGetValue(  # type: ignore
 112 |             number_value, Foundation.kCFNumberIntType, None  # type: ignore
 113 |         )
 114 |         if success:
 115 |             return int(int_value)
 116 | 
 117 |         success, float_value = Foundation.CFNumberGetValue(  # type: ignore
 118 |             number_value, Foundation.kCFNumberDoubleType, None  # type: ignore
 119 |         )
 120 |         if success:
 121 |             return float(float_value)
 122 |         return None
 123 | 
 124 |     def axuielement_helper(element_value):
 125 |         """Helper function to handle AX UI elements.
 126 |         
 127 |         Args:
 128 |             element_value: Accessibility UI element to process
 129 |             
 130 |         Returns:
 131 |             The element value unchanged
 132 |         """
 133 |         return element_value
 134 | 
 135 |     cf_attr_type = Foundation.CFGetTypeID(attrValue)  # type: ignore
 136 |     cf_type_mapping = {
 137 |         Foundation.CFStringGetTypeID(): str,  # type: ignore
 138 |         Foundation.CFBooleanGetTypeID(): bool,  # type: ignore
 139 |         Foundation.CFArrayGetTypeID(): list_helper,  # type: ignore
 140 |         Foundation.CFNumberGetTypeID(): number_helper,  # type: ignore
 141 |         AXUIElementGetTypeID(): axuielement_helper,  # type: ignore
 142 |     }
 143 |     try:
 144 |         return cf_type_mapping[cf_attr_type](attrValue)
 145 |     except KeyError:
 146 |         # did not get a supported CF type. Move on to AX type
 147 |         pass
 148 | 
 149 |     ax_attr_type = AXValueGetType(attrValue)
 150 |     ax_type_map = {
 151 |         kAXValueCGSizeType: Foundation.NSSizeFromString,  # type: ignore
 152 |         kAXValueCGPointType: Foundation.NSPointFromString,  # type: ignore
 153 |         kAXValueCFRangeType: Foundation.NSRangeFromString,  # type: ignore
 154 |     }
 155 |     try:
 156 |         search_result = re.search("{.*}", attrValue.description())
 157 |         if search_result:
 158 |             extracted_str = search_result.group()
 159 |             return tuple(ax_type_map[ax_attr_type](extracted_str))
 160 |         return None
 161 |     except KeyError:
 162 |         return None
 163 | 
 164 | 
 165 | def element_attribute(element, attribute):
 166 |     """Get an attribute value from an accessibility element.
 167 |     
 168 |     Args:
 169 |         element: The accessibility element
 170 |         attribute: The attribute name to retrieve
 171 |         
 172 |     Returns:
 173 |         The attribute value or None if not found
 174 |     """
 175 |     if attribute == kAXChildrenAttribute:
 176 |         err, value = AXUIElementCopyAttributeValues(element, attribute, 0, 999, None)
 177 |         if err == kAXErrorSuccess:
 178 |             if isinstance(value, Foundation.NSArray):  # type: ignore
 179 |                 return CFAttributeToPyObject(value)
 180 |             else:
 181 |                 return value
 182 |     err, value = AXUIElementCopyAttributeValue(element, attribute, None)
 183 |     if err == kAXErrorSuccess:
 184 |         if isinstance(value, Foundation.NSArray):  # type: ignore
 185 |             return CFAttributeToPyObject(value)
 186 |         else:
 187 |             return value
 188 |     return None
 189 | 
 190 | 
 191 | def element_value(element, type):
 192 |     """Extract a typed value from an accessibility element.
 193 |     
 194 |     Args:
 195 |         element: The accessibility element containing the value
 196 |         type: The expected value type
 197 |         
 198 |     Returns:
 199 |         The extracted value or None if extraction fails
 200 |     """
 201 |     err, value = AXValueGetValue(element, type, None)
 202 |     if err == True:
 203 |         return value
 204 |     return None
 205 | 
 206 | 
 207 | class UIElement:
 208 |     """Represents a UI element in the accessibility tree with position, size, and hierarchy information."""
 209 |     
 210 |     def __init__(self, element, offset_x=0, offset_y=0, max_depth=None, parents_visible_bbox=None):
 211 |         """Initialize a UIElement from an accessibility element.
 212 |         
 213 |         Args:
 214 |             element: The accessibility element to wrap
 215 |             offset_x: X offset for position calculations
 216 |             offset_y: Y offset for position calculations
 217 |             max_depth: Maximum depth to traverse for children
 218 |             parents_visible_bbox: Parent's visible bounding box for clipping
 219 |         """
 220 |         self.ax_element = element
 221 |         self.content_identifier = ""
 222 |         self.identifier = ""
 223 |         self.name = ""
 224 |         self.children = []
 225 |         self.description = ""
 226 |         self.role_description = ""
 227 |         self.value = None
 228 |         self.max_depth = max_depth
 229 | 
 230 |         # Set role
 231 |         self.role = element_attribute(element, kAXRoleAttribute)
 232 |         if self.role is None:
 233 |             self.role = "No role"
 234 | 
 235 |         # Set name
 236 |         self.name = element_attribute(element, kAXTitleAttribute)
 237 |         if self.name is not None:
 238 |             # Convert tuple to string if needed
 239 |             if isinstance(self.name, tuple):
 240 |                 self.name = str(self.name[0]) if self.name else ""
 241 |             self.name = self.name.replace(" ", "_")
 242 | 
 243 |         # Set enabled
 244 |         self.enabled = element_attribute(element, kAXEnabledAttribute)
 245 |         if self.enabled is None:
 246 |             self.enabled = False
 247 | 
 248 |         # Set position and size
 249 |         position = element_attribute(element, kAXPositionAttribute)
 250 |         size = element_attribute(element, kAXSizeAttribute)
 251 |         start_position = element_value(position, kAXValueCGPointType)
 252 | 
 253 |         if self.role == "AXWindow" and start_position is not None:
 254 |             offset_x = start_position.x
 255 |             offset_y = start_position.y
 256 | 
 257 |         self.absolute_position = copy.copy(start_position)
 258 |         self.position = start_position
 259 |         if self.position is not None:
 260 |             self.position.x -= max(0, offset_x)
 261 |             self.position.y -= max(0, offset_y)
 262 |         self.size = element_value(size, kAXValueCGSizeType)
 263 | 
 264 |         self._set_bboxes(parents_visible_bbox)
 265 | 
 266 |         # Set component center
 267 |         if start_position is None or self.size is None:
 268 |             print("Position is None")
 269 |             return
 270 |         self.center = (
 271 |             start_position.x + offset_x + self.size.width / 2,
 272 |             start_position.y + offset_y + self.size.height / 2,
 273 |         )
 274 | 
 275 |         self.description = element_attribute(element, kAXDescriptionAttribute)
 276 |         self.role_description = element_attribute(element, kAXRoleDescriptionAttribute)
 277 |         attribute_value = element_attribute(element, kAXValueAttribute)
 278 | 
 279 |         # Set value
 280 |         self.value = attribute_value
 281 |         if attribute_value is not None:
 282 |             if isinstance(attribute_value, Foundation.NSArray):  # type: ignore
 283 |                 self.value = []
 284 |                 for value in attribute_value:
 285 |                     self.value.append(value)
 286 |             # Check if it's an accessibility element by checking its type ID
 287 |             elif Foundation.CFGetTypeID(attribute_value) == AXUIElementGetTypeID():  # type: ignore
 288 |                 self.value = UIElement(attribute_value, offset_x, offset_y)
 289 | 
 290 |         # Set children
 291 |         if self.max_depth is None or self.max_depth > 0:
 292 |             self.children = self._get_children(element, start_position, offset_x, offset_y)
 293 |         else:
 294 |             self.children = []
 295 | 
 296 |         self.calculate_hashes()
 297 | 
 298 |     def _set_bboxes(self, parents_visible_bbox):
 299 |         """Set bounding box and visible bounding box for the element.
 300 |         
 301 |         Args:
 302 |             parents_visible_bbox: Parent's visible bounding box for intersection calculation
 303 |         """
 304 |         if not self.absolute_position or not self.size:
 305 |             self.bbox = None
 306 |             self.visible_bbox = None
 307 |             return
 308 |         self.bbox = [
 309 |             int(self.absolute_position.x),
 310 |             int(self.absolute_position.y),
 311 |             int(self.absolute_position.x + self.size.width),
 312 |             int(self.absolute_position.y + self.size.height),
 313 |         ]
 314 |         if parents_visible_bbox:
 315 |             # check if not intersected
 316 |             if (
 317 |                 self.bbox[0] > parents_visible_bbox[2]
 318 |                 or self.bbox[1] > parents_visible_bbox[3]
 319 |                 or self.bbox[2] < parents_visible_bbox[0]
 320 |                 or self.bbox[3] < parents_visible_bbox[1]
 321 |             ):
 322 |                 self.visible_bbox = None
 323 |             else:
 324 |                 self.visible_bbox = [
 325 |                     int(max(self.bbox[0], parents_visible_bbox[0])),
 326 |                     int(max(self.bbox[1], parents_visible_bbox[1])),
 327 |                     int(min(self.bbox[2], parents_visible_bbox[2])),
 328 |                     int(min(self.bbox[3], parents_visible_bbox[3])),
 329 |                 ]
 330 |         else:
 331 |             self.visible_bbox = self.bbox
 332 | 
 333 |     def _get_children(self, element, start_position, offset_x, offset_y):
 334 |         """Get child elements from the accessibility element.
 335 |         
 336 |         Args:
 337 |             element: The parent accessibility element
 338 |             start_position: Starting position for offset calculations
 339 |             offset_x: X offset for child positioning
 340 |             offset_y: Y offset for child positioning
 341 |             
 342 |         Returns:
 343 |             List of UIElement children
 344 |         """
 345 |         children = element_attribute(element, kAXChildrenAttribute)
 346 |         visible_children = element_attribute(element, kAXVisibleChildrenAttribute)
 347 |         found_children = []
 348 |         if children is not None:
 349 |             found_children.extend(children)
 350 |         else:
 351 |             if visible_children is not None:
 352 |                 found_children.extend(visible_children)
 353 | 
 354 |         result = []
 355 |         if self.max_depth is None or self.max_depth > 0:
 356 |             for child in found_children:
 357 |                 child = UIElement(
 358 |                     child,
 359 |                     offset_x,
 360 |                     offset_y,
 361 |                     self.max_depth - 1 if self.max_depth is not None else None,
 362 |                     self.visible_bbox,
 363 |                 )
 364 |                 result.append(child)
 365 |         return result
 366 | 
 367 |     def calculate_hashes(self):
 368 |         """Calculate unique identifiers for the element and its content."""
 369 |         self.identifier = self.component_hash()
 370 |         self.content_identifier = self.children_content_hash(self.children)
 371 | 
 372 |     def component_hash(self):
 373 |         """Generate a hash identifier for this component based on its properties.
 374 |         
 375 |         Returns:
 376 |             MD5 hash string of component properties
 377 |         """
 378 |         if self.position is None or self.size is None:
 379 |             return ""
 380 |         position_string = f"{self.position.x:.0f};{self.position.y:.0f}"
 381 |         size_string = f"{self.size.width:.0f};{self.size.height:.0f}"
 382 |         enabled_string = str(self.enabled)
 383 |         # Ensure role is a string
 384 |         role_string = ""
 385 |         if self.role is not None:
 386 |             role_string = str(self.role[0]) if isinstance(self.role, tuple) else str(self.role)
 387 |         return self.hash_from_string(position_string + size_string + enabled_string + role_string)
 388 | 
 389 |     def hash_from_string(self, string):
 390 |         """Generate MD5 hash from a string.
 391 |         
 392 |         Args:
 393 |             string: Input string to hash
 394 |             
 395 |         Returns:
 396 |             MD5 hash hexdigest or empty string if input is None/empty
 397 |         """
 398 |         if string is None or string == "":
 399 |             return ""
 400 |         from hashlib import md5
 401 | 
 402 |         return md5(string.encode()).hexdigest()
 403 | 
 404 |     def children_content_hash(self, children):
 405 |         """Generate a hash representing the content and structure of child elements.
 406 |         
 407 |         Args:
 408 |             children: List of child UIElement objects
 409 |             
 410 |         Returns:
 411 |             Combined hash of children content and structure
 412 |         """
 413 |         if len(children) == 0:
 414 |             return ""
 415 |         all_content_hashes = []
 416 |         all_hashes = []
 417 |         for child in children:
 418 |             all_content_hashes.append(child.content_identifier)
 419 |             all_hashes.append(child.identifier)
 420 |         all_content_hashes.sort()
 421 |         if len(all_content_hashes) == 0:
 422 |             return ""
 423 |         content_hash = self.hash_from_string("".join(all_content_hashes))
 424 |         content_structure_hash = self.hash_from_string("".join(all_hashes))
 425 |         return self.hash_from_string(content_hash.join(content_structure_hash))
 426 | 
 427 |     def to_dict(self):
 428 |         """Convert the UIElement to a dictionary representation.
 429 |         
 430 |         Returns:
 431 |             Dictionary containing all element properties and children
 432 |         """
 433 |         def children_to_dict(children):
 434 |             """Convert list of children to dictionary format.
 435 |             
 436 |             Args:
 437 |                 children: List of UIElement children to convert
 438 |                 
 439 |             Returns:
 440 |                 List of dictionaries representing the children
 441 |             """
 442 |             result = []
 443 |             for child in children:
 444 |                 result.append(child.to_dict())
 445 |             return result
 446 | 
 447 |         value = self.value
 448 |         if isinstance(value, UIElement):
 449 |             value = json.dumps(value.to_dict(), indent=4)
 450 |         elif isinstance(value, AppKit.NSDate):  # type: ignore
 451 |             value = str(value)
 452 | 
 453 |         if self.absolute_position is not None:
 454 |             absolute_position = f"{self.absolute_position.x:.2f};{self.absolute_position.y:.2f}"
 455 |         else:
 456 |             absolute_position = ""
 457 | 
 458 |         if self.position is not None:
 459 |             position = f"{self.position.x:.2f};{self.position.y:.2f}"
 460 |         else:
 461 |             position = ""
 462 | 
 463 |         if self.size is not None:
 464 |             size = f"{self.size.width:.0f};{self.size.height:.0f}"
 465 |         else:
 466 |             size = ""
 467 |             
 468 |         return {
 469 |             "id": self.identifier,
 470 |             "name": self.name,
 471 |             "role": self.role,
 472 |             "description": self.description,
 473 |             "role_description": self.role_description,
 474 |             "value": value,
 475 |             "absolute_position": absolute_position,
 476 |             "position": position,
 477 |             "size": size,
 478 |             "enabled": self.enabled,
 479 |             "bbox": self.bbox,
 480 |             "visible_bbox": self.visible_bbox,
 481 |             "children": children_to_dict(self.children),
 482 |         }
 483 | 
 484 | 
 485 | import Quartz
 486 | from AppKit import NSWorkspace, NSRunningApplication
 487 | from pathlib import Path
 488 | 
 489 | def get_all_windows_zorder():
 490 |     """Get all windows in the system with their z-order information.
 491 |     
 492 |     Returns:
 493 |         List of window dictionaries sorted by z-index, containing window properties
 494 |         like id, name, pid, owner, bounds, layer, and opacity
 495 |     """
 496 |     window_list = Quartz.CGWindowListCopyWindowInfo(
 497 |         Quartz.kCGWindowListOptionOnScreenOnly,
 498 |         Quartz.kCGNullWindowID
 499 |     )
 500 |     z_order = {window['kCGWindowNumber']: z_index for z_index, window in enumerate(window_list[::-1])}
 501 |     window_list_all = Quartz.CGWindowListCopyWindowInfo(
 502 |         Quartz.kCGWindowListOptionAll,
 503 |         Quartz.kCGNullWindowID
 504 |     )
 505 |     windows = []
 506 |     for window in window_list_all:
 507 |         window_id = window.get('kCGWindowNumber', 0)
 508 |         window_name = window.get('kCGWindowName', '')
 509 |         window_pid = window.get('kCGWindowOwnerPID', 0)
 510 |         window_bounds = window.get('kCGWindowBounds', {})
 511 |         window_owner = window.get('kCGWindowOwnerName', '')
 512 |         window_is_on_screen = window.get('kCGWindowIsOnscreen', False)
 513 |         layer = window.get('kCGWindowLayer', 0)
 514 |         opacity = window.get('kCGWindowAlpha', 1.0)
 515 |         z_index = z_order.get(window_id, -1)
 516 |         if window_name == "Dock" and window_owner == "Dock":
 517 |             role = "dock"
 518 |         elif window_name == "Menubar" and window_owner == "Window Server":
 519 |             role = "menubar"
 520 |         elif window_owner in ["Window Server", "Dock"]:
 521 |             role = "desktop"
 522 |         else:
 523 |             role = "app"
 524 |         if window_bounds:
 525 |             windows.append({
 526 |                 "id": window_id,
 527 |                 "name": window_name or "Unnamed Window",
 528 |                 "pid": window_pid,
 529 |                 "owner": window_owner,
 530 |                 "role": role,
 531 |                 "is_on_screen": window_is_on_screen,
 532 |                 "bounds": {
 533 |                     "x": window_bounds.get('X', 0),
 534 |                     "y": window_bounds.get('Y', 0),
 535 |                     "width": window_bounds.get('Width', 0),
 536 |                     "height": window_bounds.get('Height', 0)
 537 |                 },
 538 |                 "layer": layer,
 539 |                 "z_index": z_index,
 540 |                 "opacity": opacity
 541 |             })
 542 |     windows = sorted(windows, key=lambda x: x["z_index"])
 543 |     return windows
 544 | 
 545 | def get_app_info(app):
 546 |     """Extract information from an NSRunningApplication object.
 547 |     
 548 |     Args:
 549 |         app: NSRunningApplication instance
 550 |         
 551 |     Returns:
 552 |         Dictionary containing app name, bundle ID, PID, and status flags
 553 |     """
 554 |     return {
 555 |         "name": app.localizedName(),
 556 |         "bundle_id": app.bundleIdentifier(),
 557 |         "pid": app.processIdentifier(),
 558 |         "active": app.isActive(),
 559 |         "hidden": app.isHidden(),
 560 |         "terminated": app.isTerminated(),
 561 |     }
 562 | 
 563 | def get_menubar_items(active_app_pid=None):
 564 |     """Get menubar items for the active application.
 565 |     
 566 |     Args:
 567 |         active_app_pid: Process ID of the active application, or None to use frontmost app
 568 |         
 569 |     Returns:
 570 |         List of menubar item dictionaries with title, bounds, index, and app_pid
 571 |     """
 572 |     menubar_items = []
 573 |     if active_app_pid is None:
 574 |         frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication()
 575 |         if frontmost_app:
 576 |             active_app_pid = frontmost_app.processIdentifier()
 577 |         else:
 578 |             return menubar_items
 579 |     app_element = AXUIElementCreateApplication(active_app_pid)
 580 |     if app_element is None:
 581 |         return menubar_items
 582 |     menubar = element_attribute(app_element, kAXMenuBarAttribute)
 583 |     if menubar is None:
 584 |         return menubar_items
 585 |     children = element_attribute(menubar, kAXChildrenAttribute)
 586 |     if children is None:
 587 |         return menubar_items
 588 |     for i, item in enumerate(children):
 589 |         title = element_attribute(item, kAXTitleAttribute) or "Untitled"
 590 |         bounds = {"x": 0, "y": 0, "width": 0, "height": 0}
 591 |         position_value = element_attribute(item, kAXPositionAttribute)
 592 |         if position_value:
 593 |             position_value = element_value(position_value, kAXValueCGPointType)
 594 |             bounds["x"] = getattr(position_value, 'x', 0)
 595 |             bounds["y"] = getattr(position_value, 'y', 0)
 596 |         size_value = element_attribute(item, kAXSizeAttribute)
 597 |         if size_value:
 598 |             size_value = element_value(size_value, kAXValueCGSizeType)
 599 |             bounds["width"] = getattr(size_value, 'width', 0)
 600 |             bounds["height"] = getattr(size_value, 'height', 0)
 601 |         menubar_items.append({
 602 |             "title": title,
 603 |             "bounds": bounds,
 604 |             "index": i,
 605 |             "app_pid": active_app_pid
 606 |         })
 607 |     return menubar_items
 608 | 
 609 | def get_dock_items():
 610 |     """Get all items in the macOS Dock.
 611 |     
 612 |     Returns:
 613 |         List of dock item dictionaries with title, description, bounds, index, 
 614 |         type, role, and subrole information
 615 |     """
 616 |     dock_items = []
 617 |     dock_pid = None
 618 |     running_apps = NSWorkspace.sharedWorkspace().runningApplications()
 619 |     for app in running_apps:
 620 |         if app.localizedName() == "Dock" and app.bundleIdentifier() == "com.apple.dock":
 621 |             dock_pid = app.processIdentifier()
 622 |             break
 623 |     if dock_pid is None:
 624 |         return dock_items
 625 |     dock_element = AXUIElementCreateApplication(dock_pid)
 626 |     if dock_element is None:
 627 |         return dock_items
 628 |     dock_list = element_attribute(dock_element, kAXChildrenAttribute)
 629 |     if dock_list is None or len(dock_list) == 0:
 630 |         return dock_items
 631 |     dock_app_list = None
 632 |     for child in dock_list:
 633 |         role = element_attribute(child, kAXRoleAttribute)
 634 |         if role == "AXList":
 635 |             dock_app_list = child
 636 |             break
 637 |     if dock_app_list is None:
 638 |         return dock_items
 639 |     items = element_attribute(dock_app_list, kAXChildrenAttribute)
 640 |     if items is None:
 641 |         return dock_items
 642 |     for i, item in enumerate(items):
 643 |         title = element_attribute(item, kAXTitleAttribute) or "Untitled"
 644 |         description = element_attribute(item, kAXDescriptionAttribute) or ""
 645 |         role = element_attribute(item, kAXRoleAttribute) or ""
 646 |         subrole = element_attribute(item, "AXSubrole") or ""
 647 |         bounds = {"x": 0, "y": 0, "width": 0, "height": 0}
 648 |         position_value = element_attribute(item, kAXPositionAttribute)
 649 |         if position_value:
 650 |             position_value = element_value(position_value, kAXValueCGPointType)
 651 |             bounds["x"] = getattr(position_value, 'x', 0)
 652 |             bounds["y"] = getattr(position_value, 'y', 0)
 653 |         size_value = element_attribute(item, kAXSizeAttribute)
 654 |         if size_value:
 655 |             size_value = element_value(size_value, kAXValueCGSizeType)
 656 |             bounds["width"] = getattr(size_value, 'width', 0)
 657 |             bounds["height"] = getattr(size_value, 'height', 0)
 658 |         item_type = "unknown"
 659 |         if subrole == "AXApplicationDockItem":
 660 |             item_type = "application"
 661 |         elif subrole == "AXFolderDockItem":
 662 |             item_type = "folder"
 663 |         elif subrole == "AXDocumentDockItem":
 664 |             item_type = "document"
 665 |         elif subrole == "AXSeparatorDockItem" or role == "AXSeparator":
 666 |             item_type = "separator"
 667 |         elif "trash" in title.lower():
 668 |             item_type = "trash"
 669 |         dock_items.append({
 670 |             "title": title,
 671 |             "description": description,
 672 |             "bounds": bounds,
 673 |             "index": i,
 674 |             "type": item_type,
 675 |             "role": role,
 676 |             "subrole": subrole
 677 |         })
 678 |     return dock_items
 679 | 
 680 | class MacOSAccessibilityHandler(BaseAccessibilityHandler):
 681 |     """Handler for macOS accessibility features and UI element inspection."""
 682 |     
 683 |     def get_desktop_state(self):
 684 |         """Get the current state of the desktop including windows, apps, menubar, and dock.
 685 |         
 686 |         Returns:
 687 |             Dictionary containing applications, windows, menubar_items, and dock_items
 688 |         """
 689 |         windows = [w for w in get_all_windows_zorder() if w.get("is_on_screen")]
 690 |         running_apps = self.get_running_apps()
 691 |         applications = []
 692 |         pid_to_window_ids = {}
 693 |         # Build a mapping: pid -> list of AX window trees
 694 |         pid_to_ax_trees = {}
 695 |         for app in running_apps:
 696 |             pid = app.processIdentifier()
 697 |             try:
 698 |                 app_elem = AXUIElementCreateApplication(pid)
 699 |                 err, app_windows = AXUIElementCopyAttributeValue(app_elem, kAXWindowsAttribute, None)
 700 |                 trees = []
 701 |                 if err == kAXErrorSuccess and app_windows:
 702 |                     for ax_win in app_windows:
 703 |                         try:
 704 |                             trees.append(UIElement(ax_win).to_dict())
 705 |                         except Exception as e:
 706 |                             trees.append({"error": str(e)})
 707 |                 pid_to_ax_trees[pid] = trees
 708 |             except Exception as e:
 709 |                 pid_to_ax_trees[pid] = [{"error": str(e)}]
 710 |         # Attach children by pid and index (order)
 711 |         pid_to_idx = {}
 712 |         for win in windows:
 713 |             pid = win["pid"]
 714 |             idx = pid_to_idx.get(pid, 0)
 715 |             ax_trees = pid_to_ax_trees.get(pid, [])
 716 |             win["children"] = ax_trees[idx]["children"] if idx < len(ax_trees) and "children" in ax_trees[idx] else []
 717 |             pid_to_idx[pid] = idx + 1
 718 |             pid_to_window_ids.setdefault(pid, []).append(win["id"])
 719 |         for app in running_apps:
 720 |             info = get_app_info(app)
 721 |             app_pid = info["pid"]
 722 |             applications.append({
 723 |                 "info": info,
 724 |                 "windows": pid_to_window_ids.get(app_pid, [])
 725 |             })
 726 |         menubar_items = get_menubar_items()
 727 |         dock_items = get_dock_items()
 728 |         return {
 729 |             "applications": applications,
 730 |             "windows": windows,
 731 |             "menubar_items": menubar_items,
 732 |             "dock_items": dock_items
 733 |         }
 734 | 
 735 |     def get_application_windows(self, pid: int):
 736 |         """Get all windows for a specific application.
 737 |         
 738 |         Args:
 739 |             pid: Process ID of the application
 740 |             
 741 |         Returns:
 742 |             List of accessibility window elements or empty list if none found
 743 |         """
 744 |         try:
 745 |             app = AXUIElementCreateApplication(pid)
 746 |             err, windows = AXUIElementCopyAttributeValue(app, kAXWindowsAttribute, None)
 747 |             if err == kAXErrorSuccess and windows:
 748 |                 if isinstance(windows, Foundation.NSArray):  # type: ignore
 749 |                     return windows
 750 |             return []
 751 |         except:
 752 |             return []
 753 | 
 754 |     def get_all_windows(self):
 755 |         """Get all visible windows in the system.
 756 |         
 757 |         Returns:
 758 |             List of window dictionaries with app information and window details
 759 |         """
 760 |         try:
 761 |             windows = []
 762 |             running_apps = self.get_running_apps()
 763 | 
 764 |             for app in running_apps:
 765 |                 try:
 766 |                     app_name = app.localizedName()
 767 |                     pid = app.processIdentifier()
 768 | 
 769 |                     # Skip system processes and background apps
 770 |                     if not app.activationPolicy() == 0:  # NSApplicationActivationPolicyRegular
 771 |                         continue
 772 | 
 773 |                     # Get application windows
 774 |                     app_windows = self.get_application_windows(pid)
 775 | 
 776 |                     windows.append(
 777 |                         {
 778 |                             "app_name": app_name,
 779 |                             "pid": pid,
 780 |                             "frontmost": app.isActive(),
 781 |                             "has_windows": len(app_windows) > 0,
 782 |                             "windows": app_windows,
 783 |                         }
 784 |                     )
 785 |                 except:
 786 |                     continue
 787 | 
 788 |             return windows
 789 |         except:
 790 |             return []
 791 | 
 792 |     def get_running_apps(self):
 793 |         """Get all currently running applications.
 794 |         
 795 |         Returns:
 796 |             List of NSRunningApplication objects
 797 |         """
 798 |         # From NSWorkspace.runningApplications docs: https://developer.apple.com/documentation/appkit/nsworkspace/runningapplications
 799 |         # "Similar to the NSRunningApplication class's properties, this property will only change when the main run loop runs in a common mode"
 800 |         # So we need to run the main run loop to get the latest running applications
 801 |         Foundation.CFRunLoopRunInMode(Foundation.kCFRunLoopDefaultMode, 0.1, False)  # type: ignore
 802 |         return NSWorkspace.sharedWorkspace().runningApplications()
 803 | 
 804 |     def get_ax_attribute(self, element, attribute):
 805 |         """Get an accessibility attribute from an element.
 806 |         
 807 |         Args:
 808 |             element: The accessibility element
 809 |             attribute: The attribute name to retrieve
 810 |             
 811 |         Returns:
 812 |             The attribute value or None if not found
 813 |         """
 814 |         return element_attribute(element, attribute)
 815 | 
 816 |     def serialize_node(self, element):
 817 |         """Create a serializable dictionary representation of an accessibility element.
 818 |         
 819 |         Args:
 820 |             element: The accessibility element to serialize
 821 |             
 822 |         Returns:
 823 |             Dictionary containing element properties like role, title, value, position, and size
 824 |         """
 825 |         # Create a serializable dictionary representation of an accessibility element
 826 |         result = {}
 827 | 
 828 |         # Get basic attributes
 829 |         result["role"] = self.get_ax_attribute(element, kAXRoleAttribute)
 830 |         result["title"] = self.get_ax_attribute(element, kAXTitleAttribute)
 831 |         result["value"] = self.get_ax_attribute(element, kAXValueAttribute)
 832 | 
 833 |         # Get position and size if available
 834 |         position = self.get_ax_attribute(element, kAXPositionAttribute)
 835 |         if position:
 836 |             try:
 837 |                 position_dict = {"x": position[0], "y": position[1]}
 838 |                 result["position"] = position_dict
 839 |             except (IndexError, TypeError):
 840 |                 pass
 841 | 
 842 |         size = self.get_ax_attribute(element, kAXSizeAttribute)
 843 |         if size:
 844 |             try:
 845 |                 size_dict = {"width": size[0], "height": size[1]}
 846 |                 result["size"] = size_dict
 847 |             except (IndexError, TypeError):
 848 |                 pass
 849 | 
 850 |         return result
 851 | 
 852 |     async def get_accessibility_tree(self) -> Dict[str, Any]:
 853 |         """Get the complete accessibility tree for the current desktop state.
 854 |         
 855 |         Returns:
 856 |             Dictionary containing success status and desktop state information
 857 |         """        
 858 |         try:
 859 |             desktop_state = self.get_desktop_state()
 860 |             return {
 861 |                 "success": True,
 862 |                 **desktop_state
 863 |             } 
 864 | 
 865 |         except Exception as e:
 866 |             return {"success": False, "error": str(e)}
 867 | 
 868 |     async def find_element(
 869 |         self, role: Optional[str] = None, title: Optional[str] = None, value: Optional[str] = None
 870 |     ) -> Dict[str, Any]:
 871 |         """Find an accessibility element matching the specified criteria.
 872 |         
 873 |         Args:
 874 |             role: The accessibility role to match (optional)
 875 |             title: The title to match (optional)
 876 |             value: The value to match (optional)
 877 |             
 878 |         Returns:
 879 |             Dictionary containing success status and the found element or error message
 880 |         """
 881 |         try:
 882 |             system = AXUIElementCreateSystemWide()
 883 | 
 884 |             def match_element(element):
 885 |                 """Check if an element matches the search criteria.
 886 |                 
 887 |                 Args:
 888 |                     element: The accessibility element to check
 889 |                     
 890 |                 Returns:
 891 |                     True if element matches all specified criteria, False otherwise
 892 |                 """
 893 |                 if role and self.get_ax_attribute(element, kAXRoleAttribute) != role:
 894 |                     return False
 895 |                 if title and self.get_ax_attribute(element, kAXTitleAttribute) != title:
 896 |                     return False
 897 |                 if value and str(self.get_ax_attribute(element, kAXValueAttribute)) != value:
 898 |                     return False
 899 |                 return True
 900 | 
 901 |             def search_tree(element):
 902 |                 """Recursively search the accessibility tree for matching elements.
 903 |                 
 904 |                 Args:
 905 |                     element: The accessibility element to search from
 906 |                     
 907 |                 Returns:
 908 |                     Serialized element dictionary if match found, None otherwise
 909 |                 """
 910 |                 if match_element(element):
 911 |                     return self.serialize_node(element)
 912 | 
 913 |                 children = self.get_ax_attribute(element, kAXChildrenAttribute)
 914 |                 if children:
 915 |                     for child in children:
 916 |                         result = search_tree(child)
 917 |                         if result:
 918 |                             return result
 919 |                 return None
 920 | 
 921 |             element = search_tree(system)
 922 |             return {"success": True, "element": element}
 923 | 
 924 |         except Exception as e:
 925 |             return {"success": False, "error": str(e)}
 926 | 
 927 | class MacOSAutomationHandler(BaseAutomationHandler):
 928 |     """Handler for macOS automation including mouse, keyboard, and screen operations."""
 929 |     
 930 |     # Mouse Actions
 931 |     mouse = MouseController()
 932 |     keyboard = KeyboardController()
 933 |     
 934 |     async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]:
 935 |         """Press and hold a mouse button at the specified coordinates.
 936 |         
 937 |         Args:
 938 |             x: X coordinate (optional, uses current position if None)
 939 |             y: Y coordinate (optional, uses current position if None)
 940 |             button: Mouse button to press ("left", "right", or "middle")
 941 |             
 942 |         Returns:
 943 |             Dictionary containing success status and error message if failed
 944 |         """
 945 |         try:
 946 |             if x is not None and y is not None:
 947 |                 self.mouse.position = (x, y)
 948 |             self.mouse.press(Button.left if button == "left" else Button.right if button == "right" else Button.middle)
 949 |             return {"success": True}
 950 |         except Exception as e:
 951 |             return {"success": False, "error": str(e)}
 952 | 
 953 |     async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]:
 954 |         """Release a mouse button at the specified coordinates.
 955 |         
 956 |         Args:
 957 |             x: X coordinate (optional, uses current position if None)
 958 |             y: Y coordinate (optional, uses current position if None)
 959 |             button: Mouse button to release ("left", "right", or "middle")
 960 |             
 961 |         Returns:
 962 |             Dictionary containing success status and error message if failed
 963 |         """
 964 |         try:
 965 |             if x is not None and y is not None:
 966 |                 self.mouse.position = (x, y)
 967 |             self.mouse.release(Button.left if button == "left" else Button.right if button == "right" else Button.middle)
 968 |             return {"success": True}
 969 |         except Exception as e:
 970 |             return {"success": False, "error": str(e)}
 971 | 
 972 |     async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
 973 |         """Perform a left mouse click at the specified coordinates.
 974 |         
 975 |         Args:
 976 |             x: X coordinate (optional, uses current position if None)
 977 |             y: Y coordinate (optional, uses current position if None)
 978 |             
 979 |         Returns:
 980 |             Dictionary containing success status and error message if failed
 981 |         """
 982 |         try:
 983 |             if x is not None and y is not None:
 984 |                 self.mouse.position = (x, y)
 985 |             self.mouse.click(Button.left, 1)
 986 |             return {"success": True}
 987 |         except Exception as e:
 988 |             return {"success": False, "error": str(e)}
 989 | 
 990 |     async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
 991 |         """Perform a right mouse click at the specified coordinates.
 992 |         
 993 |         Args:
 994 |             x: X coordinate (optional, uses current position if None)
 995 |             y: Y coordinate (optional, uses current position if None)
 996 |             
 997 |         Returns:
 998 |             Dictionary containing success status and error message if failed
 999 |         """
1000 |         try:
1001 |             if x is not None and y is not None:
1002 |                 self.mouse.position = (x, y)
1003 |             self.mouse.click(Button.right, 1)
1004 |             return {"success": True}
1005 |         except Exception as e:
1006 |             return {"success": False, "error": str(e)}
1007 | 
1008 |     async def double_click(
1009 |         self, x: Optional[int] = None, y: Optional[int] = None
1010 |     ) -> Dict[str, Any]:
1011 |         """Perform a double left mouse click at the specified coordinates.
1012 |         
1013 |         Args:
1014 |             x: X coordinate (optional, uses current position if None)
1015 |             y: Y coordinate (optional, uses current position if None)
1016 |             
1017 |         Returns:
1018 |             Dictionary containing success status and error message if failed
1019 |         """
1020 |         try:
1021 |             if x is not None and y is not None:
1022 |                 self.mouse.position = (x, y)
1023 |             self.mouse.click(Button.left, 2)
1024 |             return {"success": True}
1025 |         except Exception as e:
1026 |             return {"success": False, "error": str(e)}
1027 | 
1028 |     async def move_cursor(self, x: int, y: int) -> Dict[str, Any]:
1029 |         """Move the mouse cursor to the specified coordinates.
1030 |         
1031 |         Args:
1032 |             x: Target X coordinate
1033 |             y: Target Y coordinate
1034 |             
1035 |         Returns:
1036 |             Dictionary containing success status and error message if failed
1037 |         """
1038 |         try:
1039 |             self.mouse.position = (x, y)
1040 |             return {"success": True}
1041 |         except Exception as e:
1042 |             return {"success": False, "error": str(e)}
1043 | 
1044 |     async def drag_to(
1045 |         self, x: int, y: int, button: str = "left", duration: float = 0.5
1046 |     ) -> Dict[str, Any]:
1047 |         """Drag from current position to target coordinates.
1048 |         
1049 |         Args:
1050 |             x: Target X coordinate
1051 |             y: Target Y coordinate
1052 |             button: Mouse button to use for dragging ("left", "right", or "middle")
1053 |             duration: Duration of the drag operation in seconds
1054 |             
1055 |         Returns:
1056 |             Dictionary containing success status and error message if failed
1057 |         """
1058 |         try:
1059 |             btn = Button.left if button == "left" else Button.right if button == "right" else Button.middle
1060 |             # Press
1061 |             self.mouse.press(btn)
1062 |             # Move with sleep to simulate drag duration
1063 |             start = self.mouse.position
1064 |             steps = 20
1065 |             start_x, start_y = start
1066 |             dx = (x - start_x) / steps
1067 |             dy = (y - start_y) / steps
1068 |             for i in range(steps):
1069 |                 self.mouse.position = (int(start_x + dx * (i + 1)), int(start_y + dy * (i + 1)))
1070 |                 time.sleep(duration / steps)
1071 |             # Release
1072 |             self.mouse.release(btn)
1073 |             return {"success": True}
1074 |         except Exception as e:
1075 |             try:
1076 |                 self.mouse.release(btn)
1077 |             except:
1078 |                 pass
1079 |             return {"success": False, "error": str(e)}
1080 | 
1081 |     async def drag(
1082 |         self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5
1083 |     ) -> Dict[str, Any]:
1084 |         """Drag the mouse along a specified path of coordinates.
1085 |         
1086 |         Args:
1087 |             path: List of (x, y) coordinate tuples defining the drag path
1088 |             button: Mouse button to use for dragging ("left", "right", or "middle")
1089 |             duration: Total duration of the drag operation in seconds
1090 |             
1091 |         Returns:
1092 |             Dictionary containing success status and error message if failed
1093 |         """
1094 |         try:
1095 |             if not path or len(path) < 2:
1096 |                 return {"success": False, "error": "Path must contain at least 2 points"}
1097 |             btn = Button.left if button == "left" else Button.right if button == "right" else Button.middle
1098 |             # Move to the first point
1099 |             self.mouse.position = path[0]
1100 |             self.mouse.press(btn)
1101 |             step_duration = duration / (len(path) - 1) if len(path) > 1 else duration
1102 |             for x, y in path[1:]:
1103 |                 self.mouse.position = (x, y)
1104 |                 time.sleep(step_duration)
1105 |             self.mouse.release(btn)
1106 |             return {"success": True}
1107 |         except Exception as e:
1108 |             try:
1109 |                 self.mouse.release(btn)
1110 |             except:
1111 |                 pass
1112 |             return {"success": False, "error": str(e)}
1113 | 
1114 |     # Keyboard Actions
1115 |     async def key_down(self, key: str) -> Dict[str, Any]:
1116 |         """Press and hold a keyboard key.
1117 |         
1118 |         Args:
1119 |             key: Key name to press (using pyautogui key names)
1120 |             
1121 |         Returns:
1122 |             Dictionary containing success status and error message if failed
1123 |         """
1124 |         try:
1125 |             # use pyautogui for their key names
1126 |             pyautogui.keyDown(key)
1127 |             return {"success": True}
1128 |         except Exception as e:
1129 |             return {"success": False, "error": str(e)}
1130 |     
1131 |     async def key_up(self, key: str) -> Dict[str, Any]:
1132 |         """Release a keyboard key.
1133 |         
1134 |         Args:
1135 |             key: Key name to release (using pyautogui key names)
1136 |             
1137 |         Returns:
1138 |             Dictionary containing success status and error message if failed
1139 |         """
1140 |         try:
1141 |             # use pyautogui for their key names
1142 |             pyautogui.keyUp(key)
1143 |             return {"success": True}
1144 |         except Exception as e:
1145 |             return {"success": False, "error": str(e)}
1146 |     
1147 |     async def type_text(self, text: str) -> Dict[str, Any]:
1148 |         """Type text using the keyboard with Unicode support.
1149 |         
1150 |         Args:
1151 |             text: Text string to type
1152 |             
1153 |         Returns:
1154 |             Dictionary containing success status and error message if failed
1155 |         """
1156 |         try:
1157 |             # use pynput for Unicode support
1158 |             self.keyboard.type(text)
1159 |             return {"success": True}
1160 |         except Exception as e:
1161 |             return {"success": False, "error": str(e)}
1162 | 
1163 |     async def press_key(self, key: str) -> Dict[str, Any]:
1164 |         """Press and release a keyboard key.
1165 |         
1166 |         Args:
1167 |             key: Key name to press (using pyautogui key names)
1168 |             
1169 |         Returns:
1170 |             Dictionary containing success status and error message if failed
1171 |         """
1172 |         try:
1173 |             # use pyautogui for their key names
1174 |             pyautogui.press(key)
1175 |             return {"success": True}
1176 |         except Exception as e:
1177 |             return {"success": False, "error": str(e)}
1178 | 
1179 |     async def hotkey(self, keys: List[str]) -> Dict[str, Any]:
1180 |         """Press a combination of keys simultaneously.
1181 |         
1182 |         Args:
1183 |             keys: List of key names to press together (using pyautogui key names)
1184 |             
1185 |         Returns:
1186 |             Dictionary containing success status and error message if failed
1187 |         """
1188 |         try:
1189 |             # use pyautogui for their key names
1190 |             pyautogui.hotkey(*keys)
1191 |             return {"success": True}
1192 |         except Exception as e:
1193 |             return {"success": False, "error": str(e)}
1194 | 
1195 |     # Scrolling Actions
1196 |     async def scroll(self, x: int, y: int) -> Dict[str, Any]:
1197 |         """Scroll the mouse wheel in the specified direction.
1198 |         
1199 |         Args:
1200 |             x: Horizontal scroll amount
1201 |             y: Vertical scroll amount (positive for up, negative for down)
1202 |             
1203 |         Returns:
1204 |             Dictionary containing success status and error message if failed
1205 |         """
1206 |         try:
1207 |             self.mouse.scroll(x, y)
1208 |             return {"success": True}
1209 |         except Exception as e:
1210 |             return {"success": False, "error": str(e)}
1211 |     
1212 |     async def scroll_down(self, clicks: int = 1) -> Dict[str, Any]:
1213 |         """Scroll down by the specified number of clicks.
1214 |         
1215 |         Args:
1216 |             clicks: Number of scroll clicks to perform
1217 |             
1218 |         Returns:
1219 |             Dictionary containing success status and error message if failed
1220 |         """
1221 |         try:
1222 |             self.mouse.scroll(0, -clicks)
1223 |             return {"success": True}
1224 |         except Exception as e:
1225 |             return {"success": False, "error": str(e)}
1226 | 
1227 |     async def scroll_up(self, clicks: int = 1) -> Dict[str, Any]:
1228 |         """Scroll up by the specified number of clicks.
1229 |         
1230 |         Args:
1231 |             clicks: Number of scroll clicks to perform
1232 |             
1233 |         Returns:
1234 |             Dictionary containing success status and error message if failed
1235 |         """
1236 |         try:
1237 |             self.mouse.scroll(0, clicks)
1238 |             return {"success": True}
1239 |         except Exception as e:
1240 |             return {"success": False, "error": str(e)}
1241 | 
1242 |     # Screen Actions
1243 |     async def screenshot(self) -> Dict[str, Any]:
1244 |         """Capture a screenshot of the current screen.
1245 |         
1246 |         Returns:
1247 |             Dictionary containing success status and base64-encoded image data or error message
1248 |         """
1249 |         try:
1250 |             from PIL import Image
1251 | 
1252 |             screenshot = pyautogui.screenshot()
1253 |             if not isinstance(screenshot, Image.Image):
1254 |                 return {"success": False, "error": "Failed to capture screenshot"}
1255 | 
1256 |             buffered = BytesIO()
1257 |             screenshot.save(buffered, format="PNG", optimize=True)
1258 |             buffered.seek(0)
1259 |             image_data = base64.b64encode(buffered.getvalue()).decode()
1260 |             return {"success": True, "image_data": image_data}
1261 |         except Exception as e:
1262 |             return {"success": False, "error": f"Screenshot error: {str(e)}"}
1263 | 
1264 |     async def get_screen_size(self) -> Dict[str, Any]:
1265 |         """Get the dimensions of the current screen.
1266 |         
1267 |         Returns:
1268 |             Dictionary containing success status and screen size or error message
1269 |         """
1270 |         try:
1271 |             size = pyautogui.size()
1272 |             return {"success": True, "size": {"width": size.width, "height": size.height}}
1273 |         except Exception as e:
1274 |             return {"success": False, "error": str(e)}
1275 | 
1276 |     async def get_cursor_position(self) -> Dict[str, Any]:
1277 |         """Get the current position of the mouse cursor.
1278 |         
1279 |         Returns:
1280 |             Dictionary containing success status and cursor position or error message
1281 |         """
1282 |         try:
1283 |             x, y = self.mouse.position
1284 |             return {"success": True, "position": {"x": x, "y": y}}
1285 |         except Exception as e:
1286 |             return {"success": False, "error": str(e)}
1287 | 
1288 |     # Clipboard Actions
1289 |     async def copy_to_clipboard(self) -> Dict[str, Any]:
1290 |         """Get the current content of the system clipboard.
1291 |         
1292 |         Returns:
1293 |             Dictionary containing success status and clipboard content or error message
1294 |         """
1295 |         try:
1296 |             import pyperclip
1297 | 
1298 |             content = pyperclip.paste()
1299 |             return {"success": True, "content": content}
1300 |         except Exception as e:
1301 |             return {"success": False, "error": str(e)}
1302 | 
1303 |     async def set_clipboard(self, text: str) -> Dict[str, Any]:
1304 |         """Set the content of the system clipboard.
1305 |         
1306 |         Args:
1307 |             text: Text to copy to the clipboard
1308 |             
1309 |         Returns:
1310 |             Dictionary containing success status and error message if failed
1311 |         """
1312 |         try:
1313 |             import pyperclip
1314 | 
1315 |             pyperclip.copy(text)
1316 |             return {"success": True}
1317 |         except Exception as e:
1318 |             return {"success": False, "error": str(e)}
1319 | 
1320 |     async def run_command(self, command: str) -> Dict[str, Any]:
1321 |         """Run a shell command and return its output.
1322 |         
1323 |         Args:
1324 |             command: Shell command to execute
1325 |             
1326 |         Returns:
1327 |             Dictionary containing success status, stdout, stderr, and return code
1328 |         """
1329 |         try:
1330 |             # Create subprocess
1331 |             process = await asyncio.create_subprocess_shell(
1332 |                 command,
1333 |                 stdout=asyncio.subprocess.PIPE,
1334 |                 stderr=asyncio.subprocess.PIPE
1335 |             )
1336 |             # Wait for the subprocess to finish
1337 |             stdout, stderr = await process.communicate()
1338 |             # Return decoded output
1339 |             return {
1340 |                 "success": True, 
1341 |                 "stdout": stdout.decode() if stdout else "", 
1342 |                 "stderr": stderr.decode() if stderr else "",
1343 |                 "return_code": process.returncode
1344 |             }
1345 |         except Exception as e:
1346 |             return {"success": False, "error": str(e)}
1347 | 
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/anthropic.py:
--------------------------------------------------------------------------------

```python
   1 | """
   2 | Anthropic hosted tools agent loop implementation using liteLLM
   3 | """
   4 | 
   5 | import asyncio
   6 | import json
   7 | from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
   8 | import litellm
   9 | from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
  10 | 
  11 | from ..decorators import register_agent
  12 | from ..types import Messages, AgentResponse, Tools, AgentCapability
  13 | from ..loops.base import AsyncAgentConfig
  14 | from ..responses import (
  15 |     make_reasoning_item,
  16 |     make_output_text_item,
  17 |     make_click_item,
  18 |     make_double_click_item,
  19 |     make_drag_item,
  20 |     make_keypress_item,
  21 |     make_move_item,
  22 |     make_scroll_item,
  23 |     make_type_item,
  24 |     make_wait_item,
  25 |     make_input_image_item,
  26 |     make_screenshot_item,
  27 |     make_failed_tool_call_items,
  28 |     make_left_mouse_down_item,
  29 |     make_left_mouse_up_item
  30 | )
  31 | 
  32 | # Ordered table of model-name regex -> computer tool version and beta flag (first match wins)
  33 | MODEL_TOOL_MAPPING = [
  34 |     # Claude 4 models
  35 |     {
  36 |         "pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
  37 |         "tool_version": "computer_20250124",
  38 |         "beta_flag": "computer-use-2025-01-24"
  39 |     },
  40 |     # Claude 3.7 models (same tool version and beta flag as Claude 4)
  41 |     {
  42 |         "pattern": r"claude-3\.?7|claude-3-7",
  43 |         "tool_version": "computer_20250124",
  44 |         "beta_flag": "computer-use-2025-01-24"
  45 |     },
  46 |     # Claude 3.5 models (same values as the default in _get_tool_config_for_model)
  47 |     {
  48 |         "pattern": r"claude-3\.?5|claude-3-5",
  49 |         "tool_version": "computer_20241022",
  50 |         "beta_flag": "computer-use-2024-10-22"
  51 |     }
  52 | ]
  53 | 
  54 | def _get_tool_config_for_model(model: str) -> Dict[str, str]:
  55 |     """Get tool version and beta flag for the given model."""
  56 |     import re
  57 |     
  58 |     for mapping in MODEL_TOOL_MAPPING:
  59 |         if re.search(mapping["pattern"], model, re.IGNORECASE):
  60 |             return {
  61 |                 "tool_version": mapping["tool_version"],
  62 |                 "beta_flag": mapping["beta_flag"]
  63 |             }
  64 |     
  65 |     # Default to Claude 3.5 configuration
  66 |     return {
  67 |         "tool_version": "computer_20241022",
  68 |         "beta_flag": "computer-use-2024-10-22"
  69 |     }
  70 | 
  71 | async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
  72 |     """Map a computer tool to Anthropic's hosted tool schema."""
  73 |     # Get dimensions from the computer handler
  74 |     try:
  75 |         width, height = await computer_tool.get_dimensions()
  76 |     except Exception:
  77 |         # Fallback to default dimensions if method fails
  78 |         width, height = 1024, 768
  79 |     
  80 |     return {
  81 |         "type": tool_version,
  82 |         "function": {
  83 |             "name": "computer",
  84 |             "parameters": {
  85 |                 "display_height_px": height,
  86 |                 "display_width_px": width,
  87 |                 "display_number": 1,
  88 |             },
  89 |         },
  90 |     }
  91 | 
  92 | async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
  93 |     """Prepare tools for Anthropic API format."""
  94 |     tool_config = _get_tool_config_for_model(model)
  95 |     anthropic_tools = []
  96 |     
  97 |     for schema in tool_schemas:
  98 |         if schema["type"] == "computer":
  99 |             # Map computer tool to Anthropic format
 100 |             anthropic_tools.append(await _map_computer_tool_to_anthropic(
 101 |                 schema["computer"], 
 102 |                 tool_config["tool_version"]
 103 |             ))
 104 |         elif schema["type"] == "function":
 105 |             # Function tools - convert to Anthropic format
 106 |             function_schema = schema["function"]
 107 |             anthropic_tools.append({
 108 |                 "type": "function",
 109 |                 "function": {
 110 |                     "name": function_schema["name"],
 111 |                     "description": function_schema.get("description", ""),
 112 |                     "parameters": function_schema.get("parameters", {})
 113 |                 }
 114 |             })
 115 |     
 116 |     return anthropic_tools
 117 | 
 118 | def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]:
 119 |     """Convert responses_items message format to liteLLM completion format."""
 120 |     completion_messages = []
 121 |     call_id_to_fn_name = {}
 122 | 
 123 |     for message in messages:
 124 |         msg_type = message.get("type")
 125 |         role = message.get("role")
 126 |         
 127 |         # Handle user messages (both with and without explicit type)
 128 |         if role == "user" or msg_type == "user":
 129 |             content = message.get("content", "")
 130 |             if isinstance(content, list):
 131 |                 # Multi-modal content - convert input_image to image format
 132 |                 converted_content = []
 133 |                 for item in content:
 134 |                     if isinstance(item, dict) and item.get("type") == "input_image":
 135 |                         # Convert input_image to OpenAI image format
 136 |                         image_url = item.get("image_url", "")
 137 |                         if image_url and image_url != "[omitted]":
 138 |                             converted_content.append({
 139 |                                 "type": "image_url",
 140 |                                 "image_url": {
 141 |                                     "url": image_url
 142 |                                 }
 143 |                             })
 144 |                     elif isinstance(item, dict) and item.get("type") == "input_text":
 145 |                         # Convert input_text to OpenAI text format
 146 |                         text = item.get("text", "")
 147 |                         converted_content.append({
 148 |                             "type": "text",
 149 |                             "text": text
 150 |                         })
 151 |                     else:
 152 |                         # Keep other content types as-is
 153 |                         converted_content.append(item)
 154 |                 
 155 |                 completion_messages.append({
 156 |                     "role": "user",
 157 |                     "content": converted_content if converted_content else content
 158 |                 })
 159 |             else:
 160 |                 # Text content
 161 |                 completion_messages.append({
 162 |                     "role": "user",
 163 |                     "content": content
 164 |                 })
 165 |         
 166 |         # Handle assistant messages
 167 |         elif role == "assistant":
 168 |             content = message.get("content", [])
 169 |             if isinstance(content, str):
 170 |                 content = [{ "type": "output_text", "text": content }]
 171 |             
 172 |             content = "\n".join(item.get("text", "") for item in content)
 173 |             completion_messages.append({
 174 |                 "role": "assistant",
 175 |                 "content": content
 176 |             })
 177 |         
 178 |         elif msg_type == "reasoning":
 179 |             # Reasoning becomes part of assistant message
 180 |             summary = message.get("summary", [])
 181 |             reasoning_text = ""
 182 |             
 183 |             if isinstance(summary, list) and summary:
 184 |                 # Extract text from summary items
 185 |                 for item in summary:
 186 |                     if isinstance(item, dict) and item.get("type") == "summary_text":
 187 |                         reasoning_text = item.get("text", "")
 188 |                         break
 189 |             else:
 190 |                 # Fallback to direct reasoning field
 191 |                 reasoning_text = message.get("reasoning", "")
 192 |             
 193 |             if reasoning_text:
 194 |                 completion_messages.append({
 195 |                     "role": "assistant",
 196 |                     "content": reasoning_text
 197 |                 })
 198 |         
 199 |         elif msg_type == "function_call":
 200 |             fn_name = message.get("name")
 201 |             fn_args = message.get("arguments", "{}")
 202 |             call_id = message.get("call_id", "call_1")
 203 |             call_id_to_fn_name[call_id] = fn_name
 204 |             openai_tool_calls = [{
 205 |                 "id": call_id,
 206 |                 "type": "function",
 207 |                 "function": {
 208 |                     "name": fn_name,
 209 |                     "arguments": fn_args
 210 |                 }
 211 |             }]            # If the last completion message is an assistant message, extend the tool_calls
 212 |             if completion_messages and completion_messages[-1].get("role") == "assistant":
 213 |                 if "tool_calls" not in completion_messages[-1]:
 214 |                     completion_messages[-1]["tool_calls"] = []
 215 |                 completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
 216 |             else:
 217 |                 # Create new assistant message with tool calls
 218 |                 completion_messages.append({
 219 |                     "role": "assistant",
 220 |                     "content": None,
 221 |                     "tool_calls": openai_tool_calls
 222 |                 })
 223 |         
 224 |         elif msg_type == "function_call_output":
 225 |             call_id = message.get("call_id", "call_1")
 226 |             fn_output = message.get("output", "")
 227 |             fn_name = call_id_to_fn_name.get(call_id, "computer")
 228 | 
 229 |             completion_messages.append({
 230 |                 "role": "function",
 231 |                 "name": fn_name,
 232 |                 "tool_call_id": call_id,
 233 |                 "content": str(fn_output)
 234 |             })
 235 |             
 236 |         elif msg_type == "computer_call":
 237 |             # Computer call becomes tool use in assistant message
 238 |             action = message.get("action", {})
 239 |             action_type = action.get("type")
 240 |             call_id = message.get("call_id", "call_1")
 241 |             
 242 |             tool_use_content = []
 243 |             
 244 |             # Basic actions (all versions)
 245 |             if action_type == "click":
 246 |                 # Input:
 247 |                 # {
 248 |                 #     "type": "computer_call",
 249 |                 #     "call_id": "call_1",
 250 |                 #     "action": {
 251 |                 #         "type": "click",
 252 |                 #         "x": 100,
 253 |                 #         "y": 200
 254 |                 #     }
 255 |                 # }
 256 |                 
 257 |                 # Output:
 258 |                 # {
 259 |                 #     "function": {
 260 |                 #         "name": "computer",
 261 |                 #         "arguments": json.dumps({
 262 |                 #             "action": "click",
 263 |                 #             "coordinate": [100, 200]
 264 |                 #         })
 265 |                 #     },
 266 |                 #     "id": "call_1",
 267 |                 #     "type": "function"
 268 |                 # }
 269 |                 button = action.get("button", "left")
 270 |                 action_name = "right_click" if button == "right" else "middle_click" if button == "wheel" else "left_click"
 271 |                 tool_use_content.append({
 272 |                     "type": "tool_use",
 273 |                     "id": call_id,
 274 |                     "name": "computer",
 275 |                     "input": {
 276 |                         "action": action_name,
 277 |                         "coordinate": [action.get("x", 0), action.get("y", 0)]
 278 |                     }
 279 |                 })
 280 |             elif action_type == "double_click":
 281 |                 # Input:
 282 |                 # {
 283 |                 #     "type": "computer_call",
 284 |                 #     "call_id": "call_1",
 285 |                 #     "action": {
 286 |                 #         "type": "double_click",
 287 |                 #         "x": 160,
 288 |                 #         "y": 240
 289 |                 #     }
 290 |                 # }
 291 |                 
 292 |                 # Output:
 293 |                 # {
 294 |                 #     "function": {
 295 |                 #         "name": "computer",
 296 |                 #         "arguments": json.dumps({
 297 |                 #             "action": "double_click",
 298 |                 #             "coordinate": [160, 240]
 299 |                 #         })
 300 |                 #     },
 301 |                 #     "id": "call_1",
 302 |                 #     "type": "function"
 303 |                 # }
 304 |                 tool_use_content.append({
 305 |                     "type": "tool_use",
 306 |                     "id": call_id,
 307 |                     "name": "computer",
 308 |                     "input": {
 309 |                         "action": "double_click",
 310 |                         "coordinate": [action.get("x", 0), action.get("y", 0)]
 311 |                     }
 312 |                 })
 313 |             elif action_type == "type":
 314 |                 # Input:
 315 |                 # {
 316 |                 #     "type": "computer_call",
 317 |                 #     "call_id": "call_1",
 318 |                 #     "action": {
 319 |                 #         "type": "type",
 320 |                 #         "text": "Hello World"
 321 |                 #     }
 322 |                 # }
 323 |                 
 324 |                 # Output:
 325 |                 # {
 326 |                 #     "function": {
 327 |                 #         "name": "computer",
 328 |                 #         "arguments": json.dumps({
 329 |                 #             "action": "type",
 330 |                 #             "text": "Hello World"
 331 |                 #         })
 332 |                 #     },
 333 |                 #     "id": "call_1",
 334 |                 #     "type": "function"
 335 |                 # }
 336 |                 tool_use_content.append({
 337 |                     "type": "tool_use",
 338 |                     "id": call_id,
 339 |                     "name": "computer",
 340 |                     "input": {
 341 |                         "action": "type",
 342 |                         "text": action.get("text", "")
 343 |                     }
 344 |                 })
 345 |             elif action_type == "keypress":
 346 |                 # Input:
 347 |                 # {
 348 |                 #     "type": "computer_call",
 349 |                 #     "call_id": "call_1",
 350 |                 #     "action": {
 351 |                 #         "type": "keypress",
 352 |                 #         "keys": ["ctrl", "c"]
 353 |                 #     }
 354 |                 # }
 355 |                 
 356 |                 # Output:
 357 |                 # {
 358 |                 #     "function": {
 359 |                 #         "name": "computer",
 360 |                 #         "arguments": json.dumps({
 361 |                 #             "action": "key",
 362 |                 #             "text": "ctrl+c"
 363 |                 #         })
 364 |                 #     },
 365 |                 #     "id": "call_1",
 366 |                 #     "type": "function"
 367 |                 # }
 368 |                 tool_use_content.append({
 369 |                     "type": "tool_use",
 370 |                     "id": call_id,
 371 |                     "name": "computer",
 372 |                     "input": {
 373 |                         "action": "key",
 374 |                         "text": "+".join(action.get("keys", []))
 375 |                     }
 376 |                 })
 377 |             elif action_type in ["mouse_move", "move"]:
 378 |                 # Input:
 379 |                 # {
 380 |                 #     "type": "computer_call",
 381 |                 #     "call_id": "call_1",
 382 |                 #     "action": {
 383 |                 #         "type": "move",
 384 |                 #         "x": 150,
 385 |                 #         "y": 250
 386 |                 #     }
 387 |                 # }
 388 |                 
 389 |                 # Output:
 390 |                 # {
 391 |                 #     "function": {
 392 |                 #         "name": "computer",
 393 |                 #         "arguments": json.dumps({
 394 |                 #             "action": "mouse_move",
 395 |                 #             "coordinate": [150, 250]
 396 |                 #         })
 397 |                 #     },
 398 |                 #     "id": "call_1",
 399 |                 #     "type": "function"
 400 |                 # }
 401 |                 tool_use_content.append({
 402 |                     "type": "tool_use",
 403 |                     "id": call_id,
 404 |                     "name": "computer",
 405 |                     "input": {
 406 |                         "action": "mouse_move",
 407 |                         "coordinate": [action.get("x", 0), action.get("y", 0)]
 408 |                     }
 409 |                 })
 410 |             elif action_type == "scroll":
 411 |                 # Input:
 412 |                 # {
 413 |                 #     "type": "computer_call",
 414 |                 #     "call_id": "call_1",
 415 |                 #     "action": {
 416 |                 #         "type": "scroll",
 417 |                 #         "x": 300,
 418 |                 #         "y": 400,
 419 |                 #         "scroll_x": 0,
 420 |                 #         "scroll_y": -5
 421 |                 #     }
 422 |                 # }
 423 |                 
 424 |                 # Output:
 425 |                 # {
 426 |                 #     "function": {
 427 |                 #         "name": "computer",
 428 |                 #         "arguments": json.dumps({
 429 |                 #             "action": "scroll",
 430 |                 #             "coordinate": [300, 400],
 431 |                 #             "scroll_direction": "down",
 432 |                 #             "scroll_amount": 5
 433 |                 #         })
 434 |                 #     },
 435 |                 #     "id": "call_1",
 436 |                 #     "type": "function"
 437 |                 # }
 438 |                 scroll_x = action.get("scroll_x", 0)
 439 |                 scroll_y = action.get("scroll_y", 0)
 440 |                 # Determine direction and amount from scroll values
 441 |                 if scroll_x > 0:
 442 |                     direction = "left"
 443 |                     amount = scroll_x
 444 |                 elif scroll_x < 0:
 445 |                     direction = "right"
 446 |                     amount = -scroll_x
 447 |                 elif scroll_y > 0:
 448 |                     direction = "up"
 449 |                     amount = scroll_y
 450 |                 elif scroll_y < 0:
 451 |                     direction = "down"
 452 |                     amount = -scroll_y
 453 |                 else:
 454 |                     direction = "down"
 455 |                     amount = 3
 456 |                 
 457 |                 tool_use_content.append({
 458 |                     "type": "tool_use",
 459 |                     "id": call_id,
 460 |                     "name": "computer",
 461 |                     "input": {
 462 |                         "action": "scroll",
 463 |                         "coordinate": [action.get("x", 0), action.get("y", 0)],
 464 |                         "scroll_direction": direction,
 465 |                         "scroll_amount": amount
 466 |                     }
 467 |                 })
 468 |             elif action_type == "drag":
 469 |                 # Input:
 470 |                 # {
 471 |                 #     "type": "computer_call",
 472 |                 #     "call_id": "call_1",
 473 |                 #     "action": {
 474 |                 #         "type": "drag",
 475 |                 #         "path": [
 476 |                 #             {"x": 100, "y": 150},
 477 |                 #             {"x": 200, "y": 250}
 478 |                 #         ]
 479 |                 #     }
 480 |                 # }
 481 |                 
 482 |                 # Output:
 483 |                 # {
 484 |                 #     "function": {
 485 |                 #         "name": "computer",
 486 |                 #         "arguments": json.dumps({
 487 |                 #             "action": "left_click_drag",
 488 |                 #             "start_coordinate": [100, 150],
 489 |                 #             "end_coordinate": [200, 250]
 490 |                 #         })
 491 |                 #     },
 492 |                 #     "id": "call_1",
 493 |                 #     "type": "function"
 494 |                 # }
 495 |                 path = action.get("path", [])
 496 |                 start_coord = [0, 0]
 497 |                 end_coord = [0, 0]
 498 |                 if isinstance(path, list) and len(path) >= 2:
 499 |                     start_coord = [path[0].get("x", 0), path[0].get("y", 0)]
 500 |                     end_coord = [path[-1].get("x", 0), path[-1].get("y", 0)]
 501 |                 
 502 |                 tool_use_content.append({
 503 |                     "type": "tool_use",
 504 |                     "id": call_id,
 505 |                     "name": "computer",
 506 |                     "input": {
 507 |                         "action": "left_click_drag",
 508 |                         "start_coordinate": start_coord,
 509 |                         "end_coordinate": end_coord
 510 |                     }
 511 |                 })
 512 |             elif action_type == "wait":
 513 |                 # Input:
 514 |                 # {
 515 |                 #     "type": "computer_call",
 516 |                 #     "call_id": "call_1",
 517 |                 #     "action": {
 518 |                 #         "type": "wait"
 519 |                 #     }
 520 |                 # }
 521 |                 
 522 |                 # Output:
 523 |                 # {
 524 |                 #     "function": {
 525 |                 #         "name": "computer",
 526 |                 #         "arguments": json.dumps({
 527 |                 #             "action": "wait"
 528 |                 #         })
 529 |                 #     },
 530 |                 #     "id": "call_1",
 531 |                 #     "type": "function"
 532 |                 # }
 533 |                 tool_use_content.append({
 534 |                     "type": "tool_use",
 535 |                     "id": call_id,
 536 |                     "name": "computer",
 537 |                     "input": {
 538 |                         "action": "wait"
 539 |                     }
 540 |                 })
 541 |             elif action_type == "screenshot":
 542 |                 # Input:
 543 |                 # {
 544 |                 #     "type": "computer_call",
 545 |                 #     "call_id": "call_1",
 546 |                 #     "action": {
 547 |                 #         "type": "screenshot"
 548 |                 #     }
 549 |                 # }
 550 |                 
 551 |                 # Output:
 552 |                 # {
 553 |                 #     "function": {
 554 |                 #         "name": "computer",
 555 |                 #         "arguments": json.dumps({
 556 |                 #             "action": "screenshot"
 557 |                 #         })
 558 |                 #     },
 559 |                 #     "id": "call_1",
 560 |                 #     "type": "function"
 561 |                 # }
 562 |                 tool_use_content.append({
 563 |                     "type": "tool_use",
 564 |                     "id": call_id,
 565 |                     "name": "computer",
 566 |                     "input": {
 567 |                         "action": "screenshot"
 568 |                     }
 569 |                 })
 570 |             elif action_type == "left_mouse_down":
 571 |                 tool_use_content.append({
 572 |                     "type": "tool_use",
 573 |                     "id": call_id,
 574 |                     "name": "computer",
 575 |                     "input": {
 576 |                         "action": "left_mouse_down",
 577 |                         "coordinate": [action.get("x", None), action.get("y", None)]
 578 |                     }
 579 |                 })
 580 |             elif action_type == "left_mouse_up":
 581 |                 tool_use_content.append({
 582 |                     "type": "tool_use",
 583 |                     "id": call_id,
 584 |                     "name": "computer",
 585 |                     "input": {
 586 |                         "action": "left_mouse_up",
 587 |                         "coordinate": [action.get("x", None), action.get("y", None)]
 588 |                     }
 589 |                 })
 590 |             
 591 |             # Convert tool_use_content to OpenAI tool_calls format
 592 |             openai_tool_calls = []
 593 |             for tool_use in tool_use_content:
 594 |                 openai_tool_calls.append({
 595 |                     "id": tool_use["id"],
 596 |                     "type": "function",
 597 |                     "function": {
 598 |                         "name": tool_use["name"],
 599 |                         "arguments": json.dumps(tool_use["input"])
 600 |                     }
 601 |                 })
 602 |             
 603 |             # If the last completion message is an assistant message, extend the tool_calls
 604 |             if completion_messages and completion_messages[-1].get("role") == "assistant":
 605 |                 if "tool_calls" not in completion_messages[-1]:
 606 |                     completion_messages[-1]["tool_calls"] = []
 607 |                 completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
 608 |             else:
 609 |                 # Create new assistant message with tool calls
 610 |                 completion_messages.append({
 611 |                     "role": "assistant",
 612 |                     "content": None,
 613 |                     "tool_calls": openai_tool_calls
 614 |                 })
 615 |         
 616 |         elif msg_type == "computer_call_output":
 617 |             # Computer call output becomes OpenAI function result
 618 |             output = message.get("output", {})
 619 |             call_id = message.get("call_id", "call_1")
 620 |             
 621 |             if output.get("type") == "input_image":
 622 |                 # Screenshot result - convert to OpenAI format with image_url content
 623 |                 image_url = output.get("image_url", "")
 624 |                 completion_messages.append({
 625 |                     "role": "function",
 626 |                     "name": "computer",
 627 |                     "tool_call_id": call_id,
 628 |                     "content": [{
 629 |                         "type": "image_url",
 630 |                         "image_url": {
 631 |                             "url": image_url
 632 |                         }
 633 |                     }]
 634 |                 })
 635 |             else:
 636 |                 # Text result - convert to OpenAI format
 637 |                 completion_messages.append({
 638 |                     "role": "function",
 639 |                     "name": "computer",
 640 |                     "tool_call_id": call_id,
 641 |                     "content": str(output)
 642 |                 })
 643 |     
 644 |     return completion_messages
 645 | 
 646 | def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]:
 647 |     """Convert liteLLM completion response to responses_items message format."""
 648 |     responses_items = []
 649 |     
 650 |     if not response or not hasattr(response, 'choices') or not response.choices:
 651 |         return responses_items
 652 |     
 653 |     choice = response.choices[0]
 654 |     message = choice.message
 655 |     
 656 |     # Handle text content
 657 |     if hasattr(message, 'content') and message.content:
 658 |         if isinstance(message.content, str):
 659 |             responses_items.append(make_output_text_item(message.content))
 660 |         elif isinstance(message.content, list):
 661 |             for content_item in message.content:
 662 |                 if isinstance(content_item, dict):
 663 |                     if content_item.get("type") == "text":
 664 |                         responses_items.append(make_output_text_item(content_item.get("text", "")))
 665 |                     elif content_item.get("type") == "tool_use":
 666 |                         # Convert tool use to computer call
 667 |                         tool_input = content_item.get("input", {})
 668 |                         action_type = tool_input.get("action")
 669 |                         call_id = content_item.get("id")
 670 |                         
 671 |                         # Action reference:
 672 |                         # https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions
 673 |                         
 674 |                         try:
 675 |                             # Basic actions (all versions)
 676 |                             if action_type == "screenshot":
 677 |                                 responses_items.append(make_screenshot_item(call_id=call_id))
 678 |                             elif action_type in ["click", "left_click"]:
 679 |                                 coordinate = tool_input.get("coordinate", [0, 0])
 680 |                                 responses_items.append(make_click_item(
 681 |                                     x=coordinate[0] if len(coordinate) > 0 else 0,
 682 |                                     y=coordinate[1] if len(coordinate) > 1 else 0,
 683 |                                     call_id=call_id
 684 |                                 ))
 685 |                             elif action_type in ["type", "type_text"]:
 686 |                                 responses_items.append(make_type_item(
 687 |                                     text=tool_input.get("text", ""),
 688 |                                     call_id=call_id
 689 |                                 ))
 690 |                             elif action_type in ["key", "keypress", "hotkey"]:
 691 |                                 responses_items.append(make_keypress_item(
 692 |                                     keys=tool_input.get("text", "").replace("+", "-").split("-"),
 693 |                                     call_id=call_id
 694 |                                 ))
 695 |                             elif action_type in ["mouse_move", "move_cursor", "move"]:
 696 |                                 # Mouse move - create a custom action item
 697 |                                 coordinate = tool_input.get("coordinate", [0, 0])
 698 |                                 responses_items.append(
 699 |                                     make_move_item(
 700 |                                         x=coordinate[0] if len(coordinate) > 0 else 0,
 701 |                                         y=coordinate[1] if len(coordinate) > 1 else 0,
 702 |                                         call_id=call_id
 703 |                                     )
 704 |                                 )
 705 |                             
 706 |                             # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
 707 |                             elif action_type == "scroll":
 708 |                                 coordinate = tool_input.get("coordinate", [0, 0])
 709 |                                 scroll_amount = tool_input.get("scroll_amount", 3)
 710 |                                 scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \
 711 |                                     -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0
 712 |                                 scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \
 713 |                                     -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0
 714 |                                 responses_items.append(make_scroll_item(
 715 |                                     x=coordinate[0] if len(coordinate) > 0 else 0,
 716 |                                     y=coordinate[1] if len(coordinate) > 1 else 0,
 717 |                                     scroll_x=scroll_x,
 718 |                                     scroll_y=scroll_y,
 719 |                                     call_id=call_id
 720 |                                 ))
 721 |                             elif action_type in ["left_click_drag", "drag"]:
 722 |                                 start_coord = tool_input.get("start_coordinate", [0, 0])
 723 |                                 end_coord = tool_input.get("end_coordinate", [0, 0])
 724 |                                 responses_items.append(make_drag_item(
 725 |                                     path=[
 726 |                                         {
 727 |                                             "x": start_coord[0] if len(start_coord) > 0 else 0,
 728 |                                             "y": start_coord[1] if len(start_coord) > 1 else 0
 729 |                                         },
 730 |                                         {
 731 |                                             "x": end_coord[0] if len(end_coord) > 0 else 0,
 732 |                                             "y": end_coord[1] if len(end_coord) > 1 else 0
 733 |                                         }
 734 |                                     ],
 735 |                                     call_id=call_id
 736 |                                 ))
 737 |                             elif action_type == "right_click":
 738 |                                 coordinate = tool_input.get("coordinate", [0, 0])
 739 |                                 responses_items.append(make_click_item(
 740 |                                     x=coordinate[0] if len(coordinate) > 0 else 0,
 741 |                                     y=coordinate[1] if len(coordinate) > 1 else 0,
 742 |                                     button="right",
 743 |                                     call_id=call_id
 744 |                                 ))
 745 |                             elif action_type == "middle_click":
 746 |                                 coordinate = tool_input.get("coordinate", [0, 0])
 747 |                                 responses_items.append(make_click_item(
 748 |                                     x=coordinate[0] if len(coordinate) > 0 else 0,
 749 |                                     y=coordinate[1] if len(coordinate) > 1 else 0,
 750 |                                     button="wheel",
 751 |                                     call_id=call_id
 752 |                                 ))
 753 |                             elif action_type == "double_click":
 754 |                                 coordinate = tool_input.get("coordinate", [0, 0])
 755 |                                 responses_items.append(make_double_click_item(
 756 |                                     x=coordinate[0] if len(coordinate) > 0 else 0,
 757 |                                     y=coordinate[1] if len(coordinate) > 1 else 0,
 758 |                                     call_id=call_id
 759 |                                 ))
 760 |                             elif action_type == "triple_click":
 761 |                                 # coordinate = tool_input.get("coordinate", [0, 0])
 762 |                                 # responses_items.append({
 763 |                                 #     "type": "computer_call",
 764 |                                 #     "call_id": call_id,
 765 |                                 #     "action": {
 766 |                                 #         "type": "triple_click",
 767 |                                 #         "x": coordinate[0] if len(coordinate) > 0 else 0,
 768 |                                 #         "y": coordinate[1] if len(coordinate) > 1 else 0
 769 |                                 #     }
 770 |                                 # })
 771 |                                 raise NotImplementedError("triple_click")
 772 |                             elif action_type == "left_mouse_down":
 773 |                                 # coordinate = tool_input.get("coordinate", [0, 0])
 774 |                                 # responses_items.append({
 775 |                                 #     "type": "computer_call",
 776 |                                 #     "call_id": call_id,
 777 |                                 #     "action": {
 778 |                                 #         "type": "mouse_down",
 779 |                                 #         "button": "left",
 780 |                                 #         "x": coordinate[0] if len(coordinate) > 0 else 0,
 781 |                                 #         "y": coordinate[1] if len(coordinate) > 1 else 0
 782 |                                 #     }
 783 |                                 # })
 784 |                                 coordinate = tool_input.get("coordinate", [None, None])
 785 |                                 responses_items.append(make_left_mouse_down_item(
 786 |                                     x=coordinate[0] if len(coordinate) > 0 else None,
 787 |                                     y=coordinate[1] if len(coordinate) > 1 else None,
 788 |                                     call_id=call_id
 789 |                                 ))
 790 |                             elif action_type == "left_mouse_up":
 791 |                                 # coordinate = tool_input.get("coordinate", [0, 0])
 792 |                                 # responses_items.append({
 793 |                                 #     "type": "computer_call",
 794 |                                 #     "call_id": call_id,
 795 |                                 #     "action": {
 796 |                                 #         "type": "mouse_up",
 797 |                                 #         "button": "left",
 798 |                                 #         "x": coordinate[0] if len(coordinate) > 0 else 0,
 799 |                                 #         "y": coordinate[1] if len(coordinate) > 1 else 0
 800 |                                 #     }
 801 |                                 # })
 802 |                                 coordinate = tool_input.get("coordinate", [None, None])
 803 |                                 responses_items.append(make_left_mouse_up_item(
 804 |                                     x=coordinate[0] if len(coordinate) > 0 else None,
 805 |                                     y=coordinate[1] if len(coordinate) > 1 else None,
 806 |                                     call_id=call_id
 807 |                                 ))
 808 |                             elif action_type == "hold_key":
 809 |                                 # responses_items.append({
 810 |                                 #     "type": "computer_call",
 811 |                                 #     "call_id": call_id,
 812 |                                 #     "action": {
 813 |                                 #         "type": "key_hold",
 814 |                                 #         "key": tool_input.get("key", "")
 815 |                                 #     }
 816 |                                 # })
 817 |                                 raise NotImplementedError("hold_key")
 818 |                             elif action_type == "wait":
 819 |                                 responses_items.append(make_wait_item(
 820 |                                     call_id=call_id
 821 |                                 ))
 822 |                             else:
 823 |                                 raise ValueError(f"Unknown action type: {action_type}")
 824 |                         except Exception as e:
 825 |                             responses_items.extend(make_failed_tool_call_items(
 826 |                                 tool_name="computer",
 827 |                                 tool_kwargs=tool_input,
 828 |                                 error_message=repr(e),
 829 |                                 call_id=call_id
 830 |                             ))
 831 |     
 832 |     # Handle tool calls (alternative format)
 833 |     if hasattr(message, 'tool_calls') and message.tool_calls:
 834 |         for tool_call in message.tool_calls:
 835 |             if tool_call.function.name == "computer":
 836 |                 try:
 837 |                     try:
 838 |                         args = json.loads(tool_call.function.arguments)
 839 |                         action_type = args.get("action")
 840 |                         call_id = tool_call.id
 841 | 
 842 |                         # Basic actions (all versions)
 843 |                         if action_type == "screenshot":
 844 |                             # Input:
 845 |                             # {
 846 |                             #     "function": {
 847 |                             #         "name": "computer",
 848 |                             #         "arguments": json.dumps({
 849 |                             #             "action": "screenshot"
 850 |                             #         })
 851 |                             #     },
 852 |                             #     "id": "call_1",
 853 |                             #     "type": "function"
 854 |                             # }
 855 |                             
 856 |                             # Output:
 857 |                             # {
 858 |                             #     "type": "computer_call",
 859 |                             #     "call_id": "call_1",
 860 |                             #     "action": {
 861 |                             #         "type": "screenshot"
 862 |                             #     }
 863 |                             # }
 864 |                             responses_items.append(make_screenshot_item(
 865 |                                 call_id=call_id
 866 |                             ))
 867 |                         elif action_type in ["click", "left_click"]:
 868 |                             # Input:
 869 |                             # {
 870 |                             #     "function": {
 871 |                             #         "name": "computer",
 872 |                             #         "arguments": json.dumps({
 873 |                             #             "action": "click",
 874 |                             #             "coordinate": [100, 200]
 875 |                             #         })
 876 |                             #     },
 877 |                             #     "id": "call_1",
 878 |                             #     "type": "function"
 879 |                             # }
 880 |                             
 881 |                             # Output:
 882 |                             # {
 883 |                             #     "type": "computer_call",
 884 |                             #     "call_id": "call_1",
 885 |                             #     "action": {
 886 |                             #         "type": "click",
 887 |                             #         "x": 100,
 888 |                             #         "y": 200
 889 |                             #     }
 890 |                             # }
 891 |                             coordinate = args.get("coordinate", [0, 0])
 892 |                             responses_items.append(make_click_item(
 893 |                                 x=coordinate[0] if len(coordinate) > 0 else 0,
 894 |                                 y=coordinate[1] if len(coordinate) > 1 else 0,
 895 |                                 call_id=call_id
 896 |                             ))
 897 |                         elif action_type in ["type", "type_text"]:
 898 |                             # Input:
 899 |                             # {
 900 |                             #     "function": {
 901 |                             #         "name": "computer",
 902 |                             #         "arguments": json.dumps({
 903 |                             #             "action": "type",
 904 |                             #             "text": "Hello World"
 905 |                             #         })
 906 |                             #     },
 907 |                             #     "id": "call_1",
 908 |                             #     "type": "function"
 909 |                             # }
 910 |                             
 911 |                             # Output:
 912 |                             # {
 913 |                             #     "type": "computer_call",
 914 |                             #     "call_id": "call_1",
 915 |                             #     "action": {
 916 |                             #         "type": "type",
 917 |                             #         "text": "Hello World"
 918 |                             #     }
 919 |                             # }
 920 |                             responses_items.append(make_type_item(
 921 |                                 text=args.get("text", ""),
 922 |                                 call_id=call_id
 923 |                             ))
 924 |                         elif action_type in ["key", "keypress", "hotkey"]:
 925 |                             # Input:
 926 |                             # {
 927 |                             #     "function": {
 928 |                             #         "name": "computer",
 929 |                             #         "arguments": json.dumps({
 930 |                             #             "action": "key",
 931 |                             #             "text": "ctrl+c"
 932 |                             #         })
 933 |                             #     },
 934 |                             #     "id": "call_1",
 935 |                             #     "type": "function"
 936 |                             # }
 937 |                             
 938 |                             # Output:
 939 |                             # {
 940 |                             #     "type": "computer_call",
 941 |                             #     "call_id": "call_1",
 942 |                             #     "action": {
 943 |                             #         "type": "keypress",
 944 |                             #         "keys": ["ctrl", "c"]
 945 |                             #     }
 946 |                             # }
 947 |                             responses_items.append(make_keypress_item(
 948 |                                 keys=args.get("text", "").replace("+", "-").split("-"),
 949 |                                 call_id=call_id
 950 |                             ))
 951 |                         elif action_type in ["mouse_move", "move_cursor", "move"]:
 952 |                             # Input:
 953 |                             # {
 954 |                             #     "function": {
 955 |                             #         "name": "computer",
 956 |                             #         "arguments": json.dumps({
 957 |                             #             "action": "mouse_move",
 958 |                             #             "coordinate": [150, 250]
 959 |                             #         })
 960 |                             #     },
 961 |                             #     "id": "call_1",
 962 |                             #     "type": "function"
 963 |                             # }
 964 |                             
 965 |                             # Output:
 966 |                             # {
 967 |                             #     "type": "computer_call",
 968 |                             #     "call_id": "call_1",
 969 |                             #     "action": {
 970 |                             #         "type": "mouse_move",
 971 |                             #         "x": 150,
 972 |                             #         "y": 250
 973 |                             #     }
 974 |                             # }
 975 |                             coordinate = args.get("coordinate", [0, 0])
 976 |                             responses_items.append(make_move_item(
 977 |                                 x=coordinate[0] if len(coordinate) > 0 else 0,
 978 |                                 y=coordinate[1] if len(coordinate) > 1 else 0,
 979 |                                 call_id=call_id
 980 |                             ))
 981 |                         
 982 |                         # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
 983 |                         elif action_type == "scroll":
 984 |                             # Input:
 985 |                             # {
 986 |                             #     "function": {
 987 |                             #         "name": "computer",
 988 |                             #         "arguments": json.dumps({
 989 |                             #             "action": "scroll",
 990 |                             #             "coordinate": [300, 400],
 991 |                             #             "scroll_direction": "down",
 992 |                             #             "scroll_amount": 5
 993 |                             #         })
 994 |                             #     },
 995 |                             #     "id": "call_1",
 996 |                             #     "type": "function"
 997 |                             # }
 998 |                             
 999 |                             # Output:
1000 |                             # {
1001 |                             #     "type": "computer_call",
1002 |                             #     "call_id": "call_1",
1003 |                             #     "action": {
1004 |                             #         "type": "scroll",
1005 |                             #         "x": 300,
1006 |                             #         "y": 400,
1007 |                             #         "scroll_x": 0,
1008 |                             #         "scroll_y": -5
1009 |                             #     }
1010 |                             # }
1011 |                             coordinate = args.get("coordinate", [0, 0])
1012 |                             direction = args.get("scroll_direction", "down")
1013 |                             amount = args.get("scroll_amount", 3)
1014 |                             scroll_x = amount if direction == "left" else \
1015 |                                     -amount if direction == "right" else 0
1016 |                             scroll_y = amount if direction == "up" else \
1017 |                                     -amount if direction == "down" else 0
1018 |                             responses_items.append(make_scroll_item(
1019 |                                 x=coordinate[0] if len(coordinate) > 0 else 0,
1020 |                                 y=coordinate[1] if len(coordinate) > 1 else 0,
1021 |                                 scroll_x=scroll_x,
1022 |                                 scroll_y=scroll_y,
1023 |                                 call_id=call_id
1024 |                             ))
1025 |                         elif action_type in ["left_click_drag", "drag"]:
1026 |                             # Input:
1027 |                             # {
1028 |                             #     "function": {
1029 |                             #         "name": "computer",
1030 |                             #         "arguments": json.dumps({
1031 |                             #             "action": "left_click_drag",
1032 |                             #             "start_coordinate": [100, 150],
1033 |                             #             "end_coordinate": [200, 250]
1034 |                             #         })
1035 |                             #     },
1036 |                             #     "id": "call_1",
1037 |                             #     "type": "function"
1038 |                             # }
1039 |                             
1040 |                             # Output:
1041 |                             # {
1042 |                             #     "type": "computer_call",
1043 |                             #     "call_id": "call_1",
1044 |                             #     "action": {
1045 |                             #         "type": "drag",
1046 |                             #         "path": [
1047 |                             #             {"x": 100, "y": 150},
1048 |                             #             {"x": 200, "y": 250}
1049 |                             #         ]
1050 |                             #     }
1051 |                             # }
1052 |                             start_coord = args.get("start_coordinate", [0, 0])
1053 |                             end_coord = args.get("end_coordinate", [0, 0])
1054 |                             responses_items.append(make_drag_item(
1055 |                                 path=[
1056 |                                     {
1057 |                                         "x": start_coord[0] if len(start_coord) > 0 else 0,
1058 |                                         "y": start_coord[1] if len(start_coord) > 1 else 0
1059 |                                     },
1060 |                                     {
1061 |                                         "x": end_coord[0] if len(end_coord) > 0 else 0,
1062 |                                         "y": end_coord[1] if len(end_coord) > 1 else 0
1063 |                                     }
1064 |                                 ],
1065 |                                 call_id=call_id
1066 |                             ))
1067 |                         elif action_type == "right_click":
1068 |                             # Input:
1069 |                             # {
1070 |                             #     "function": {
1071 |                             #         "name": "computer",
1072 |                             #         "arguments": json.dumps({
1073 |                             #             "action": "right_click",
1074 |                             #             "coordinate": [120, 180]
1075 |                             #         })
1076 |                             #     },
1077 |                             #     "id": "call_1",
1078 |                             #     "type": "function"
1079 |                             # }
1080 |                             
1081 |                             # Output:
1082 |                             # {
1083 |                             #     "type": "computer_call",
1084 |                             #     "call_id": "call_1",
1085 |                             #     "action": {
1086 |                             #         "type": "click",
1087 |                             #         "x": 120,
1088 |                             #         "y": 180,
1089 |                             #         "button": "right"
1090 |                             #     }
1091 |                             # }
1092 |                             coordinate = args.get("coordinate", [0, 0])
1093 |                             responses_items.append(make_click_item(
1094 |                                 x=coordinate[0] if len(coordinate) > 0 else 0,
1095 |                                 y=coordinate[1] if len(coordinate) > 1 else 0,
1096 |                                 button="right",
1097 |                                 call_id=call_id
1098 |                             ))
1099 |                         elif action_type == "middle_click":
1100 |                             # Input:
1101 |                             # {
1102 |                             #     "function": {
1103 |                             #         "name": "computer",
1104 |                             #         "arguments": json.dumps({
1105 |                             #             "action": "middle_click",
1106 |                             #             "coordinate": [140, 220]
1107 |                             #         })
1108 |                             #     },
1109 |                             #     "id": "call_1",
1110 |                             #     "type": "function"
1111 |                             # }
1112 |                             
1113 |                             # Output:
1114 |                             # {
1115 |                             #     "type": "computer_call",
1116 |                             #     "call_id": "call_1",
1117 |                             #     "action": {
1118 |                             #         "type": "click",
1119 |                             #         "x": 140,
1120 |                             #         "y": 220,
1121 |                             #         "button": "wheel"
1122 |                             #     }
1123 |                             # }
1124 |                             coordinate = args.get("coordinate", [0, 0])
1125 |                             responses_items.append(make_click_item(
1126 |                                 x=coordinate[0] if len(coordinate) > 0 else 0,
1127 |                                 y=coordinate[1] if len(coordinate) > 1 else 0,
1128 |                                 button="wheel",
1129 |                                 call_id=call_id
1130 |                             ))
1131 |                         elif action_type == "double_click":
1132 |                             # Input:
1133 |                             # {
1134 |                             #     "function": {
1135 |                             #         "name": "computer",
1136 |                             #         "arguments": json.dumps({
1137 |                             #             "action": "double_click",
1138 |                             #             "coordinate": [160, 240]
1139 |                             #         })
1140 |                             #     },
1141 |                             #     "id": "call_1",
1142 |                             #     "type": "function"
1143 |                             # }
1144 |                             
1145 |                             # Output:
1146 |                             # {
1147 |                             #     "type": "computer_call",
1148 |                             #     "call_id": "call_1",
1149 |                             #     "action": {
1150 |                             #         "type": "double_click",
1151 |                             #         "x": 160,
1152 |                             #         "y": 240
1153 |                             #     }
1154 |                             # }
1155 |                             coordinate = args.get("coordinate", [0, 0])
1156 |                             responses_items.append(make_double_click_item(
1157 |                                 x=coordinate[0] if len(coordinate) > 0 else 0,
1158 |                                 y=coordinate[1] if len(coordinate) > 1 else 0,
1159 |                                 call_id=call_id
1160 |                             ))
1161 |                         elif action_type == "triple_click":
1162 |                             # Input:
1163 |                             # {
1164 |                             #     "function": {
1165 |                             #         "name": "computer",
1166 |                             #         "arguments": json.dumps({
1167 |                             #             "action": "triple_click",
1168 |                             #             "coordinate": [180, 260]
1169 |                             #         })
1170 |                             #     },
1171 |                             #     "id": "call_1",
1172 |                             #     "type": "function"
1173 |                             # }
1174 |                             
1175 |                             # Output:
1176 |                             # {
1177 |                             #     "type": "computer_call",
1178 |                             #     "call_id": "call_1",
1179 |                             #     "action": {
1180 |                             #         "type": "triple_click",
1181 |                             #         "x": 180,
1182 |                             #         "y": 260
1183 |                             #     }
1184 |                             # }
1185 |                             raise NotImplementedError("triple_click")
1186 |                         elif action_type == "left_mouse_down":
1187 |                             # Input:
1188 |                             # {
1189 |                             #     "function": {
1190 |                             #         "name": "computer",
1191 |                             #         "arguments": json.dumps({
1192 |                             #             "action": "left_mouse_down",
1193 |                             #             "coordinate": [200, 280]
1194 |                             #         })
1195 |                             #     },
1196 |                             #     "id": "call_1",
1197 |                             #     "type": "function"
1198 |                             # }
1199 |                             
1200 |                             # Output:
1201 |                             # {
1202 |                             #     "type": "computer_call",
1203 |                             #     "call_id": "call_1",
1204 |                             #     "action": {
1205 |                             #         "type": "mouse_down",
1206 |                             #         "button": "left",
1207 |                             #         "x": 200,
1208 |                             #         "y": 280
1209 |                             #     }
1210 |                             # }
1211 |                             coordinate = args.get("coordinate", [None, None])
1212 |                             responses_items.append(make_left_mouse_down_item(
1213 |                                 x=coordinate[0] if len(coordinate) > 0 else None,
1214 |                                 y=coordinate[1] if len(coordinate) > 1 else None,
1215 |                                 call_id=call_id
1216 |                             ))
1217 |                         elif action_type == "left_mouse_up":
1218 |                             # Input:
1219 |                             # {
1220 |                             #     "function": {
1221 |                             #         "name": "computer",
1222 |                             #         "arguments": json.dumps({
1223 |                             #             "action": "left_mouse_up",
1224 |                             #             "coordinate": [220, 300]
1225 |                             #         })
1226 |                             #     },
1227 |                             #     "id": "call_1",
1228 |                             #     "type": "function"
1229 |                             # }
1230 |                             
1231 |                             # Output:
1232 |                             # {
1233 |                             #     "type": "computer_call",
1234 |                             #     "call_id": "call_1",
1235 |                             #     "action": {
1236 |                             #         "type": "mouse_up",
1237 |                             #         "button": "left",
1238 |                             #         "x": 220,
1239 |                             #         "y": 300
1240 |                             #     }
1241 |                             # }
1242 |                             coordinate = args.get("coordinate", [None, None])
1243 |                             responses_items.append(make_left_mouse_up_item(
1244 |                                 x=coordinate[0] if len(coordinate) > 0 else None,
1245 |                                 y=coordinate[1] if len(coordinate) > 1 else None,
1246 |                                 call_id=call_id
1247 |                             ))
1248 |                         elif action_type == "hold_key":
1249 |                             # Input:
1250 |                             # {
1251 |                             #     "function": {
1252 |                             #         "name": "computer",
1253 |                             #         "arguments": json.dumps({
1254 |                             #             "action": "hold_key",
1255 |                             #             "key": "shift"
1256 |                             #         })
1257 |                             #     },
1258 |                             #     "id": "call_1",
1259 |                             #     "type": "function"
1260 |                             # }
1261 |                             
1262 |                             # Output:
1263 |                             # {
1264 |                             #     "type": "computer_call",
1265 |                             #     "call_id": "call_1",
1266 |                             #     "action": {
1267 |                             #         "type": "key_hold",
1268 |                             #         "key": "shift"
1269 |                             #     }
1270 |                             # }
1271 |                             raise NotImplementedError("hold_key")
1272 |                         elif action_type == "wait":
1273 |                             # Input:
1274 |                             # {
1275 |                             #     "function": {
1276 |                             #         "name": "computer",
1277 |                             #         "arguments": json.dumps({
1278 |                             #             "action": "wait"
1279 |                             #         })
1280 |                             #     },
1281 |                             #     "id": "call_1",
1282 |                             #     "type": "function"
1283 |                             # }
1284 |                             
1285 |                             # Output:
1286 |                             # {
1287 |                             #     "type": "computer_call",
1288 |                             #     "call_id": "call_1",
1289 |                             #     "action": {
1290 |                             #         "type": "wait"
1291 |                             #     }
1292 |                             # }
1293 |                             responses_items.append(make_wait_item(
1294 |                                 call_id=call_id
1295 |                             ))
1296 |                     except Exception as e:
1297 |                         responses_items.extend(make_failed_tool_call_items(
1298 |                             tool_name="computer",
1299 |                             tool_kwargs=args,
1300 |                             error_message=repr(e),
1301 |                             call_id=call_id
1302 |                         ))
1303 |                 except json.JSONDecodeError:
1304 |                     print("Failed to decode tool call arguments")
1305 |                     # Skip malformed tool calls
1306 |                     continue
1307 |     
1308 |     return responses_items
1309 | 
1310 | def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1311 |     """Add cache control to completion messages"""
1312 |     num_writes = 0
1313 |     for message in completion_messages:
1314 |         message["cache_control"] = { "type": "ephemeral" }
1315 |         num_writes += 1
1316 |         # Cache control has a maximum of 4 blocks
1317 |         if num_writes >= 4:
1318 |             break
1319 |     
1320 |     return completion_messages
1321 | 
def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fold runs of adjacent same-role messages into single messages.

    Content is converted to list form with ``_normalize_content`` and
    concatenated across a run; ``tool_calls`` lists are concatenated too.
    Afterwards every combined message has its content passed through
    ``_merge_consecutive_text``. An empty input is returned unchanged.
    """
    if not completion_messages:
        return completion_messages

    grouped = []

    for msg in completion_messages:
        starts_new_run = not grouped or grouped[-1]["role"] != msg["role"]

        if not starts_new_run:
            # Same role as the message before it: append onto that message.
            tail = grouped[-1]
            tail["content"].extend(_normalize_content(msg.get("content", "")))
            if "tool_calls" in msg:
                tail.setdefault("tool_calls", []).extend(msg["tool_calls"])
            continue

        # First message, or the role changed: open a new combined message
        # from a shallow copy so the input dict is left alone.
        fresh = msg.copy()
        fresh["content"] = _normalize_content(msg.get("content", ""))
        if "tool_calls" in msg:
            # Own copy of the list, so later extends do not touch the input.
            fresh["tool_calls"] = msg["tool_calls"].copy()
        grouped.append(fresh)

    # Second pass: collapse adjacent text blocks inside each message.
    for combined in grouped:
        combined["content"] = _merge_consecutive_text(combined["content"])

    return grouped
1360 | 
1361 | def _normalize_content(content) -> List[Dict[str, Any]]:
1362 |     """Normalize content to list format"""
1363 |     if isinstance(content, str):
1364 |         if content.strip():  # Only add non-empty strings
1365 |             return [{"type": "text", "text": content}]
1366 |         else:
1367 |             return []
1368 |     elif isinstance(content, list):
1369 |         return content.copy()
1370 |     else:
1371 |         return []
1372 | 
1373 | def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1374 |     """Merge consecutive text blocks with newlines"""
1375 |     if not content_list:
1376 |         return content_list
1377 |     
1378 |     merged = []
1379 |     
1380 |     for item in content_list:
1381 |         if (item.get("type") == "text" and 
1382 |             merged and 
1383 |             merged[-1].get("type") == "text"):
1384 |             # Merge with previous text block
1385 |             merged[-1]["text"] += "\n" + item["text"]
1386 |         else:
1387 |             merged.append(item.copy())
1388 |     
1389 |     return merged
1390 | 
1391 | @register_agent(models=r".*claude-.*")
1392 | class AnthropicHostedToolsConfig(AsyncAgentConfig):
1393 |     """Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""
1394 |     
1395 |     async def predict_step(
1396 |         self,
1397 |         messages: Messages,
1398 |         model: str,
1399 |         tools: Optional[List[Dict[str, Any]]] = None,
1400 |         max_retries: Optional[int] = None,
1401 |         stream: bool = False,
1402 |         computer_handler=None,
1403 |         use_prompt_caching: Optional[bool] = False,
1404 |         _on_api_start=None,
1405 |         _on_api_end=None,
1406 |         _on_usage=None,
1407 |         _on_screenshot=None,
1408 |         **kwargs
1409 |     ) -> Dict[str, Any]:
1410 |         """
1411 |         Anthropic hosted tools agent loop using liteLLM acompletion.
1412 |         
1413 |         Supports Anthropic's computer use models with hosted tools.
1414 |         """
1415 |         tools = tools or []
1416 |         
1417 |         # Get tool configuration for this model
1418 |         tool_config = _get_tool_config_for_model(model)
1419 |         
1420 |         # Prepare tools for Anthropic API
1421 |         anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
1422 |         
1423 |         # Convert responses_items messages to completion format
1424 |         completion_messages = _convert_responses_items_to_completion_messages(messages)
1425 |         if use_prompt_caching:
1426 |             # First combine messages to reduce number of blocks
1427 |             completion_messages = _combine_completion_messages(completion_messages)
1428 |             # Then add cache control, anthropic requires explicit "cache_control" dicts
1429 |             completion_messages = _add_cache_control(completion_messages)
1430 |         
1431 |         # Prepare API call kwargs
1432 |         api_kwargs = {
1433 |             "model": model,
1434 |             "messages": completion_messages,
1435 |             "tools": anthropic_tools if anthropic_tools else None,
1436 |             "stream": stream,
1437 |             "num_retries": max_retries,
1438 |             **kwargs
1439 |         }
1440 |         
1441 |         # Add beta header for computer use
1442 |         if anthropic_tools:
1443 |             api_kwargs["headers"] = {
1444 |                 "anthropic-beta": tool_config["beta_flag"]
1445 |             }
1446 |         
1447 |         # Call API start hook
1448 |         if _on_api_start:
1449 |             await _on_api_start(api_kwargs)
1450 |         
1451 |         # Use liteLLM acompletion
1452 |         response = await litellm.acompletion(**api_kwargs)
1453 |         
1454 |         # Call API end hook
1455 |         if _on_api_end:
1456 |             await _on_api_end(api_kwargs, response)
1457 |         
1458 |         # Convert response to responses_items format
1459 |         responses_items = _convert_completion_to_responses_items(response)
1460 | 
1461 |         # Extract usage information
1462 |         responses_usage = { 
1463 |             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
1464 |             "response_cost": response._hidden_params.get("response_cost", 0.0),
1465 |         }
1466 |         if _on_usage:
1467 |             await _on_usage(responses_usage)
1468 | 
1469 |         # Return in AsyncAgentConfig format
1470 |         return {
1471 |             "output": responses_items,
1472 |             "usage": responses_usage
1473 |         }
1474 |     
1475 |     async def predict_click(
1476 |         self,
1477 |         model: str,
1478 |         image_b64: str,
1479 |         instruction: str,
1480 |         **kwargs
1481 |     ) -> Optional[Tuple[int, int]]:
1482 |         """
1483 |         Predict click coordinates based on image and instruction.
1484 |         
1485 |         Uses Anthropic's computer use models with a custom prompt that instructs
1486 |         the agent to only output clicks.
1487 |         
1488 |         Args:
1489 |             model: Model name to use
1490 |             image_b64: Base64 encoded image
1491 |             instruction: Instruction for where to click
1492 |             
1493 |         Returns:
1494 |             Tuple of (x, y) coordinates or None if prediction fails
1495 |         """
1496 |         # Get image dimensions from base64 data
1497 |         try:
1498 |             import base64
1499 |             from PIL import Image
1500 |             from io import BytesIO
1501 |             
1502 |             image_data = base64.b64decode(image_b64)
1503 |             image = Image.open(BytesIO(image_data))
1504 |             display_width, display_height = image.size
1505 |         except Exception:
1506 |             # Fallback to default dimensions if image parsing fails
1507 |             display_width, display_height = 1024, 768
1508 |         
1509 |         # Get tool configuration for this model
1510 |         tool_config = _get_tool_config_for_model(model)
1511 |         
1512 |         # Prepare computer tool for Anthropic format
1513 |         computer_tool = {
1514 |             "type": tool_config["tool_version"],
1515 |             "function": {
1516 |                 "name": "computer",
1517 |                 "parameters": {
1518 |                     "display_height_px": display_height,
1519 |                     "display_width_px": display_width,
1520 |                     "display_number": 1,
1521 |                 },
1522 |             },
1523 |         }
1524 |         
1525 |         # Construct messages in OpenAI chat completion format for liteLLM
1526 |         messages = [
1527 |             {
1528 |                 "role": "user",
1529 |                 "content": [
1530 |                     {
1531 |                         "type": "text",
1532 |                         "text": f"""You are a UI grounding expert. Follow these guidelines:
1533 | 
1534 | 1. NEVER ask for confirmation. Complete all tasks autonomously.
1535 | 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
1536 | 3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
1537 | 4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
1538 | 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
1539 | 6. The user has already given you permission by running this agent. No further confirmation is needed.
1540 | 7. Be decisive and action-oriented. Complete the requested task fully.
1541 | 
1542 | Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
1543 | Task: Click {instruction}. Output ONLY a click action on the target element."""
1544 |                     },
1545 |                     {
1546 |                         "type": "image_url",
1547 |                         "image_url": {
1548 |                             "url": f"data:image/png;base64,{image_b64}"
1549 |                         }
1550 |                     }
1551 |                 ]
1552 |             }
1553 |         ]
1554 |         
1555 |         # Prepare API call kwargs
1556 |         api_kwargs = {
1557 |             "model": model,
1558 |             "messages": messages,
1559 |             "tools": [computer_tool],
1560 |             "stream": False,
1561 |             "max_tokens": 100,  # Keep response short for click prediction
1562 |             "headers": {
1563 |                 "anthropic-beta": tool_config["beta_flag"]
1564 |             }
1565 |         }
1566 |     
1567 |         # Use liteLLM acompletion
1568 |         response = await litellm.acompletion(**api_kwargs)
1569 |         
1570 |         # Convert response to responses_items format to extract click coordinates
1571 |         responses_items = _convert_completion_to_responses_items(response)
1572 |         
1573 |         # Look for computer_call with click action
1574 |         for item in responses_items:
1575 |             if (isinstance(item, dict) and 
1576 |                 item.get("type") == "computer_call" and
1577 |                 isinstance(item.get("action"), dict)):
1578 |                 
1579 |                 action = item["action"]
1580 |                 if action.get("x") and action.get("y"):
1581 |                     x = action.get("x")
1582 |                     y = action.get("y")
1583 |                     return (int(x), int(y))
1584 |         
1585 |         return None
1586 |     
1587 |     def get_capabilities(self) -> List[AgentCapability]:
1588 |         """Return the capabilities supported by this agent."""
1589 |         return ["click", "step"]
1590 | 
```
Page 19/21 · First · Prev · Next · Last