This is page 19 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ └── py.code-workspace ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── windows-sandbox.md ├── 
CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── 
installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── 
som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ │ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift 
│ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ ├── server.py │ │ 
│ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ ├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ 
└── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py │ │ │ └── visualization.py │ │ └── tests │ │ 
└── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh 
│ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/handlers/macos.py: -------------------------------------------------------------------------------- ```python 1 | import pyautogui 2 | pyautogui.FAILSAFE = False 3 | from pynput.mouse import Button, Controller as MouseController 4 | from pynput.keyboard import Key, Controller as KeyboardController 5 | import time 6 | import base64 7 | from io import BytesIO 8 | from typing import Optional, Dict, Any, List, Tuple 9 | from ctypes import byref, c_void_p, POINTER 10 | from AppKit import NSWorkspace # type: ignore 11 | import AppKit 12 | from Quartz.CoreGraphics import * # type: ignore 13 | from Quartz.CoreGraphics import CGPoint, CGSize # type: ignore 14 | import Foundation 15 | from ApplicationServices import ( 16 | AXUIElementCreateSystemWide, # type: ignore 17 | AXUIElementCreateApplication, # type: 
ignore 18 | AXUIElementCopyAttributeValue, # type: ignore 19 | AXUIElementCopyAttributeValues, # type: ignore 20 | kAXFocusedWindowAttribute, # type: ignore 21 | kAXWindowsAttribute, # type: ignore 22 | kAXMainWindowAttribute, # type: ignore 23 | kAXChildrenAttribute, # type: ignore 24 | kAXRoleAttribute, # type: ignore 25 | kAXTitleAttribute, # type: ignore 26 | kAXValueAttribute, # type: ignore 27 | kAXDescriptionAttribute, # type: ignore 28 | kAXEnabledAttribute, # type: ignore 29 | kAXPositionAttribute, # type: ignore 30 | kAXSizeAttribute, # type: ignore 31 | kAXErrorSuccess, # type: ignore 32 | AXValueGetType, # type: ignore 33 | kAXValueCGSizeType, # type: ignore 34 | kAXValueCGPointType, # type: ignore 35 | kAXValueCFRangeType, # type: ignore 36 | AXUIElementGetTypeID, # type: ignore 37 | AXValueGetValue, # type: ignore 38 | kAXVisibleChildrenAttribute, # type: ignore 39 | kAXRoleDescriptionAttribute, # type: ignore 40 | kAXFocusedApplicationAttribute, # type: ignore 41 | kAXFocusedUIElementAttribute, # type: ignore 42 | kAXSelectedTextAttribute, # type: ignore 43 | kAXSelectedTextRangeAttribute, # type: ignore 44 | ) 45 | import objc 46 | import re 47 | import json 48 | import copy 49 | import asyncio 50 | from .base import BaseAccessibilityHandler, BaseAutomationHandler 51 | import logging 52 | 53 | logger = logging.getLogger(__name__) 54 | 55 | # Constants for accessibility API 56 | kAXErrorSuccess = 0 57 | kAXRoleAttribute = "AXRole" 58 | kAXTitleAttribute = "AXTitle" 59 | kAXValueAttribute = "AXValue" 60 | kAXWindowsAttribute = "AXWindows" 61 | kAXFocusedAttribute = "AXFocused" 62 | kAXPositionAttribute = "AXPosition" 63 | kAXSizeAttribute = "AXSize" 64 | kAXChildrenAttribute = "AXChildren" 65 | kAXMenuBarAttribute = "AXMenuBar" 66 | kAXMenuBarItemAttribute = "AXMenuBarItem" 67 | 68 | # Constants for window properties 69 | kCGWindowLayer = "kCGWindowLayer" # Z-order information (lower values are higher in the stack) 70 | kCGWindowAlpha = 
"kCGWindowAlpha" # Window opacity 71 | 72 | # Constants for application activation options 73 | NSApplicationActivationOptions = { 74 | "regular": 0, # Default activation 75 | "bringing_all_windows_forward": 1 << 0, # NSApplicationActivateAllWindows 76 | "ignoring_other_apps": 1 << 1 # NSApplicationActivateIgnoringOtherApps 77 | } 78 | 79 | def CFAttributeToPyObject(attrValue): 80 | """Convert Core Foundation attribute values to Python objects. 81 | 82 | Args: 83 | attrValue: Core Foundation attribute value to convert 84 | 85 | Returns: 86 | Converted Python object or None if conversion fails 87 | """ 88 | def list_helper(list_value): 89 | """Helper function to convert CF arrays to Python lists. 90 | 91 | Args: 92 | list_value: Core Foundation array to convert 93 | 94 | Returns: 95 | Python list containing converted items 96 | """ 97 | list_builder = [] 98 | for item in list_value: 99 | list_builder.append(CFAttributeToPyObject(item)) 100 | return list_builder 101 | 102 | def number_helper(number_value): 103 | """Helper function to convert CF numbers to Python numbers. 104 | 105 | Args: 106 | number_value: Core Foundation number to convert 107 | 108 | Returns: 109 | Python int or float, or None if conversion fails 110 | """ 111 | success, int_value = Foundation.CFNumberGetValue( # type: ignore 112 | number_value, Foundation.kCFNumberIntType, None # type: ignore 113 | ) 114 | if success: 115 | return int(int_value) 116 | 117 | success, float_value = Foundation.CFNumberGetValue( # type: ignore 118 | number_value, Foundation.kCFNumberDoubleType, None # type: ignore 119 | ) 120 | if success: 121 | return float(float_value) 122 | return None 123 | 124 | def axuielement_helper(element_value): 125 | """Helper function to handle AX UI elements. 
126 | 127 | Args: 128 | element_value: Accessibility UI element to process 129 | 130 | Returns: 131 | The element value unchanged 132 | """ 133 | return element_value 134 | 135 | cf_attr_type = Foundation.CFGetTypeID(attrValue) # type: ignore 136 | cf_type_mapping = { 137 | Foundation.CFStringGetTypeID(): str, # type: ignore 138 | Foundation.CFBooleanGetTypeID(): bool, # type: ignore 139 | Foundation.CFArrayGetTypeID(): list_helper, # type: ignore 140 | Foundation.CFNumberGetTypeID(): number_helper, # type: ignore 141 | AXUIElementGetTypeID(): axuielement_helper, # type: ignore 142 | } 143 | try: 144 | return cf_type_mapping[cf_attr_type](attrValue) 145 | except KeyError: 146 | # did not get a supported CF type. Move on to AX type 147 | pass 148 | 149 | ax_attr_type = AXValueGetType(attrValue) 150 | ax_type_map = { 151 | kAXValueCGSizeType: Foundation.NSSizeFromString, # type: ignore 152 | kAXValueCGPointType: Foundation.NSPointFromString, # type: ignore 153 | kAXValueCFRangeType: Foundation.NSRangeFromString, # type: ignore 154 | } 155 | try: 156 | search_result = re.search("{.*}", attrValue.description()) 157 | if search_result: 158 | extracted_str = search_result.group() 159 | return tuple(ax_type_map[ax_attr_type](extracted_str)) 160 | return None 161 | except KeyError: 162 | return None 163 | 164 | 165 | def element_attribute(element, attribute): 166 | """Get an attribute value from an accessibility element. 
167 | 168 | Args: 169 | element: The accessibility element 170 | attribute: The attribute name to retrieve 171 | 172 | Returns: 173 | The attribute value or None if not found 174 | """ 175 | if attribute == kAXChildrenAttribute: 176 | err, value = AXUIElementCopyAttributeValues(element, attribute, 0, 999, None) 177 | if err == kAXErrorSuccess: 178 | if isinstance(value, Foundation.NSArray): # type: ignore 179 | return CFAttributeToPyObject(value) 180 | else: 181 | return value 182 | err, value = AXUIElementCopyAttributeValue(element, attribute, None) 183 | if err == kAXErrorSuccess: 184 | if isinstance(value, Foundation.NSArray): # type: ignore 185 | return CFAttributeToPyObject(value) 186 | else: 187 | return value 188 | return None 189 | 190 | 191 | def element_value(element, type): 192 | """Extract a typed value from an accessibility element. 193 | 194 | Args: 195 | element: The accessibility element containing the value 196 | type: The expected value type 197 | 198 | Returns: 199 | The extracted value or None if extraction fails 200 | """ 201 | err, value = AXValueGetValue(element, type, None) 202 | if err == True: 203 | return value 204 | return None 205 | 206 | 207 | class UIElement: 208 | """Represents a UI element in the accessibility tree with position, size, and hierarchy information.""" 209 | 210 | def __init__(self, element, offset_x=0, offset_y=0, max_depth=None, parents_visible_bbox=None): 211 | """Initialize a UIElement from an accessibility element. 
212 | 213 | Args: 214 | element: The accessibility element to wrap 215 | offset_x: X offset for position calculations 216 | offset_y: Y offset for position calculations 217 | max_depth: Maximum depth to traverse for children 218 | parents_visible_bbox: Parent's visible bounding box for clipping 219 | """ 220 | self.ax_element = element 221 | self.content_identifier = "" 222 | self.identifier = "" 223 | self.name = "" 224 | self.children = [] 225 | self.description = "" 226 | self.role_description = "" 227 | self.value = None 228 | self.max_depth = max_depth 229 | 230 | # Set role 231 | self.role = element_attribute(element, kAXRoleAttribute) 232 | if self.role is None: 233 | self.role = "No role" 234 | 235 | # Set name 236 | self.name = element_attribute(element, kAXTitleAttribute) 237 | if self.name is not None: 238 | # Convert tuple to string if needed 239 | if isinstance(self.name, tuple): 240 | self.name = str(self.name[0]) if self.name else "" 241 | self.name = self.name.replace(" ", "_") 242 | 243 | # Set enabled 244 | self.enabled = element_attribute(element, kAXEnabledAttribute) 245 | if self.enabled is None: 246 | self.enabled = False 247 | 248 | # Set position and size 249 | position = element_attribute(element, kAXPositionAttribute) 250 | size = element_attribute(element, kAXSizeAttribute) 251 | start_position = element_value(position, kAXValueCGPointType) 252 | 253 | if self.role == "AXWindow" and start_position is not None: 254 | offset_x = start_position.x 255 | offset_y = start_position.y 256 | 257 | self.absolute_position = copy.copy(start_position) 258 | self.position = start_position 259 | if self.position is not None: 260 | self.position.x -= max(0, offset_x) 261 | self.position.y -= max(0, offset_y) 262 | self.size = element_value(size, kAXValueCGSizeType) 263 | 264 | self._set_bboxes(parents_visible_bbox) 265 | 266 | # Set component center 267 | if start_position is None or self.size is None: 268 | print("Position is None") 269 | return 270 | 
self.center = ( 271 | start_position.x + offset_x + self.size.width / 2, 272 | start_position.y + offset_y + self.size.height / 2, 273 | ) 274 | 275 | self.description = element_attribute(element, kAXDescriptionAttribute) 276 | self.role_description = element_attribute(element, kAXRoleDescriptionAttribute) 277 | attribute_value = element_attribute(element, kAXValueAttribute) 278 | 279 | # Set value 280 | self.value = attribute_value 281 | if attribute_value is not None: 282 | if isinstance(attribute_value, Foundation.NSArray): # type: ignore 283 | self.value = [] 284 | for value in attribute_value: 285 | self.value.append(value) 286 | # Check if it's an accessibility element by checking its type ID 287 | elif Foundation.CFGetTypeID(attribute_value) == AXUIElementGetTypeID(): # type: ignore 288 | self.value = UIElement(attribute_value, offset_x, offset_y) 289 | 290 | # Set children 291 | if self.max_depth is None or self.max_depth > 0: 292 | self.children = self._get_children(element, start_position, offset_x, offset_y) 293 | else: 294 | self.children = [] 295 | 296 | self.calculate_hashes() 297 | 298 | def _set_bboxes(self, parents_visible_bbox): 299 | """Set bounding box and visible bounding box for the element. 
300 | 301 | Args: 302 | parents_visible_bbox: Parent's visible bounding box for intersection calculation 303 | """ 304 | if not self.absolute_position or not self.size: 305 | self.bbox = None 306 | self.visible_bbox = None 307 | return 308 | self.bbox = [ 309 | int(self.absolute_position.x), 310 | int(self.absolute_position.y), 311 | int(self.absolute_position.x + self.size.width), 312 | int(self.absolute_position.y + self.size.height), 313 | ] 314 | if parents_visible_bbox: 315 | # check if not intersected 316 | if ( 317 | self.bbox[0] > parents_visible_bbox[2] 318 | or self.bbox[1] > parents_visible_bbox[3] 319 | or self.bbox[2] < parents_visible_bbox[0] 320 | or self.bbox[3] < parents_visible_bbox[1] 321 | ): 322 | self.visible_bbox = None 323 | else: 324 | self.visible_bbox = [ 325 | int(max(self.bbox[0], parents_visible_bbox[0])), 326 | int(max(self.bbox[1], parents_visible_bbox[1])), 327 | int(min(self.bbox[2], parents_visible_bbox[2])), 328 | int(min(self.bbox[3], parents_visible_bbox[3])), 329 | ] 330 | else: 331 | self.visible_bbox = self.bbox 332 | 333 | def _get_children(self, element, start_position, offset_x, offset_y): 334 | """Get child elements from the accessibility element. 
335 | 336 | Args: 337 | element: The parent accessibility element 338 | start_position: Starting position for offset calculations 339 | offset_x: X offset for child positioning 340 | offset_y: Y offset for child positioning 341 | 342 | Returns: 343 | List of UIElement children 344 | """ 345 | children = element_attribute(element, kAXChildrenAttribute) 346 | visible_children = element_attribute(element, kAXVisibleChildrenAttribute) 347 | found_children = [] 348 | if children is not None: 349 | found_children.extend(children) 350 | else: 351 | if visible_children is not None: 352 | found_children.extend(visible_children) 353 | 354 | result = [] 355 | if self.max_depth is None or self.max_depth > 0: 356 | for child in found_children: 357 | child = UIElement( 358 | child, 359 | offset_x, 360 | offset_y, 361 | self.max_depth - 1 if self.max_depth is not None else None, 362 | self.visible_bbox, 363 | ) 364 | result.append(child) 365 | return result 366 | 367 | def calculate_hashes(self): 368 | """Calculate unique identifiers for the element and its content.""" 369 | self.identifier = self.component_hash() 370 | self.content_identifier = self.children_content_hash(self.children) 371 | 372 | def component_hash(self): 373 | """Generate a hash identifier for this component based on its properties. 374 | 375 | Returns: 376 | MD5 hash string of component properties 377 | """ 378 | if self.position is None or self.size is None: 379 | return "" 380 | position_string = f"{self.position.x:.0f};{self.position.y:.0f}" 381 | size_string = f"{self.size.width:.0f};{self.size.height:.0f}" 382 | enabled_string = str(self.enabled) 383 | # Ensure role is a string 384 | role_string = "" 385 | if self.role is not None: 386 | role_string = str(self.role[0]) if isinstance(self.role, tuple) else str(self.role) 387 | return self.hash_from_string(position_string + size_string + enabled_string + role_string) 388 | 389 | def hash_from_string(self, string): 390 | """Generate MD5 hash from a string. 
391 | 392 | Args: 393 | string: Input string to hash 394 | 395 | Returns: 396 | MD5 hash hexdigest or empty string if input is None/empty 397 | """ 398 | if string is None or string == "": 399 | return "" 400 | from hashlib import md5 401 | 402 | return md5(string.encode()).hexdigest() 403 | 404 | def children_content_hash(self, children): 405 | """Generate a hash representing the content and structure of child elements. 406 | 407 | Args: 408 | children: List of child UIElement objects 409 | 410 | Returns: 411 | Combined hash of children content and structure 412 | """ 413 | if len(children) == 0: 414 | return "" 415 | all_content_hashes = [] 416 | all_hashes = [] 417 | for child in children: 418 | all_content_hashes.append(child.content_identifier) 419 | all_hashes.append(child.identifier) 420 | all_content_hashes.sort() 421 | if len(all_content_hashes) == 0: 422 | return "" 423 | content_hash = self.hash_from_string("".join(all_content_hashes)) 424 | content_structure_hash = self.hash_from_string("".join(all_hashes)) 425 | return self.hash_from_string(content_hash.join(content_structure_hash)) 426 | 427 | def to_dict(self): 428 | """Convert the UIElement to a dictionary representation. 429 | 430 | Returns: 431 | Dictionary containing all element properties and children 432 | """ 433 | def children_to_dict(children): 434 | """Convert list of children to dictionary format. 
def get_all_windows_zorder():
    """Collect every window known to the system together with a z-index.

    The on-screen window list is queried first; a window's ``z_index`` is its
    index in the reversed on-screen list, or -1 when the window is not in
    that list. Windows that report no bounds are left out.

    Returns:
        List of window dictionaries sorted by ascending z_index, each holding
        id, name, pid, owner, role, is_on_screen, bounds, layer, z_index and
        opacity
    """
    on_screen = Quartz.CGWindowListCopyWindowInfo(
        Quartz.kCGWindowListOptionOnScreenOnly,
        Quartz.kCGNullWindowID
    )
    depth_by_id = {}
    for depth, info in enumerate(on_screen[::-1]):
        depth_by_id[info['kCGWindowNumber']] = depth

    every_window = Quartz.CGWindowListCopyWindowInfo(
        Quartz.kCGWindowListOptionAll,
        Quartz.kCGNullWindowID
    )

    collected = []
    for info in every_window:
        frame = info.get('kCGWindowBounds', {})
        if not frame:
            # Nothing to place on screen for this entry
            continue

        number = info.get('kCGWindowNumber', 0)
        title = info.get('kCGWindowName', '')
        owner = info.get('kCGWindowOwnerName', '')

        # Classify the window from its owner / title pair
        if owner == "Dock" and title == "Dock":
            kind = "dock"
        elif owner == "Window Server" and title == "Menubar":
            kind = "menubar"
        elif owner in ("Window Server", "Dock"):
            kind = "desktop"
        else:
            kind = "app"

        entry = {
            "id": number,
            "name": title or "Unnamed Window",
            "pid": info.get('kCGWindowOwnerPID', 0),
            "owner": owner,
            "role": kind,
            "is_on_screen": info.get('kCGWindowIsOnscreen', False),
            "bounds": {
                "x": frame.get('X', 0),
                "y": frame.get('Y', 0),
                "width": frame.get('Width', 0),
                "height": frame.get('Height', 0)
            },
            "layer": info.get('kCGWindowLayer', 0),
            "z_index": depth_by_id.get(number, -1),
            "opacity": info.get('kCGWindowAlpha', 1.0)
        }
        collected.append(entry)

    collected.sort(key=lambda entry: entry["z_index"])
    return collected
def get_menubar_items(active_app_pid=None):
    """List the top-level menubar entries of an application.

    Args:
        active_app_pid: Process ID of the application to inspect; when None
            the frontmost application reported by NSWorkspace is used

    Returns:
        List of dictionaries with title, bounds, index and app_pid; empty
        when there is no frontmost app, or when the application element, its
        menubar or the menubar's children cannot be obtained
    """
    if active_app_pid is None:
        front_app = NSWorkspace.sharedWorkspace().frontmostApplication()
        if not front_app:
            return []
        active_app_pid = front_app.processIdentifier()

    app_ax = AXUIElementCreateApplication(active_app_pid)
    if app_ax is None:
        return []
    bar = element_attribute(app_ax, kAXMenuBarAttribute)
    if bar is None:
        return []
    entries = element_attribute(bar, kAXChildrenAttribute)
    if entries is None:
        return []

    collected = []
    for slot, entry in enumerate(entries):
        label = element_attribute(entry, kAXTitleAttribute)
        if not label:
            label = "Untitled"

        # Bounds stay at zero for any part the element does not report
        frame = {"x": 0, "y": 0, "width": 0, "height": 0}
        raw_origin = element_attribute(entry, kAXPositionAttribute)
        if raw_origin:
            point = element_value(raw_origin, kAXValueCGPointType)
            frame["x"] = getattr(point, 'x', 0)
            frame["y"] = getattr(point, 'y', 0)
        raw_extent = element_attribute(entry, kAXSizeAttribute)
        if raw_extent:
            extent = element_value(raw_extent, kAXValueCGSizeType)
            frame["width"] = getattr(extent, 'width', 0)
            frame["height"] = getattr(extent, 'height', 0)

        collected.append({
            "title": label,
            "bounds": frame,
            "index": slot,
            "app_pid": active_app_pid
        })
    return collected
def get_desktop_state(self):
    """Snapshot the desktop: running apps, on-screen windows, menubar and dock.

    Each on-screen window gets a ``children`` entry taken from the
    accessibility tree of its owning process. Windows and AX trees are paired
    per pid purely by order: the n-th on-screen window of a pid receives the
    children of that pid's n-th AX window, or an empty list when there is no
    such tree (or the tree only holds an error).

    Returns:
        Dictionary containing applications, windows, menubar_items, and dock_items
    """
    visible_windows = []
    for candidate in get_all_windows_zorder():
        if candidate.get("is_on_screen"):
            visible_windows.append(candidate)

    running_apps = self.get_running_apps()

    # pid -> serialized AX window trees (or error placeholders)
    ax_trees_by_pid = {}
    for app in running_apps:
        owner_pid = app.processIdentifier()
        try:
            app_ax = AXUIElementCreateApplication(owner_pid)
            status, ax_windows = AXUIElementCopyAttributeValue(app_ax, kAXWindowsAttribute, None)
            serialized = []
            if status == kAXErrorSuccess and ax_windows:
                for ax_window in ax_windows:
                    try:
                        tree = UIElement(ax_window).to_dict()
                    except Exception as e:
                        tree = {"error": str(e)}
                    serialized.append(tree)
            ax_trees_by_pid[owner_pid] = serialized
        except Exception as e:
            ax_trees_by_pid[owner_pid] = [{"error": str(e)}]

    # Pair windows with AX trees by pid and running position
    next_slot = {}
    window_ids_by_pid = {}
    for win in visible_windows:
        owner_pid = win["pid"]
        slot = next_slot.get(owner_pid, 0)
        trees = ax_trees_by_pid.get(owner_pid, [])
        subtree = []
        if slot < len(trees) and "children" in trees[slot]:
            subtree = trees[slot]["children"]
        win["children"] = subtree
        next_slot[owner_pid] = slot + 1
        window_ids_by_pid.setdefault(owner_pid, []).append(win["id"])

    applications = []
    for app in running_apps:
        details = get_app_info(app)
        applications.append({
            "info": details,
            "windows": window_ids_by_pid.get(details["pid"], [])
        })

    menubar_items = get_menubar_items()
    dock_items = get_dock_items()
    return {
        "applications": applications,
        "windows": visible_windows,
        "menubar_items": menubar_items,
        "dock_items": dock_items
    }
def get_all_windows(self):
    """Summarize the windows of every regular (foreground-capable) application.

    One dictionary is produced per running application whose activation
    policy is 0 (NSApplicationActivationPolicyRegular); other applications
    are skipped. An application whose inspection raises is skipped as well,
    and a failure outside the per-application loop yields an empty list.
    Only ``Exception`` subclasses are absorbed, so KeyboardInterrupt and
    SystemExit still propagate.

    Returns:
        List of dictionaries with app_name, pid, frontmost, has_windows and
        windows (the value returned by ``get_application_windows``)
    """
    try:
        windows = []
        for app in self.get_running_apps():
            try:
                app_name = app.localizedName()
                pid = app.processIdentifier()

                # Skip system processes and background apps
                if app.activationPolicy() != 0:  # NSApplicationActivationPolicyRegular
                    continue

                app_windows = self.get_application_windows(pid)
                windows.append(
                    {
                        "app_name": app_name,
                        "pid": pid,
                        "frontmost": app.isActive(),
                        "has_windows": len(app_windows) > 0,
                        "windows": app_windows,
                    }
                )
            except Exception:
                # Best effort: one misbehaving app must not hide the others
                continue
        return windows
    except Exception:
        return []
def serialize_node(self, element):
    """Turn an accessibility element into a plain dictionary.

    Args:
        element: The accessibility element to serialize

    Returns:
        Dictionary with role, title and value, plus position ({"x", "y"})
        and size ({"width", "height"}) when those attributes are present and
        can be indexed
    """
    node = {
        "role": self.get_ax_attribute(element, kAXRoleAttribute),
        "title": self.get_ax_attribute(element, kAXTitleAttribute),
        "value": self.get_ax_attribute(element, kAXValueAttribute),
    }

    # Position and size are optional; each is read as a two-item sequence
    geometry = (
        ("position", kAXPositionAttribute, "x", "y"),
        ("size", kAXSizeAttribute, "width", "height"),
    )
    for key, attribute, first_name, second_name in geometry:
        raw = self.get_ax_attribute(element, attribute)
        if not raw:
            continue
        try:
            node[key] = {first_name: raw[0], second_name: raw[1]}
        except (IndexError, TypeError):
            # Value is not indexable as expected: leave the key out
            continue

    return node
872 | 873 | Args: 874 | role: The accessibility role to match (optional) 875 | title: The title to match (optional) 876 | value: The value to match (optional) 877 | 878 | Returns: 879 | Dictionary containing success status and the found element or error message 880 | """ 881 | try: 882 | system = AXUIElementCreateSystemWide() 883 | 884 | def match_element(element): 885 | """Check if an element matches the search criteria. 886 | 887 | Args: 888 | element: The accessibility element to check 889 | 890 | Returns: 891 | True if element matches all specified criteria, False otherwise 892 | """ 893 | if role and self.get_ax_attribute(element, kAXRoleAttribute) != role: 894 | return False 895 | if title and self.get_ax_attribute(element, kAXTitleAttribute) != title: 896 | return False 897 | if value and str(self.get_ax_attribute(element, kAXValueAttribute)) != value: 898 | return False 899 | return True 900 | 901 | def search_tree(element): 902 | """Recursively search the accessibility tree for matching elements. 
async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]:
    """Press a mouse button and keep it held, optionally moving there first.

    Args:
        x: X coordinate (the pointer is moved only when both x and y are given)
        y: Y coordinate (the pointer is moved only when both x and y are given)
        button: "left" or "right"; any other value selects the middle button

    Returns:
        Dictionary containing success status and error message if failed
    """
    try:
        if not (x is None or y is None):
            self.mouse.position = (x, y)
        if button == "left":
            chosen = Button.left
        elif button == "right":
            chosen = Button.right
        else:
            chosen = Button.middle
        self.mouse.press(chosen)
    except Exception as e:
        return {"success": False, "error": str(e)}
    return {"success": True}
async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
    """Click the left mouse button once, optionally moving the pointer first.

    Args:
        x: X coordinate (the pointer is moved only when both x and y are given)
        y: Y coordinate (the pointer is moved only when both x and y are given)

    Returns:
        Dictionary containing success status and error message if failed
    """
    try:
        if not (x is None or y is None):
            self.mouse.position = (x, y)
        pointer = self.mouse
        pointer.click(Button.left, 1)
    except Exception as e:
        return {"success": False, "error": str(e)}
    return {"success": True}
async def move_cursor(self, x: int, y: int) -> Dict[str, Any]:
    """Place the mouse pointer at the given coordinates.

    Args:
        x: Target X coordinate
        y: Target Y coordinate

    Returns:
        Dictionary containing success status and error message if failed
    """
    destination = (x, y)
    try:
        self.mouse.position = destination
    except Exception as e:
        return {"success": False, "error": str(e)}
    return {"success": True}
async def drag(
    self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5
) -> Dict[str, Any]:
    """Drag the mouse along a specified path of coordinates.

    The pointer is moved to the first point, the button is pressed, the
    pointer then visits every remaining point with an equal pause after each
    move, and finally the button is released.

    Args:
        path: List of (x, y) coordinate tuples defining the drag path
            (at least two points)
        button: "left" or "right"; any other value selects the middle button
        duration: Total duration of the drag operation in seconds

    Returns:
        Dictionary containing success status and error message if failed
    """
    try:
        if not path or len(path) < 2:
            return {"success": False, "error": "Path must contain at least 2 points"}
        btn = Button.left if button == "left" else Button.right if button == "right" else Button.middle

        # Move to the first point and start the drag
        self.mouse.position = path[0]
        self.mouse.press(btn)
        try:
            # len(path) >= 2 is guaranteed above, so the divisor is never zero
            step_duration = duration / (len(path) - 1)
            for x, y in path[1:]:
                self.mouse.position = (x, y)
                # Yield to the event loop between steps; a blocking
                # time.sleep here would stall every other coroutine for the
                # whole drag.
                await asyncio.sleep(step_duration)
        except BaseException:
            # The button is known to be down: release it (best effort) even
            # on cancellation, then let the original error continue.
            try:
                self.mouse.release(btn)
            except Exception:
                pass
            raise
        self.mouse.release(btn)
        return {"success": True}
    except Exception as e:
        return {"success": False, "error": str(e)}
async def type_text(self, text: str) -> Dict[str, Any]:
    """Send a text string through the keyboard controller.

    Args:
        text: Text string to type

    Returns:
        Dictionary containing success status and error message if failed
    """
    try:
        # The pynput controller is used here for its Unicode support
        typist = self.keyboard
        typist.type(text)
    except Exception as e:
        return {"success": False, "error": str(e)}
    return {"success": True}
async def scroll_down(self, clicks: int = 1) -> Dict[str, Any]:
    """Scroll the wheel downwards.

    Args:
        clicks: Number of scroll clicks to perform; passed to the mouse
            controller as a negative vertical amount

    Returns:
        Dictionary containing success status and error message if failed
    """
    try:
        wheel = self.mouse.scroll
        wheel(0, -clicks)
    except Exception as e:
        return {"success": False, "error": str(e)}
    return {"success": True}
async def screenshot(self) -> Dict[str, Any]:
    """Grab the screen and return it as a base64-encoded PNG.

    Returns:
        Dictionary with success status and ``image_data`` (base64 text of the
        PNG bytes), or an error message when the capture is not a PIL image
        or anything raises
    """
    try:
        from PIL import Image

        capture = pyautogui.screenshot()
        if isinstance(capture, Image.Image):
            png_buffer = BytesIO()
            capture.save(png_buffer, format="PNG", optimize=True)
            encoded = base64.b64encode(png_buffer.getvalue()).decode()
            return {"success": True, "image_data": encoded}
        return {"success": False, "error": "Failed to capture screenshot"}
    except Exception as e:
        return {"success": False, "error": f"Screenshot error: {str(e)}"}
async def run_command(self, command: str) -> Dict[str, Any]:
    """Run a shell command and return its output.

    ``success`` reports that the command could be started and its output
    collected; it is True even when the command exits with a non-zero code,
    so callers should inspect ``return_code``.

    Args:
        command: Shell command to execute. NOTE(review): the string is handed
            to the shell verbatim, so it must come from a trusted caller.

    Returns:
        Dictionary containing success status, stdout, stderr, and return code,
        or success False with an error message when the process could not be
        run
    """
    try:
        process = await asyncio.create_subprocess_shell(
            command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        # Wait for the subprocess to finish and collect both streams
        stdout, stderr = await process.communicate()
        # Decode leniently: command output is not guaranteed to be valid
        # UTF-8, and a strict decode would discard the whole result
        # (including the return code) over a single bad byte.
        return {
            "success": True,
            "stdout": stdout.decode(errors="replace") if stdout else "",
            "stderr": stderr.decode(errors="replace") if stderr else "",
            "return_code": process.returncode
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
MODEL_TOOL_MAPPING = [ 34 | # Claude 4 models 35 | { 36 | "pattern": r"claude-4|claude-opus-4|claude-sonnet-4", 37 | "tool_version": "computer_20250124", 38 | "beta_flag": "computer-use-2025-01-24" 39 | }, 40 | # Claude 3.7 models 41 | { 42 | "pattern": r"claude-3\.?7|claude-3-7", 43 | "tool_version": "computer_20250124", 44 | "beta_flag": "computer-use-2025-01-24" 45 | }, 46 | # Claude 3.5 models (fallback) 47 | { 48 | "pattern": r"claude-3\.?5|claude-3-5", 49 | "tool_version": "computer_20241022", 50 | "beta_flag": "computer-use-2024-10-22" 51 | } 52 | ] 53 | 54 | def _get_tool_config_for_model(model: str) -> Dict[str, str]: 55 | """Get tool version and beta flag for the given model.""" 56 | import re 57 | 58 | for mapping in MODEL_TOOL_MAPPING: 59 | if re.search(mapping["pattern"], model, re.IGNORECASE): 60 | return { 61 | "tool_version": mapping["tool_version"], 62 | "beta_flag": mapping["beta_flag"] 63 | } 64 | 65 | # Default to Claude 3.5 configuration 66 | return { 67 | "tool_version": "computer_20241022", 68 | "beta_flag": "computer-use-2024-10-22" 69 | } 70 | 71 | async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]: 72 | """Map a computer tool to Anthropic's hosted tool schema.""" 73 | # Get dimensions from the computer handler 74 | try: 75 | width, height = await computer_tool.get_dimensions() 76 | except Exception: 77 | # Fallback to default dimensions if method fails 78 | width, height = 1024, 768 79 | 80 | return { 81 | "type": tool_version, 82 | "function": { 83 | "name": "computer", 84 | "parameters": { 85 | "display_height_px": height, 86 | "display_width_px": width, 87 | "display_number": 1, 88 | }, 89 | }, 90 | } 91 | 92 | async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools: 93 | """Prepare tools for Anthropic API format.""" 94 | tool_config = _get_tool_config_for_model(model) 95 | anthropic_tools = [] 96 | 97 | for schema in tool_schemas: 98 | if 
schema["type"] == "computer": 99 | # Map computer tool to Anthropic format 100 | anthropic_tools.append(await _map_computer_tool_to_anthropic( 101 | schema["computer"], 102 | tool_config["tool_version"] 103 | )) 104 | elif schema["type"] == "function": 105 | # Function tools - convert to Anthropic format 106 | function_schema = schema["function"] 107 | anthropic_tools.append({ 108 | "type": "function", 109 | "function": { 110 | "name": function_schema["name"], 111 | "description": function_schema.get("description", ""), 112 | "parameters": function_schema.get("parameters", {}) 113 | } 114 | }) 115 | 116 | return anthropic_tools 117 | 118 | def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]: 119 | """Convert responses_items message format to liteLLM completion format.""" 120 | completion_messages = [] 121 | call_id_to_fn_name = {} 122 | 123 | for message in messages: 124 | msg_type = message.get("type") 125 | role = message.get("role") 126 | 127 | # Handle user messages (both with and without explicit type) 128 | if role == "user" or msg_type == "user": 129 | content = message.get("content", "") 130 | if isinstance(content, list): 131 | # Multi-modal content - convert input_image to image format 132 | converted_content = [] 133 | for item in content: 134 | if isinstance(item, dict) and item.get("type") == "input_image": 135 | # Convert input_image to OpenAI image format 136 | image_url = item.get("image_url", "") 137 | if image_url and image_url != "[omitted]": 138 | converted_content.append({ 139 | "type": "image_url", 140 | "image_url": { 141 | "url": image_url 142 | } 143 | }) 144 | elif isinstance(item, dict) and item.get("type") == "input_text": 145 | # Convert input_text to OpenAI text format 146 | text = item.get("text", "") 147 | converted_content.append({ 148 | "type": "text", 149 | "text": text 150 | }) 151 | else: 152 | # Keep other content types as-is 153 | converted_content.append(item) 154 | 155 | 
completion_messages.append({ 156 | "role": "user", 157 | "content": converted_content if converted_content else content 158 | }) 159 | else: 160 | # Text content 161 | completion_messages.append({ 162 | "role": "user", 163 | "content": content 164 | }) 165 | 166 | # Handle assistant messages 167 | elif role == "assistant": 168 | content = message.get("content", []) 169 | if isinstance(content, str): 170 | content = [{ "type": "output_text", "text": content }] 171 | 172 | content = "\n".join(item.get("text", "") for item in content) 173 | completion_messages.append({ 174 | "role": "assistant", 175 | "content": content 176 | }) 177 | 178 | elif msg_type == "reasoning": 179 | # Reasoning becomes part of assistant message 180 | summary = message.get("summary", []) 181 | reasoning_text = "" 182 | 183 | if isinstance(summary, list) and summary: 184 | # Extract text from summary items 185 | for item in summary: 186 | if isinstance(item, dict) and item.get("type") == "summary_text": 187 | reasoning_text = item.get("text", "") 188 | break 189 | else: 190 | # Fallback to direct reasoning field 191 | reasoning_text = message.get("reasoning", "") 192 | 193 | if reasoning_text: 194 | completion_messages.append({ 195 | "role": "assistant", 196 | "content": reasoning_text 197 | }) 198 | 199 | elif msg_type == "function_call": 200 | fn_name = message.get("name") 201 | fn_args = message.get("arguments", "{}") 202 | call_id = message.get("call_id", "call_1") 203 | call_id_to_fn_name[call_id] = fn_name 204 | openai_tool_calls = [{ 205 | "id": call_id, 206 | "type": "function", 207 | "function": { 208 | "name": fn_name, 209 | "arguments": fn_args 210 | } 211 | }] # If the last completion message is an assistant message, extend the tool_calls 212 | if completion_messages and completion_messages[-1].get("role") == "assistant": 213 | if "tool_calls" not in completion_messages[-1]: 214 | completion_messages[-1]["tool_calls"] = [] 215 | 
completion_messages[-1]["tool_calls"].extend(openai_tool_calls) 216 | else: 217 | # Create new assistant message with tool calls 218 | completion_messages.append({ 219 | "role": "assistant", 220 | "content": None, 221 | "tool_calls": openai_tool_calls 222 | }) 223 | 224 | elif msg_type == "function_call_output": 225 | call_id = message.get("call_id", "call_1") 226 | fn_output = message.get("output", "") 227 | fn_name = call_id_to_fn_name.get(call_id, "computer") 228 | 229 | completion_messages.append({ 230 | "role": "function", 231 | "name": fn_name, 232 | "tool_call_id": call_id, 233 | "content": str(fn_output) 234 | }) 235 | 236 | elif msg_type == "computer_call": 237 | # Computer call becomes tool use in assistant message 238 | action = message.get("action", {}) 239 | action_type = action.get("type") 240 | call_id = message.get("call_id", "call_1") 241 | 242 | tool_use_content = [] 243 | 244 | # Basic actions (all versions) 245 | if action_type == "click": 246 | # Input: 247 | # { 248 | # "type": "computer_call", 249 | # "call_id": "call_1", 250 | # "action": { 251 | # "type": "click", 252 | # "x": 100, 253 | # "y": 200 254 | # } 255 | # } 256 | 257 | # Output: 258 | # { 259 | # "function": { 260 | # "name": "computer", 261 | # "arguments": json.dumps({ 262 | # "action": "click", 263 | # "coordinate": [100, 200] 264 | # }) 265 | # }, 266 | # "id": "call_1", 267 | # "type": "function" 268 | # } 269 | button = action.get("button", "left") 270 | action_name = "right_click" if button == "right" else "middle_click" if button == "wheel" else "left_click" 271 | tool_use_content.append({ 272 | "type": "tool_use", 273 | "id": call_id, 274 | "name": "computer", 275 | "input": { 276 | "action": action_name, 277 | "coordinate": [action.get("x", 0), action.get("y", 0)] 278 | } 279 | }) 280 | elif action_type == "double_click": 281 | # Input: 282 | # { 283 | # "type": "computer_call", 284 | # "call_id": "call_1", 285 | # "action": { 286 | # "type": "double_click", 287 | # 
"x": 160, 288 | # "y": 240 289 | # } 290 | # } 291 | 292 | # Output: 293 | # { 294 | # "function": { 295 | # "name": "computer", 296 | # "arguments": json.dumps({ 297 | # "action": "double_click", 298 | # "coordinate": [160, 240] 299 | # }) 300 | # }, 301 | # "id": "call_1", 302 | # "type": "function" 303 | # } 304 | tool_use_content.append({ 305 | "type": "tool_use", 306 | "id": call_id, 307 | "name": "computer", 308 | "input": { 309 | "action": "double_click", 310 | "coordinate": [action.get("x", 0), action.get("y", 0)] 311 | } 312 | }) 313 | elif action_type == "type": 314 | # Input: 315 | # { 316 | # "type": "computer_call", 317 | # "call_id": "call_1", 318 | # "action": { 319 | # "type": "type", 320 | # "text": "Hello World" 321 | # } 322 | # } 323 | 324 | # Output: 325 | # { 326 | # "function": { 327 | # "name": "computer", 328 | # "arguments": json.dumps({ 329 | # "action": "type", 330 | # "text": "Hello World" 331 | # }) 332 | # }, 333 | # "id": "call_1", 334 | # "type": "function" 335 | # } 336 | tool_use_content.append({ 337 | "type": "tool_use", 338 | "id": call_id, 339 | "name": "computer", 340 | "input": { 341 | "action": "type", 342 | "text": action.get("text", "") 343 | } 344 | }) 345 | elif action_type == "keypress": 346 | # Input: 347 | # { 348 | # "type": "computer_call", 349 | # "call_id": "call_1", 350 | # "action": { 351 | # "type": "keypress", 352 | # "keys": ["ctrl", "c"] 353 | # } 354 | # } 355 | 356 | # Output: 357 | # { 358 | # "function": { 359 | # "name": "computer", 360 | # "arguments": json.dumps({ 361 | # "action": "key", 362 | # "text": "ctrl+c" 363 | # }) 364 | # }, 365 | # "id": "call_1", 366 | # "type": "function" 367 | # } 368 | tool_use_content.append({ 369 | "type": "tool_use", 370 | "id": call_id, 371 | "name": "computer", 372 | "input": { 373 | "action": "key", 374 | "text": "+".join(action.get("keys", [])) 375 | } 376 | }) 377 | elif action_type in ["mouse_move", "move"]: 378 | # Input: 379 | # { 380 | # "type": 
"computer_call", 381 | # "call_id": "call_1", 382 | # "action": { 383 | # "type": "move", 384 | # "x": 150, 385 | # "y": 250 386 | # } 387 | # } 388 | 389 | # Output: 390 | # { 391 | # "function": { 392 | # "name": "computer", 393 | # "arguments": json.dumps({ 394 | # "action": "mouse_move", 395 | # "coordinate": [150, 250] 396 | # }) 397 | # }, 398 | # "id": "call_1", 399 | # "type": "function" 400 | # } 401 | tool_use_content.append({ 402 | "type": "tool_use", 403 | "id": call_id, 404 | "name": "computer", 405 | "input": { 406 | "action": "mouse_move", 407 | "coordinate": [action.get("x", 0), action.get("y", 0)] 408 | } 409 | }) 410 | elif action_type == "scroll": 411 | # Input: 412 | # { 413 | # "type": "computer_call", 414 | # "call_id": "call_1", 415 | # "action": { 416 | # "type": "scroll", 417 | # "x": 300, 418 | # "y": 400, 419 | # "scroll_x": 0, 420 | # "scroll_y": -5 421 | # } 422 | # } 423 | 424 | # Output: 425 | # { 426 | # "function": { 427 | # "name": "computer", 428 | # "arguments": json.dumps({ 429 | # "action": "scroll", 430 | # "coordinate": [300, 400], 431 | # "scroll_direction": "down", 432 | # "scroll_amount": 5 433 | # }) 434 | # }, 435 | # "id": "call_1", 436 | # "type": "function" 437 | # } 438 | scroll_x = action.get("scroll_x", 0) 439 | scroll_y = action.get("scroll_y", 0) 440 | # Determine direction and amount from scroll values 441 | if scroll_x > 0: 442 | direction = "left" 443 | amount = scroll_x 444 | elif scroll_x < 0: 445 | direction = "right" 446 | amount = -scroll_x 447 | elif scroll_y > 0: 448 | direction = "up" 449 | amount = scroll_y 450 | elif scroll_y < 0: 451 | direction = "down" 452 | amount = -scroll_y 453 | else: 454 | direction = "down" 455 | amount = 3 456 | 457 | tool_use_content.append({ 458 | "type": "tool_use", 459 | "id": call_id, 460 | "name": "computer", 461 | "input": { 462 | "action": "scroll", 463 | "coordinate": [action.get("x", 0), action.get("y", 0)], 464 | "scroll_direction": direction, 465 | 
"scroll_amount": amount 466 | } 467 | }) 468 | elif action_type == "drag": 469 | # Input: 470 | # { 471 | # "type": "computer_call", 472 | # "call_id": "call_1", 473 | # "action": { 474 | # "type": "drag", 475 | # "path": [ 476 | # {"x": 100, "y": 150}, 477 | # {"x": 200, "y": 250} 478 | # ] 479 | # } 480 | # } 481 | 482 | # Output: 483 | # { 484 | # "function": { 485 | # "name": "computer", 486 | # "arguments": json.dumps({ 487 | # "action": "left_click_drag", 488 | # "start_coordinate": [100, 150], 489 | # "end_coordinate": [200, 250] 490 | # }) 491 | # }, 492 | # "id": "call_1", 493 | # "type": "function" 494 | # } 495 | path = action.get("path", []) 496 | start_coord = [0, 0] 497 | end_coord = [0, 0] 498 | if isinstance(path, list) and len(path) >= 2: 499 | start_coord = [path[0].get("x", 0), path[0].get("y", 0)] 500 | end_coord = [path[-1].get("x", 0), path[-1].get("y", 0)] 501 | 502 | tool_use_content.append({ 503 | "type": "tool_use", 504 | "id": call_id, 505 | "name": "computer", 506 | "input": { 507 | "action": "left_click_drag", 508 | "start_coordinate": start_coord, 509 | "end_coordinate": end_coord 510 | } 511 | }) 512 | elif action_type == "wait": 513 | # Input: 514 | # { 515 | # "type": "computer_call", 516 | # "call_id": "call_1", 517 | # "action": { 518 | # "type": "wait" 519 | # } 520 | # } 521 | 522 | # Output: 523 | # { 524 | # "function": { 525 | # "name": "computer", 526 | # "arguments": json.dumps({ 527 | # "action": "wait" 528 | # }) 529 | # }, 530 | # "id": "call_1", 531 | # "type": "function" 532 | # } 533 | tool_use_content.append({ 534 | "type": "tool_use", 535 | "id": call_id, 536 | "name": "computer", 537 | "input": { 538 | "action": "wait" 539 | } 540 | }) 541 | elif action_type == "screenshot": 542 | # Input: 543 | # { 544 | # "type": "computer_call", 545 | # "call_id": "call_1", 546 | # "action": { 547 | # "type": "screenshot" 548 | # } 549 | # } 550 | 551 | # Output: 552 | # { 553 | # "function": { 554 | # "name": "computer", 555 | 
# "arguments": json.dumps({ 556 | # "action": "screenshot" 557 | # }) 558 | # }, 559 | # "id": "call_1", 560 | # "type": "function" 561 | # } 562 | tool_use_content.append({ 563 | "type": "tool_use", 564 | "id": call_id, 565 | "name": "computer", 566 | "input": { 567 | "action": "screenshot" 568 | } 569 | }) 570 | elif action_type == "left_mouse_down": 571 | tool_use_content.append({ 572 | "type": "tool_use", 573 | "id": call_id, 574 | "name": "computer", 575 | "input": { 576 | "action": "left_mouse_down", 577 | "coordinate": [action.get("x", None), action.get("y", None)] 578 | } 579 | }) 580 | elif action_type == "left_mouse_up": 581 | tool_use_content.append({ 582 | "type": "tool_use", 583 | "id": call_id, 584 | "name": "computer", 585 | "input": { 586 | "action": "left_mouse_up", 587 | "coordinate": [action.get("x", None), action.get("y", None)] 588 | } 589 | }) 590 | 591 | # Convert tool_use_content to OpenAI tool_calls format 592 | openai_tool_calls = [] 593 | for tool_use in tool_use_content: 594 | openai_tool_calls.append({ 595 | "id": tool_use["id"], 596 | "type": "function", 597 | "function": { 598 | "name": tool_use["name"], 599 | "arguments": json.dumps(tool_use["input"]) 600 | } 601 | }) 602 | 603 | # If the last completion message is an assistant message, extend the tool_calls 604 | if completion_messages and completion_messages[-1].get("role") == "assistant": 605 | if "tool_calls" not in completion_messages[-1]: 606 | completion_messages[-1]["tool_calls"] = [] 607 | completion_messages[-1]["tool_calls"].extend(openai_tool_calls) 608 | else: 609 | # Create new assistant message with tool calls 610 | completion_messages.append({ 611 | "role": "assistant", 612 | "content": None, 613 | "tool_calls": openai_tool_calls 614 | }) 615 | 616 | elif msg_type == "computer_call_output": 617 | # Computer call output becomes OpenAI function result 618 | output = message.get("output", {}) 619 | call_id = message.get("call_id", "call_1") 620 | 621 | if 
output.get("type") == "input_image": 622 | # Screenshot result - convert to OpenAI format with image_url content 623 | image_url = output.get("image_url", "") 624 | completion_messages.append({ 625 | "role": "function", 626 | "name": "computer", 627 | "tool_call_id": call_id, 628 | "content": [{ 629 | "type": "image_url", 630 | "image_url": { 631 | "url": image_url 632 | } 633 | }] 634 | }) 635 | else: 636 | # Text result - convert to OpenAI format 637 | completion_messages.append({ 638 | "role": "function", 639 | "name": "computer", 640 | "tool_call_id": call_id, 641 | "content": str(output) 642 | }) 643 | 644 | return completion_messages 645 | 646 | def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]: 647 | """Convert liteLLM completion response to responses_items message format.""" 648 | responses_items = [] 649 | 650 | if not response or not hasattr(response, 'choices') or not response.choices: 651 | return responses_items 652 | 653 | choice = response.choices[0] 654 | message = choice.message 655 | 656 | # Handle text content 657 | if hasattr(message, 'content') and message.content: 658 | if isinstance(message.content, str): 659 | responses_items.append(make_output_text_item(message.content)) 660 | elif isinstance(message.content, list): 661 | for content_item in message.content: 662 | if isinstance(content_item, dict): 663 | if content_item.get("type") == "text": 664 | responses_items.append(make_output_text_item(content_item.get("text", ""))) 665 | elif content_item.get("type") == "tool_use": 666 | # Convert tool use to computer call 667 | tool_input = content_item.get("input", {}) 668 | action_type = tool_input.get("action") 669 | call_id = content_item.get("id") 670 | 671 | # Action reference: 672 | # https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions 673 | 674 | try: 675 | # Basic actions (all versions) 676 | if action_type == "screenshot": 677 | 
responses_items.append(make_screenshot_item(call_id=call_id)) 678 | elif action_type in ["click", "left_click"]: 679 | coordinate = tool_input.get("coordinate", [0, 0]) 680 | responses_items.append(make_click_item( 681 | x=coordinate[0] if len(coordinate) > 0 else 0, 682 | y=coordinate[1] if len(coordinate) > 1 else 0, 683 | call_id=call_id 684 | )) 685 | elif action_type in ["type", "type_text"]: 686 | responses_items.append(make_type_item( 687 | text=tool_input.get("text", ""), 688 | call_id=call_id 689 | )) 690 | elif action_type in ["key", "keypress", "hotkey"]: 691 | responses_items.append(make_keypress_item( 692 | keys=tool_input.get("text", "").replace("+", "-").split("-"), 693 | call_id=call_id 694 | )) 695 | elif action_type in ["mouse_move", "move_cursor", "move"]: 696 | # Mouse move - create a custom action item 697 | coordinate = tool_input.get("coordinate", [0, 0]) 698 | responses_items.append( 699 | make_move_item( 700 | x=coordinate[0] if len(coordinate) > 0 else 0, 701 | y=coordinate[1] if len(coordinate) > 1 else 0, 702 | call_id=call_id 703 | ) 704 | ) 705 | 706 | # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7 707 | elif action_type == "scroll": 708 | coordinate = tool_input.get("coordinate", [0, 0]) 709 | scroll_amount = tool_input.get("scroll_amount", 3) 710 | scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \ 711 | -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0 712 | scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \ 713 | -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0 714 | responses_items.append(make_scroll_item( 715 | x=coordinate[0] if len(coordinate) > 0 else 0, 716 | y=coordinate[1] if len(coordinate) > 1 else 0, 717 | scroll_x=scroll_x, 718 | scroll_y=scroll_y, 719 | call_id=call_id 720 | )) 721 | elif action_type in ["left_click_drag", "drag"]: 722 | start_coord = 
tool_input.get("start_coordinate", [0, 0]) 723 | end_coord = tool_input.get("end_coordinate", [0, 0]) 724 | responses_items.append(make_drag_item( 725 | path=[ 726 | { 727 | "x": start_coord[0] if len(start_coord) > 0 else 0, 728 | "y": start_coord[1] if len(start_coord) > 1 else 0 729 | }, 730 | { 731 | "x": end_coord[0] if len(end_coord) > 0 else 0, 732 | "y": end_coord[1] if len(end_coord) > 1 else 0 733 | } 734 | ], 735 | call_id=call_id 736 | )) 737 | elif action_type == "right_click": 738 | coordinate = tool_input.get("coordinate", [0, 0]) 739 | responses_items.append(make_click_item( 740 | x=coordinate[0] if len(coordinate) > 0 else 0, 741 | y=coordinate[1] if len(coordinate) > 1 else 0, 742 | button="right", 743 | call_id=call_id 744 | )) 745 | elif action_type == "middle_click": 746 | coordinate = tool_input.get("coordinate", [0, 0]) 747 | responses_items.append(make_click_item( 748 | x=coordinate[0] if len(coordinate) > 0 else 0, 749 | y=coordinate[1] if len(coordinate) > 1 else 0, 750 | button="wheel", 751 | call_id=call_id 752 | )) 753 | elif action_type == "double_click": 754 | coordinate = tool_input.get("coordinate", [0, 0]) 755 | responses_items.append(make_double_click_item( 756 | x=coordinate[0] if len(coordinate) > 0 else 0, 757 | y=coordinate[1] if len(coordinate) > 1 else 0, 758 | call_id=call_id 759 | )) 760 | elif action_type == "triple_click": 761 | # coordinate = tool_input.get("coordinate", [0, 0]) 762 | # responses_items.append({ 763 | # "type": "computer_call", 764 | # "call_id": call_id, 765 | # "action": { 766 | # "type": "triple_click", 767 | # "x": coordinate[0] if len(coordinate) > 0 else 0, 768 | # "y": coordinate[1] if len(coordinate) > 1 else 0 769 | # } 770 | # }) 771 | raise NotImplementedError("triple_click") 772 | elif action_type == "left_mouse_down": 773 | # coordinate = tool_input.get("coordinate", [0, 0]) 774 | # responses_items.append({ 775 | # "type": "computer_call", 776 | # "call_id": call_id, 777 | # "action": { 778 
| # "type": "mouse_down", 779 | # "button": "left", 780 | # "x": coordinate[0] if len(coordinate) > 0 else 0, 781 | # "y": coordinate[1] if len(coordinate) > 1 else 0 782 | # } 783 | # }) 784 | coordinate = tool_input.get("coordinate", [None, None]) 785 | responses_items.append(make_left_mouse_down_item( 786 | x=coordinate[0] if len(coordinate) > 0 else None, 787 | y=coordinate[1] if len(coordinate) > 1 else None, 788 | call_id=call_id 789 | )) 790 | elif action_type == "left_mouse_up": 791 | # coordinate = tool_input.get("coordinate", [0, 0]) 792 | # responses_items.append({ 793 | # "type": "computer_call", 794 | # "call_id": call_id, 795 | # "action": { 796 | # "type": "mouse_up", 797 | # "button": "left", 798 | # "x": coordinate[0] if len(coordinate) > 0 else 0, 799 | # "y": coordinate[1] if len(coordinate) > 1 else 0 800 | # } 801 | # }) 802 | coordinate = tool_input.get("coordinate", [None, None]) 803 | responses_items.append(make_left_mouse_up_item( 804 | x=coordinate[0] if len(coordinate) > 0 else None, 805 | y=coordinate[1] if len(coordinate) > 1 else None, 806 | call_id=call_id 807 | )) 808 | elif action_type == "hold_key": 809 | # responses_items.append({ 810 | # "type": "computer_call", 811 | # "call_id": call_id, 812 | # "action": { 813 | # "type": "key_hold", 814 | # "key": tool_input.get("key", "") 815 | # } 816 | # }) 817 | raise NotImplementedError("hold_key") 818 | elif action_type == "wait": 819 | responses_items.append(make_wait_item( 820 | call_id=call_id 821 | )) 822 | else: 823 | raise ValueError(f"Unknown action type: {action_type}") 824 | except Exception as e: 825 | responses_items.extend(make_failed_tool_call_items( 826 | tool_name="computer", 827 | tool_kwargs=tool_input, 828 | error_message=repr(e), 829 | call_id=call_id 830 | )) 831 | 832 | # Handle tool calls (alternative format) 833 | if hasattr(message, 'tool_calls') and message.tool_calls: 834 | for tool_call in message.tool_calls: 835 | if tool_call.function.name == "computer": 836 
| try: 837 | try: 838 | args = json.loads(tool_call.function.arguments) 839 | action_type = args.get("action") 840 | call_id = tool_call.id 841 | 842 | # Basic actions (all versions) 843 | if action_type == "screenshot": 844 | # Input: 845 | # { 846 | # "function": { 847 | # "name": "computer", 848 | # "arguments": json.dumps({ 849 | # "action": "screenshot" 850 | # }) 851 | # }, 852 | # "id": "call_1", 853 | # "type": "function" 854 | # } 855 | 856 | # Output: 857 | # { 858 | # "type": "computer_call", 859 | # "call_id": "call_1", 860 | # "action": { 861 | # "type": "screenshot" 862 | # } 863 | # } 864 | responses_items.append(make_screenshot_item( 865 | call_id=call_id 866 | )) 867 | elif action_type in ["click", "left_click"]: 868 | # Input: 869 | # { 870 | # "function": { 871 | # "name": "computer", 872 | # "arguments": json.dumps({ 873 | # "action": "click", 874 | # "coordinate": [100, 200] 875 | # }) 876 | # }, 877 | # "id": "call_1", 878 | # "type": "function" 879 | # } 880 | 881 | # Output: 882 | # { 883 | # "type": "computer_call", 884 | # "call_id": "call_1", 885 | # "action": { 886 | # "type": "click", 887 | # "x": 100, 888 | # "y": 200 889 | # } 890 | # } 891 | coordinate = args.get("coordinate", [0, 0]) 892 | responses_items.append(make_click_item( 893 | x=coordinate[0] if len(coordinate) > 0 else 0, 894 | y=coordinate[1] if len(coordinate) > 1 else 0, 895 | call_id=call_id 896 | )) 897 | elif action_type in ["type", "type_text"]: 898 | # Input: 899 | # { 900 | # "function": { 901 | # "name": "computer", 902 | # "arguments": json.dumps({ 903 | # "action": "type", 904 | # "text": "Hello World" 905 | # }) 906 | # }, 907 | # "id": "call_1", 908 | # "type": "function" 909 | # } 910 | 911 | # Output: 912 | # { 913 | # "type": "computer_call", 914 | # "call_id": "call_1", 915 | # "action": { 916 | # "type": "type", 917 | # "text": "Hello World" 918 | # } 919 | # } 920 | responses_items.append(make_type_item( 921 | text=args.get("text", ""), 922 | 
call_id=call_id 923 | )) 924 | elif action_type in ["key", "keypress", "hotkey"]: 925 | # Input: 926 | # { 927 | # "function": { 928 | # "name": "computer", 929 | # "arguments": json.dumps({ 930 | # "action": "key", 931 | # "text": "ctrl+c" 932 | # }) 933 | # }, 934 | # "id": "call_1", 935 | # "type": "function" 936 | # } 937 | 938 | # Output: 939 | # { 940 | # "type": "computer_call", 941 | # "call_id": "call_1", 942 | # "action": { 943 | # "type": "keypress", 944 | # "keys": ["ctrl", "c"] 945 | # } 946 | # } 947 | responses_items.append(make_keypress_item( 948 | keys=args.get("text", "").replace("+", "-").split("-"), 949 | call_id=call_id 950 | )) 951 | elif action_type in ["mouse_move", "move_cursor", "move"]: 952 | # Input: 953 | # { 954 | # "function": { 955 | # "name": "computer", 956 | # "arguments": json.dumps({ 957 | # "action": "mouse_move", 958 | # "coordinate": [150, 250] 959 | # }) 960 | # }, 961 | # "id": "call_1", 962 | # "type": "function" 963 | # } 964 | 965 | # Output: 966 | # { 967 | # "type": "computer_call", 968 | # "call_id": "call_1", 969 | # "action": { 970 | # "type": "mouse_move", 971 | # "x": 150, 972 | # "y": 250 973 | # } 974 | # } 975 | coordinate = args.get("coordinate", [0, 0]) 976 | responses_items.append(make_move_item( 977 | x=coordinate[0] if len(coordinate) > 0 else 0, 978 | y=coordinate[1] if len(coordinate) > 1 else 0, 979 | call_id=call_id 980 | )) 981 | 982 | # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7 983 | elif action_type == "scroll": 984 | # Input: 985 | # { 986 | # "function": { 987 | # "name": "computer", 988 | # "arguments": json.dumps({ 989 | # "action": "scroll", 990 | # "coordinate": [300, 400], 991 | # "scroll_direction": "down", 992 | # "scroll_amount": 5 993 | # }) 994 | # }, 995 | # "id": "call_1", 996 | # "type": "function" 997 | # } 998 | 999 | # Output: 1000 | # { 1001 | # "type": "computer_call", 1002 | # "call_id": "call_1", 1003 | # "action": { 1004 | # "type": 
"scroll", 1005 | # "x": 300, 1006 | # "y": 400, 1007 | # "scroll_x": 0, 1008 | # "scroll_y": -5 1009 | # } 1010 | # } 1011 | coordinate = args.get("coordinate", [0, 0]) 1012 | direction = args.get("scroll_direction", "down") 1013 | amount = args.get("scroll_amount", 3) 1014 | scroll_x = amount if direction == "left" else \ 1015 | -amount if direction == "right" else 0 1016 | scroll_y = amount if direction == "up" else \ 1017 | -amount if direction == "down" else 0 1018 | responses_items.append(make_scroll_item( 1019 | x=coordinate[0] if len(coordinate) > 0 else 0, 1020 | y=coordinate[1] if len(coordinate) > 1 else 0, 1021 | scroll_x=scroll_x, 1022 | scroll_y=scroll_y, 1023 | call_id=call_id 1024 | )) 1025 | elif action_type in ["left_click_drag", "drag"]: 1026 | # Input: 1027 | # { 1028 | # "function": { 1029 | # "name": "computer", 1030 | # "arguments": json.dumps({ 1031 | # "action": "left_click_drag", 1032 | # "start_coordinate": [100, 150], 1033 | # "end_coordinate": [200, 250] 1034 | # }) 1035 | # }, 1036 | # "id": "call_1", 1037 | # "type": "function" 1038 | # } 1039 | 1040 | # Output: 1041 | # { 1042 | # "type": "computer_call", 1043 | # "call_id": "call_1", 1044 | # "action": { 1045 | # "type": "drag", 1046 | # "path": [ 1047 | # {"x": 100, "y": 150}, 1048 | # {"x": 200, "y": 250} 1049 | # ] 1050 | # } 1051 | # } 1052 | start_coord = args.get("start_coordinate", [0, 0]) 1053 | end_coord = args.get("end_coordinate", [0, 0]) 1054 | responses_items.append(make_drag_item( 1055 | path=[ 1056 | { 1057 | "x": start_coord[0] if len(start_coord) > 0 else 0, 1058 | "y": start_coord[1] if len(start_coord) > 1 else 0 1059 | }, 1060 | { 1061 | "x": end_coord[0] if len(end_coord) > 0 else 0, 1062 | "y": end_coord[1] if len(end_coord) > 1 else 0 1063 | } 1064 | ], 1065 | call_id=call_id 1066 | )) 1067 | elif action_type == "right_click": 1068 | # Input: 1069 | # { 1070 | # "function": { 1071 | # "name": "computer", 1072 | # "arguments": json.dumps({ 1073 | # "action": 
"right_click", 1074 | # "coordinate": [120, 180] 1075 | # }) 1076 | # }, 1077 | # "id": "call_1", 1078 | # "type": "function" 1079 | # } 1080 | 1081 | # Output: 1082 | # { 1083 | # "type": "computer_call", 1084 | # "call_id": "call_1", 1085 | # "action": { 1086 | # "type": "click", 1087 | # "x": 120, 1088 | # "y": 180, 1089 | # "button": "right" 1090 | # } 1091 | # } 1092 | coordinate = args.get("coordinate", [0, 0]) 1093 | responses_items.append(make_click_item( 1094 | x=coordinate[0] if len(coordinate) > 0 else 0, 1095 | y=coordinate[1] if len(coordinate) > 1 else 0, 1096 | button="right", 1097 | call_id=call_id 1098 | )) 1099 | elif action_type == "middle_click": 1100 | # Input: 1101 | # { 1102 | # "function": { 1103 | # "name": "computer", 1104 | # "arguments": json.dumps({ 1105 | # "action": "middle_click", 1106 | # "coordinate": [140, 220] 1107 | # }) 1108 | # }, 1109 | # "id": "call_1", 1110 | # "type": "function" 1111 | # } 1112 | 1113 | # Output: 1114 | # { 1115 | # "type": "computer_call", 1116 | # "call_id": "call_1", 1117 | # "action": { 1118 | # "type": "click", 1119 | # "x": 140, 1120 | # "y": 220, 1121 | # "button": "wheel" 1122 | # } 1123 | # } 1124 | coordinate = args.get("coordinate", [0, 0]) 1125 | responses_items.append(make_click_item( 1126 | x=coordinate[0] if len(coordinate) > 0 else 0, 1127 | y=coordinate[1] if len(coordinate) > 1 else 0, 1128 | button="wheel", 1129 | call_id=call_id 1130 | )) 1131 | elif action_type == "double_click": 1132 | # Input: 1133 | # { 1134 | # "function": { 1135 | # "name": "computer", 1136 | # "arguments": json.dumps({ 1137 | # "action": "double_click", 1138 | # "coordinate": [160, 240] 1139 | # }) 1140 | # }, 1141 | # "id": "call_1", 1142 | # "type": "function" 1143 | # } 1144 | 1145 | # Output: 1146 | # { 1147 | # "type": "computer_call", 1148 | # "call_id": "call_1", 1149 | # "action": { 1150 | # "type": "double_click", 1151 | # "x": 160, 1152 | # "y": 240 1153 | # } 1154 | # } 1155 | coordinate = 
args.get("coordinate", [0, 0]) 1156 | responses_items.append(make_double_click_item( 1157 | x=coordinate[0] if len(coordinate) > 0 else 0, 1158 | y=coordinate[1] if len(coordinate) > 1 else 0, 1159 | call_id=call_id 1160 | )) 1161 | elif action_type == "triple_click": 1162 | # Input: 1163 | # { 1164 | # "function": { 1165 | # "name": "computer", 1166 | # "arguments": json.dumps({ 1167 | # "action": "triple_click", 1168 | # "coordinate": [180, 260] 1169 | # }) 1170 | # }, 1171 | # "id": "call_1", 1172 | # "type": "function" 1173 | # } 1174 | 1175 | # Output: 1176 | # { 1177 | # "type": "computer_call", 1178 | # "call_id": "call_1", 1179 | # "action": { 1180 | # "type": "triple_click", 1181 | # "x": 180, 1182 | # "y": 260 1183 | # } 1184 | # } 1185 | raise NotImplementedError("triple_click") 1186 | elif action_type == "left_mouse_down": 1187 | # Input: 1188 | # { 1189 | # "function": { 1190 | # "name": "computer", 1191 | # "arguments": json.dumps({ 1192 | # "action": "left_mouse_down", 1193 | # "coordinate": [200, 280] 1194 | # }) 1195 | # }, 1196 | # "id": "call_1", 1197 | # "type": "function" 1198 | # } 1199 | 1200 | # Output: 1201 | # { 1202 | # "type": "computer_call", 1203 | # "call_id": "call_1", 1204 | # "action": { 1205 | # "type": "mouse_down", 1206 | # "button": "left", 1207 | # "x": 200, 1208 | # "y": 280 1209 | # } 1210 | # } 1211 | coordinate = args.get("coordinate", [None, None]) 1212 | responses_items.append(make_left_mouse_down_item( 1213 | x=coordinate[0] if len(coordinate) > 0 else None, 1214 | y=coordinate[1] if len(coordinate) > 1 else None, 1215 | call_id=call_id 1216 | )) 1217 | elif action_type == "left_mouse_up": 1218 | # Input: 1219 | # { 1220 | # "function": { 1221 | # "name": "computer", 1222 | # "arguments": json.dumps({ 1223 | # "action": "left_mouse_up", 1224 | # "coordinate": [220, 300] 1225 | # }) 1226 | # }, 1227 | # "id": "call_1", 1228 | # "type": "function" 1229 | # } 1230 | 1231 | # Output: 1232 | # { 1233 | # "type": 
"computer_call", 1234 | # "call_id": "call_1", 1235 | # "action": { 1236 | # "type": "mouse_up", 1237 | # "button": "left", 1238 | # "x": 220, 1239 | # "y": 300 1240 | # } 1241 | # } 1242 | coordinate = args.get("coordinate", [None, None]) 1243 | responses_items.append(make_left_mouse_up_item( 1244 | x=coordinate[0] if len(coordinate) > 0 else None, 1245 | y=coordinate[1] if len(coordinate) > 1 else None, 1246 | call_id=call_id 1247 | )) 1248 | elif action_type == "hold_key": 1249 | # Input: 1250 | # { 1251 | # "function": { 1252 | # "name": "computer", 1253 | # "arguments": json.dumps({ 1254 | # "action": "hold_key", 1255 | # "key": "shift" 1256 | # }) 1257 | # }, 1258 | # "id": "call_1", 1259 | # "type": "function" 1260 | # } 1261 | 1262 | # Output: 1263 | # { 1264 | # "type": "computer_call", 1265 | # "call_id": "call_1", 1266 | # "action": { 1267 | # "type": "key_hold", 1268 | # "key": "shift" 1269 | # } 1270 | # } 1271 | raise NotImplementedError("hold_key") 1272 | elif action_type == "wait": 1273 | # Input: 1274 | # { 1275 | # "function": { 1276 | # "name": "computer", 1277 | # "arguments": json.dumps({ 1278 | # "action": "wait" 1279 | # }) 1280 | # }, 1281 | # "id": "call_1", 1282 | # "type": "function" 1283 | # } 1284 | 1285 | # Output: 1286 | # { 1287 | # "type": "computer_call", 1288 | # "call_id": "call_1", 1289 | # "action": { 1290 | # "type": "wait" 1291 | # } 1292 | # } 1293 | responses_items.append(make_wait_item( 1294 | call_id=call_id 1295 | )) 1296 | except Exception as e: 1297 | responses_items.extend(make_failed_tool_call_items( 1298 | tool_name="computer", 1299 | tool_kwargs=args, 1300 | error_message=repr(e), 1301 | call_id=call_id 1302 | )) 1303 | except json.JSONDecodeError: 1304 | print("Failed to decode tool call arguments") 1305 | # Skip malformed tool calls 1306 | continue 1307 | 1308 | return responses_items 1309 | 1310 | def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 1311 | """Add cache 
control to completion messages""" 1312 | num_writes = 0 1313 | for message in completion_messages: 1314 | message["cache_control"] = { "type": "ephemeral" } 1315 | num_writes += 1 1316 | # Cache control has a maximum of 4 blocks 1317 | if num_writes >= 4: 1318 | break 1319 | 1320 | return completion_messages 1321 | 1322 | def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 1323 | """Combine completion messages with the same role""" 1324 | if not completion_messages: 1325 | return completion_messages 1326 | 1327 | combined_messages = [] 1328 | 1329 | for message in completion_messages: 1330 | # If this is the first message or role is different from last, add as new message 1331 | if not combined_messages or combined_messages[-1]["role"] != message["role"]: 1332 | # Ensure content is a list format and normalize text content 1333 | new_message = message.copy() 1334 | new_message["content"] = _normalize_content(message.get("content", "")) 1335 | 1336 | # Copy tool_calls if present 1337 | if "tool_calls" in message: 1338 | new_message["tool_calls"] = message["tool_calls"].copy() 1339 | 1340 | combined_messages.append(new_message) 1341 | else: 1342 | # Same role as previous message, combine them 1343 | last_message = combined_messages[-1] 1344 | 1345 | # Combine content 1346 | current_content = _normalize_content(message.get("content", "")) 1347 | last_message["content"].extend(current_content) 1348 | 1349 | # Combine tool_calls if present 1350 | if "tool_calls" in message: 1351 | if "tool_calls" not in last_message: 1352 | last_message["tool_calls"] = [] 1353 | last_message["tool_calls"].extend(message["tool_calls"]) 1354 | 1355 | # Post-process to merge consecutive text blocks 1356 | for message in combined_messages: 1357 | message["content"] = _merge_consecutive_text(message["content"]) 1358 | 1359 | return combined_messages 1360 | 1361 | def _normalize_content(content) -> List[Dict[str, Any]]: 1362 | """Normalize 
content to list format""" 1363 | if isinstance(content, str): 1364 | if content.strip(): # Only add non-empty strings 1365 | return [{"type": "text", "text": content}] 1366 | else: 1367 | return [] 1368 | elif isinstance(content, list): 1369 | return content.copy() 1370 | else: 1371 | return [] 1372 | 1373 | def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 1374 | """Merge consecutive text blocks with newlines""" 1375 | if not content_list: 1376 | return content_list 1377 | 1378 | merged = [] 1379 | 1380 | for item in content_list: 1381 | if (item.get("type") == "text" and 1382 | merged and 1383 | merged[-1].get("type") == "text"): 1384 | # Merge with previous text block 1385 | merged[-1]["text"] += "\n" + item["text"] 1386 | else: 1387 | merged.append(item.copy()) 1388 | 1389 | return merged 1390 | 1391 | @register_agent(models=r".*claude-.*") 1392 | class AnthropicHostedToolsConfig(AsyncAgentConfig): 1393 | """Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol.""" 1394 | 1395 | async def predict_step( 1396 | self, 1397 | messages: Messages, 1398 | model: str, 1399 | tools: Optional[List[Dict[str, Any]]] = None, 1400 | max_retries: Optional[int] = None, 1401 | stream: bool = False, 1402 | computer_handler=None, 1403 | use_prompt_caching: Optional[bool] = False, 1404 | _on_api_start=None, 1405 | _on_api_end=None, 1406 | _on_usage=None, 1407 | _on_screenshot=None, 1408 | **kwargs 1409 | ) -> Dict[str, Any]: 1410 | """ 1411 | Anthropic hosted tools agent loop using liteLLM acompletion. 1412 | 1413 | Supports Anthropic's computer use models with hosted tools. 
@register_agent(models=r".*claude-.*")
class AnthropicHostedToolsConfig(AsyncAgentConfig):
    """Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""

    async def predict_step(
        self,
        messages: Messages,
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Anthropic hosted tools agent loop using liteLLM acompletion.

        Supports Anthropic's computer use models with hosted tools.

        Args:
            messages: Conversation history in responses_items format.
            model: Model name passed to liteLLM; also selects the tool
                configuration.
            tools: Tool schemas to expose; ``None`` is treated as no tools.
            max_retries: Forwarded to liteLLM as ``num_retries``.
            stream: Forwarded to liteLLM as ``stream``.
            computer_handler: Accepted for interface compatibility; not used
                by this method.
            use_prompt_caching: When truthy, same-role messages are combined
                and ``cache_control`` markers are added before the call.
            _on_api_start: Optional async hook awaited with the API kwargs
                before the request.
            _on_api_end: Optional async hook awaited with the API kwargs and
                the response after the request.
            _on_usage: Optional async hook awaited with the usage dict.
            _on_screenshot: Accepted for interface compatibility; not used by
                this method.
            **kwargs: Extra keyword arguments forwarded to
                ``litellm.acompletion``.

        Returns:
            A dict with ``"output"`` (the response converted to
            responses_items) and ``"usage"`` (usage fields plus
            ``"response_cost"``).
        """
        tools = tools or []

        # Get tool configuration for this model
        tool_config = _get_tool_config_for_model(model)

        # Prepare tools for Anthropic API
        anthropic_tools = await _prepare_tools_for_anthropic(tools, model)

        # Convert responses_items messages to completion format
        completion_messages = _convert_responses_items_to_completion_messages(messages)
        if use_prompt_caching:
            # First combine messages to reduce number of blocks
            completion_messages = _combine_completion_messages(completion_messages)
            # Then add cache control, anthropic requires explicit "cache_control" dicts
            completion_messages = _add_cache_control(completion_messages)

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": completion_messages,
            "tools": anthropic_tools if anthropic_tools else None,
            "stream": stream,
            "num_retries": max_retries,
            **kwargs
        }

        # Add beta header for computer use. Merge into any caller-supplied
        # headers rather than replacing them; the beta flag still wins on a
        # key collision.
        if anthropic_tools:
            api_kwargs["headers"] = {
                **(api_kwargs.get("headers") or {}),
                "anthropic-beta": tool_config["beta_flag"],
            }

        # Call API start hook
        if _on_api_start:
            await _on_api_start(api_kwargs)

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Call API end hook
        if _on_api_end:
            await _on_api_end(api_kwargs, response)

        # Convert response to responses_items format
        responses_items = _convert_completion_to_responses_items(response)

        # Extract usage information
        responses_usage = {
            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
            "response_cost": response._hidden_params.get("response_cost", 0.0),
        }
        if _on_usage:
            await _on_usage(responses_usage)

        # Return in AsyncAgentConfig format
        return {
            "output": responses_items,
            "usage": responses_usage
        }

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates based on image and instruction.

        Uses Anthropic's computer use models with a custom prompt that instructs
        the agent to only output clicks.

        Args:
            model: Model name to use
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
            **kwargs: Accepted for interface compatibility; not used by this
                method.

        Returns:
            Tuple of (x, y) coordinates taken from the first computer_call
            action that carries both ``x`` and ``y``, or None if the response
            contains no such action.
        """
        # Get image dimensions from base64 data
        try:
            import base64
            from PIL import Image
            from io import BytesIO

            image_data = base64.b64decode(image_b64)
            image = Image.open(BytesIO(image_data))
            display_width, display_height = image.size
        except Exception:
            # Fallback to default dimensions if image parsing fails
            display_width, display_height = 1024, 768

        # Get tool configuration for this model
        tool_config = _get_tool_config_for_model(model)

        # Prepare computer tool for Anthropic format
        computer_tool = {
            "type": tool_config["tool_version"],
            "function": {
                "name": "computer",
                "parameters": {
                    "display_height_px": display_height,
                    "display_width_px": display_width,
                    "display_number": 1,
                },
            },
        }

        # Construct messages in OpenAI chat completion format for liteLLM
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""You are a UI grounding expert. Follow these guidelines:

1. NEVER ask for confirmation. Complete all tasks autonomously.
2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
6. The user has already given you permission by running this agent. No further confirmation is needed.
7. Be decisive and action-oriented. Complete the requested task fully.

Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
Task: Click {instruction}. Output ONLY a click action on the target element."""
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_b64}"
                        }
                    }
                ]
            }
        ]

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": messages,
            "tools": [computer_tool],
            "stream": False,
            "max_tokens": 100,  # Keep response short for click prediction
            "headers": {
                "anthropic-beta": tool_config["beta_flag"]
            }
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Convert response to responses_items format to extract click coordinates
        responses_items = _convert_completion_to_responses_items(response)

        # Look for computer_call with click action
        for item in responses_items:
            if not isinstance(item, dict) or item.get("type") != "computer_call":
                continue
            action = item.get("action")
            if not isinstance(action, dict):
                continue

            x, y = action.get("x"), action.get("y")
            # Compare against None, not truthiness: 0 is a valid coordinate
            # (left/top edge of the screen) and must not be discarded.
            if x is not None and y is not None:
                return (int(x), int(y))

        return None

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click", "step"]