This is page 10 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── windows-sandbox.md ├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── 
Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ 
│ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift │ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ 
├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ ├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py 
│ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/models/internvl.py: -------------------------------------------------------------------------------- ```python 1 | from __future__ import annotations 2 | from typing import List, Dict, Any, Optional 3 | 4 | # Hugging Face imports are local to 
avoid hard dependency at module import 5 | try: 6 | import torch # type: ignore 7 | from transformers import AutoModel, AutoTokenizer # type: ignore 8 | # Attempt to import InternVL's model dependencies 9 | import einops as _ # type: ignore 10 | import timm as _ # type: ignore 11 | from PIL import Image # type: ignore 12 | import torchvision.transforms as T # type: ignore 13 | from torchvision.transforms.functional import InterpolationMode # type: ignore 14 | import base64 # type: ignore 15 | from io import BytesIO # type: ignore 16 | import requests # type: ignore 17 | HF_AVAILABLE = True 18 | except Exception: 19 | HF_AVAILABLE = False 20 | 21 | 22 | class InternVLModel: 23 | """Generic Hugging Face vision-language model handler. 24 | Uses InternVL's native `model.chat()` interface with `AutoTokenizer`. 25 | Provides preprocessing to support multi-turn conversations with multiple images. 26 | """ 27 | 28 | def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: 29 | if not HF_AVAILABLE: 30 | raise ImportError( 31 | "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\"" 32 | ) 33 | self.model_name = model_name 34 | self.device = device 35 | self.model = None 36 | self.tokenizer = None 37 | self.trust_remote_code = trust_remote_code 38 | self._load() 39 | 40 | def _load(self) -> None: 41 | # Load model 42 | self.model = AutoModel.from_pretrained( 43 | self.model_name, 44 | torch_dtype=torch.bfloat16, 45 | low_cpu_mem_usage=True, 46 | use_flash_attn=True, 47 | device_map=self.device, 48 | trust_remote_code=self.trust_remote_code, 49 | ).eval() 50 | # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False) 51 | self.tokenizer = AutoTokenizer.from_pretrained( 52 | self.model_name, 53 | trust_remote_code=self.trust_remote_code, 54 | use_fast=False, 55 | ) 56 | 57 | # ---- Image preprocessing utilities adapted from InternVL docs ---- 58 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 59 | IMAGENET_STD = (0.229, 0.224, 0.225) 60 | 61 | def _build_transform(self, input_size: int) -> T.Compose: 62 | MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD 63 | transform = T.Compose([ 64 | T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), 65 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), 66 | T.ToTensor(), 67 | T.Normalize(mean=MEAN, std=STD) 68 | ]) 69 | return transform 70 | 71 | def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int): 72 | best_ratio_diff = float('inf') 73 | best_ratio = (1, 1) 74 | area = width * height 75 | for ratio in target_ratios: 76 | target_aspect_ratio = ratio[0] / ratio[1] 77 | ratio_diff = abs(aspect_ratio - target_aspect_ratio) 78 | if ratio_diff < best_ratio_diff: 79 | best_ratio_diff = ratio_diff 80 | best_ratio = ratio 81 | elif ratio_diff == best_ratio_diff: 82 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: 83 | best_ratio = ratio 84 | return best_ratio 85 | 86 | def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]: 87 | orig_width, orig_height = image.size 88 | aspect_ratio = orig_width / orig_height 89 | 90 | target_ratios = set( 91 | (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if 92 | i * j <= max_num and i * j >= min_num) 93 | target_ratios = sorted(target_ratios, key=lambda x: 
x[0] * x[1]) 94 | 95 | target_aspect_ratio = self._find_closest_aspect_ratio( 96 | aspect_ratio, target_ratios, orig_width, orig_height, image_size) 97 | 98 | target_width = image_size * target_aspect_ratio[0] 99 | target_height = image_size * target_aspect_ratio[1] 100 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1] 101 | 102 | resized_img = image.resize((target_width, target_height)) 103 | processed_images: List[Image.Image] = [] 104 | for i in range(blocks): 105 | box = ( 106 | (i % (target_width // image_size)) * image_size, 107 | (i // (target_width // image_size)) * image_size, 108 | ((i % (target_width // image_size)) + 1) * image_size, 109 | ((i // (target_width // image_size)) + 1) * image_size 110 | ) 111 | split_img = resized_img.crop(box) 112 | processed_images.append(split_img) 113 | assert len(processed_images) == blocks 114 | if use_thumbnail and len(processed_images) != 1: 115 | thumbnail_img = image.resize((image_size, image_size)) 116 | processed_images.append(thumbnail_img) 117 | return processed_images 118 | 119 | def _load_image_from_source(self, src: str) -> Image.Image: 120 | """Load PIL image from various sources: data URL, http(s), or local path.""" 121 | if src.startswith("data:image/"): 122 | # data URL base64 123 | header, b64data = src.split(",", 1) 124 | img_bytes = base64.b64decode(b64data) 125 | return Image.open(BytesIO(img_bytes)).convert('RGB') 126 | if src.startswith("http://") or src.startswith("https://"): 127 | resp = requests.get(src, timeout=10) 128 | resp.raise_for_status() 129 | return Image.open(BytesIO(resp.content)).convert('RGB') 130 | # Assume local file path 131 | return Image.open(src).convert('RGB') 132 | 133 | def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12): 134 | transform = self._build_transform(input_size=input_size) 135 | pixel_values_list = [] 136 | num_patches_list: List[int] = [] 137 | for img in images: 138 | tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num) 139 | pv = [transform(tile) for tile in tiles] 140 | pv = torch.stack(pv) 141 | num_patches_list.append(pv.shape[0]) 142 | pixel_values_list.append(pv) 143 | if not pixel_values_list: 144 | return None, [] 145 | pixel_values = torch.cat(pixel_values_list) 146 | return pixel_values, num_patches_list 147 | 148 | def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: 149 | """Generate text for the given HF-format messages. 150 | messages: [{ role, content: [{type:'text'|'image', text|image}] }] 151 | 152 | This implementation constructs InternVL-compatible inputs and uses 153 | `model.chat(tokenizer, pixel_values, question, history=...)` to avoid 154 | relying on AutoProcessor (which fails for some tokenizers). 
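        Illustrative input in that format (hypothetical values, assuming a single
        user turn with one screenshot):

            [
                {"role": "user", "content": [
                    {"type": "image", "image": "data:image/png;base64,..."},
                    {"type": "text", "text": "What is shown on the screen?"},
                ]},
            ]

        Image sources may be data URLs, http(s) URLs, or local file paths
        (handled by `_load_image_from_source`).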
155 | """ 156 | assert self.model is not None and self.tokenizer is not None 157 | 158 | # Build textual context and collect images and the final question 159 | context_lines: List[str] = [] 160 | all_images: List[Image.Image] = [] 161 | last_user_text_parts: List[str] = [] 162 | 163 | for msg in messages: 164 | role = msg.get("role", "user") 165 | content = msg.get("content", []) 166 | if isinstance(content, str): 167 | content_items = [{"type": "text", "text": content}] 168 | else: 169 | content_items = content 170 | 171 | if role == "user": 172 | # Collect text and images 173 | parts_text: List[str] = [] 174 | for item in content_items: 175 | if item.get("type") == "text": 176 | t = item.get("text", "") 177 | if t: 178 | parts_text.append(t) 179 | elif item.get("type") == "image": 180 | url = item.get("image", "") 181 | if url: 182 | try: 183 | all_images.append(self._load_image_from_source(url)) 184 | except Exception: 185 | # Ignore failed image loads but keep going 186 | pass 187 | text = "\n".join(parts_text).strip() 188 | if text: 189 | context_lines.append(f"User: {text}") 190 | # Track last user text separately for question 191 | last_user_text_parts = parts_text or last_user_text_parts 192 | elif role == "assistant": 193 | # Only keep text content for history 194 | parts_text = [item.get("text", "") for item in content_items if item.get("type") == "text"] 195 | text = "\n".join(parts_text).strip() 196 | if text: 197 | context_lines.append(f"Assistant: {text}") 198 | 199 | # Prepare pixel values for all collected images (across turns) 200 | pixel_values = None 201 | num_patches_list: List[int] = [] 202 | if all_images: 203 | pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12) 204 | if pixel_values is not None: 205 | # Convert dtype/device as in docs 206 | pixel_values = pixel_values.to(torch.bfloat16) 207 | # Chat API expects tensors on CUDA when model is on CUDA 208 | try: 209 | pixel_values = pixel_values.to(self.model.device) 210 | except Exception: 211 | pass 212 | 213 | # Build question with any prior context and numbered image placeholders 214 | if all_images: 215 | # Separate images layout: Image-1: <image> ... then question text 216 | prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))] 217 | prefix = "\n".join(prefix_lines) + "\n" 218 | else: 219 | prefix = "" 220 | 221 | last_user_text = "\n".join(last_user_text_parts).strip() 222 | # Combine prior text-only turns as context to emulate multi-turn 223 | context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else "" 224 | base_question = last_user_text if last_user_text else "Describe the image(s) in detail." 
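        # Illustrative (hypothetical) layout of the final `question` string for a
        # two-image, two-turn conversation: prior text turns, then numbered image
        # placeholders, then the last user message:
        #   User: what is on the screen?
        #   Assistant: a settings window
        #   Image-1: <image>
        #   Image-2: <image>
        #   click the wifi toggle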
225 | if context_text: 226 | question = (context_text + "\n" + prefix + base_question).strip() 227 | else: 228 | question = (prefix + base_question).strip() 229 | 230 | # Generation config 231 | generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False) 232 | 233 | # Call InternVL chat 234 | try: 235 | if pixel_values is None: 236 | # Pure-text conversation (embed prior turns in question) 237 | response = self.model.chat(self.tokenizer, None, question, generation_config) 238 | else: 239 | # Multi-image: pass num_patches_list if >1 image 240 | if len(num_patches_list) > 1: 241 | response = self.model.chat( 242 | self.tokenizer, 243 | pixel_values, 244 | question, 245 | generation_config, 246 | num_patches_list=num_patches_list, 247 | ) 248 | else: 249 | response = self.model.chat(self.tokenizer, pixel_values, question, generation_config) 250 | except Exception as e: 251 | # Fallback: return empty string to avoid crashing the adapter 252 | return "" 253 | 254 | return response or "" 255 | ``` -------------------------------------------------------------------------------- /scripts/playground.sh: -------------------------------------------------------------------------------- ```bash 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "🚀 Launching Cua Computer-Use Agent UI..." 6 | 7 | # Save the original working directory 8 | ORIGINAL_DIR="$(pwd)" 9 | 10 | # Directories used by the script 11 | DEMO_DIR="$HOME/.cua-demo" 12 | VENV_DIR="$DEMO_DIR/venv" 13 | 14 | # Function to clean up on exit 15 | cleanup() { 16 | cd ~ 17 | rm -rf "$TMP_DIR" 2>/dev/null || true 18 | } 19 | 20 | # Create a temporary directory for our work 21 | TMP_DIR=$(mktemp -d) 22 | cd "$TMP_DIR" 23 | trap cleanup EXIT 24 | 25 | # Ask user to choose between local macOS VMs or Cua Cloud Sandbox 26 | echo "" 27 | echo "Choose your Cua setup:" 28 | echo "1) ☁️ Cua Cloud Sandbox (works on any system)" 29 | echo "2) 🖥️ Local macOS VMs (requires Apple Silicon Mac + macOS 15+)" 30 | echo "" 31 | read -p "Enter your choice (1 or 2): " CHOICE 32 | 33 | if [[ "$CHOICE" == "1" ]]; then 34 | # Cua Cloud Sandbox setup 35 | echo "" 36 | echo "☁️ Setting up Cua Cloud Sandbox..." 37 | echo "" 38 | 39 | # Check if existing .env.local already has CUA_API_KEY (check current dir and demo dir) 40 | # Look for .env.local in the original working directory (before cd to temp dir) 41 | CURRENT_ENV_FILE="$ORIGINAL_DIR/.env.local" 42 | DEMO_ENV_FILE="$DEMO_DIR/.env.local" 43 | 44 | CUA_API_KEY="" 45 | 46 | # First check current directory 47 | if [[ -f "$CURRENT_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$CURRENT_ENV_FILE"; then 48 | EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$CURRENT_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs) 49 | if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then 50 | CUA_API_KEY="$EXISTING_CUA_KEY" 51 | fi 52 | fi 53 | 54 | # Then check demo directory if not found in current dir 55 | if [[ -z "$CUA_API_KEY" ]] && [[ -f "$DEMO_ENV_FILE" ]] && grep -q "CUA_API_KEY=" "$DEMO_ENV_FILE"; then 56 | EXISTING_CUA_KEY=$(grep "CUA_API_KEY=" "$DEMO_ENV_FILE" | cut -d'=' -f2- | tr -d '"' | tr -d "'" | xargs) 57 | if [[ -n "$EXISTING_CUA_KEY" && "$EXISTING_CUA_KEY" != "your_cua_api_key_here" && "$EXISTING_CUA_KEY" != "" ]]; then 58 | CUA_API_KEY="$EXISTING_CUA_KEY" 59 | fi 60 | fi 61 | 62 | # If no valid API key found, prompt for one 63 | if [[ -z "$CUA_API_KEY" ]]; then 64 | echo "To use Cua Cloud Sandbox, you need to:" 65 | echo "1. 
Sign up at https://trycua.com" 66 | echo "2. Create a Cloud Sandbox" 67 | echo "3. Generate an Api Key" 68 | echo "" 69 | read -p "Enter your Cua Api Key: " CUA_API_KEY 70 | 71 | if [[ -z "$CUA_API_KEY" ]]; then 72 | echo "❌ Cua Api Key is required for Cloud Sandbox." 73 | exit 1 74 | fi 75 | fi 76 | 77 | USE_CLOUD=true 78 | 79 | elif [[ "$CHOICE" == "2" ]]; then 80 | # Local macOS VM setup 81 | echo "" 82 | echo "🖥️ Setting up local macOS VMs..." 83 | 84 | # Check for Apple Silicon Mac 85 | if [[ $(uname -s) != "Darwin" || $(uname -m) != "arm64" ]]; then 86 | echo "❌ Local macOS VMs require an Apple Silicon Mac (M1/M2/M3/M4)." 87 | echo "💡 Consider using Cua Cloud Sandbox instead (option 1)." 88 | exit 1 89 | fi 90 | 91 | # Check for macOS 15 (Sequoia) or newer 92 | OSVERSION=$(sw_vers -productVersion) 93 | if [[ $(echo "$OSVERSION 15.0" | tr " " "\n" | sort -V | head -n 1) != "15.0" ]]; then 94 | echo "❌ Local macOS VMs require macOS 15 (Sequoia) or newer. You have $OSVERSION." 95 | echo "💡 Consider using Cua Cloud Sandbox instead (option 1)." 96 | exit 1 97 | fi 98 | 99 | USE_CLOUD=false 100 | 101 | else 102 | echo "❌ Invalid choice. Please run the script again and choose 1 or 2." 103 | exit 1 104 | fi 105 | 106 | # Install Lume if not already installed (only for local VMs) 107 | if [[ "$USE_CLOUD" == "false" ]]; then 108 | if ! command -v lume &> /dev/null; then 109 | echo "📦 Installing Lume CLI..." 110 | curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh | bash 111 | 112 | # Add lume to PATH for this session if it's not already there 113 | if ! command -v lume &> /dev/null; then 114 | export PATH="$PATH:$HOME/.local/bin" 115 | fi 116 | fi 117 | 118 | # Pull the macOS CUA image if not already present 119 | if ! lume ls | grep -q "macos-sequoia-cua"; then 120 | # Check available disk space 121 | IMAGE_SIZE_GB=30 122 | AVAILABLE_SPACE_KB=$(df -k $HOME | tail -1 | awk '{print $4}') 123 | AVAILABLE_SPACE_GB=$(($AVAILABLE_SPACE_KB / 1024 / 1024)) 124 | 125 | echo "📊 The macOS CUA image will use approximately ${IMAGE_SIZE_GB}GB of disk space." 126 | echo " You currently have ${AVAILABLE_SPACE_GB}GB available on your system." 127 | 128 | # Prompt for confirmation 129 | read -p " Continue? [y]/n: " CONTINUE 130 | CONTINUE=${CONTINUE:-y} 131 | 132 | if [[ $CONTINUE =~ ^[Yy]$ ]]; then 133 | echo "📥 Pulling macOS CUA image (this may take a while)..." 134 | lume pull macos-sequoia-cua:latest 135 | else 136 | echo "❌ Installation cancelled." 137 | exit 1 138 | fi 139 | fi 140 | fi 141 | 142 | # Create a Python virtual environment 143 | echo "🐍 Setting up Python environment..." 144 | 145 | # Try different Python commands in order of preference 146 | PYTHON_CMD="" 147 | for cmd in python3.11 python3 python; do 148 | if command -v $cmd &> /dev/null; then 149 | # Check this Python version 150 | PYTHON_VERSION=$($cmd --version 2>&1 | cut -d" " -f2) 151 | PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1) 152 | PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2) 153 | 154 | if [ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -eq 11 ]; then 155 | PYTHON_CMD=$cmd 156 | echo "✅ Found suitable Python: $cmd (version $PYTHON_VERSION)" 157 | break 158 | elif [ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -gt 11 ]; then 159 | PYTHON_CMD=$cmd 160 | PYTHON_TOO_NEW=true 161 | echo "⚠️ Found $cmd (version $PYTHON_VERSION) but only Python 3.11.x is supported." 162 | break 163 | else 164 | echo "⚠️ Found $cmd (version $PYTHON_VERSION) but it's too old, trying next..." 
165 | fi 166 | fi 167 | done 168 | 169 | # If no suitable Python was found, or if Python is too new, offer to exit or continue 170 | if [ -z "$PYTHON_CMD" ] || [ "$PYTHON_TOO_NEW" = true ]; then 171 | OS_TYPE=$(uname -s) 172 | if [ "$PYTHON_TOO_NEW" = true ]; then 173 | echo -e "\n❌ Python version $PYTHON_VERSION detected. Only Python 3.11.x is supported. Newer versions (e.g., 3.12+) are not yet supported." 174 | else 175 | if [[ "$OS_TYPE" == "Darwin" ]]; then 176 | echo -e "\n❌ python3.11 not found. To continue, we recommend running this:\n\n $ brew install [email protected]\n" 177 | elif [[ "$OS_TYPE" == "MINGW"* || "$OS_TYPE" == "CYGWIN"* || "$OS_TYPE" == "MSYS"* ]]; then 178 | echo -e "\n❌ python3.11 not found. Please install Python 3.11 from https://www.python.org/downloads/\n" 179 | else 180 | echo -e "\n❌ python3.11 not found. Please install Python 3.11 from your package manager or https://www.python.org/downloads/\n" 181 | fi 182 | fi 183 | while true; do 184 | echo "Would you like to exit so you can install Python 3.11, or continue anyway? (e = exit, c = continue): " 185 | read -n 1 -r PYTHON_CONT_CHOICE 186 | echo 187 | if [[ "$PYTHON_CONT_CHOICE" =~ ^[Ee]$ ]]; then 188 | echo "Exiting so you can install Python 3.11." 189 | exit 1 190 | elif [[ "$PYTHON_CONT_CHOICE" =~ ^[Cc]$ ]]; then 191 | echo "⚠️ Continuing without Python 3.11. Some features may not work as expected." 192 | break 193 | else 194 | echo "Please enter 'e' to exit or 'c' to continue." 195 | fi 196 | done 197 | fi 198 | 199 | # Create a virtual environment 200 | if [ ! -d "$VENV_DIR" ]; then 201 | $PYTHON_CMD -m venv "$VENV_DIR" 202 | fi 203 | 204 | # Activate the virtual environment 205 | source "$VENV_DIR/bin/activate" 206 | 207 | # Install required packages 208 | echo "📦 Updating Cua packages..." 209 | pip install -U pip setuptools wheel Cmake 210 | pip install -U cua-computer "cua-agent[all]" 211 | 212 | # Create a simple demo script 213 | mkdir -p "$DEMO_DIR" 214 | 215 | # Create .env.local file with API keys (only if it doesn't exist) 216 | if [[ ! -f "$DEMO_DIR/.env.local" ]]; then 217 | cat > "$DEMO_DIR/.env.local" << EOF 218 | # Uncomment and add your API keys here 219 | # OPENAI_API_KEY=your_openai_api_key_here 220 | # ANTHROPIC_API_KEY=your_anthropic_api_key_here 221 | CUA_API_KEY=your_cua_api_key_here 222 | EOF 223 | echo "📝 Created .env.local file with API key placeholders" 224 | else 225 | echo "📝 Found existing .env.local file - keeping your current settings" 226 | fi 227 | 228 | if [[ "$USE_CLOUD" == "true" ]]; then 229 | # Add CUA API key to .env.local if not already present 230 | if ! grep -q "CUA_API_KEY" "$DEMO_DIR/.env.local"; then 231 | echo "CUA_API_KEY=$CUA_API_KEY" >> "$DEMO_DIR/.env.local" 232 | echo "🔑 Added CUA_API_KEY to .env.local" 233 | elif grep -q "CUA_API_KEY=your_cua_api_key_here" "$DEMO_DIR/.env.local"; then 234 | # Update placeholder with actual key 235 | sed -i.bak "s/CUA_API_KEY=your_cua_api_key_here/CUA_API_KEY=$CUA_API_KEY/" "$DEMO_DIR/.env.local" 236 | echo "🔑 Updated CUA_API_KEY in .env.local" 237 | fi 238 | fi 239 | 240 | # Create a convenience script to run the demo 241 | cat > "$DEMO_DIR/start_ui.sh" << EOF 242 | #!/bin/bash 243 | source "$VENV_DIR/bin/activate" 244 | cd "$DEMO_DIR" 245 | python run_demo.py 246 | EOF 247 | chmod +x "$DEMO_DIR/start_ui.sh" 248 | 249 | echo "✅ Setup complete!" 
250 | 251 | if [[ "$USE_CLOUD" == "true" ]]; then 252 | # Create run_demo.py for cloud sandbox 253 | cat > "$DEMO_DIR/run_demo.py" << 'EOF' 254 | import asyncio 255 | import os 256 | from pathlib import Path 257 | from dotenv import load_dotenv 258 | from computer import Computer 259 | from agent import ComputerAgent, LLM, AgentLoop, LLMProvider 260 | from agent.ui.gradio.ui_components import create_gradio_ui 261 | 262 | # Load environment variables from .env.local 263 | load_dotenv(Path(__file__).parent / ".env.local") 264 | 265 | # Check for required API keys 266 | cua_api_key = os.environ.get("CUA_API_KEY", "") 267 | if not cua_api_key: 268 | print("\n❌ CUA_API_KEY not found in .env.local file.") 269 | print("Please add your CUA API key to the .env.local file.") 270 | exit(1) 271 | 272 | openai_key = os.environ.get("OPENAI_API_KEY", "") 273 | anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "") 274 | 275 | if not openai_key and not anthropic_key: 276 | print("\n⚠️ No OpenAI or Anthropic API keys found in .env.local.") 277 | print("Please add at least one API key to use AI agents.") 278 | 279 | print("🚀 Starting CUA playground with Cloud Sandbox...") 280 | print("📝 Edit .env.local to update your API keys") 281 | 282 | # Launch the Gradio UI and open it in the browser 283 | app = create_gradio_ui() 284 | app.launch(share=False, inbrowser=True) 285 | EOF 286 | else 287 | # Create run_demo.py for local macOS VMs 288 | cat > "$DEMO_DIR/run_demo.py" << 'EOF' 289 | import asyncio 290 | import os 291 | from pathlib import Path 292 | from dotenv import load_dotenv 293 | from computer import Computer 294 | from agent import ComputerAgent, LLM, AgentLoop, LLMProvider 295 | from agent.ui.gradio.ui_components import create_gradio_ui 296 | 297 | # Load environment variables from .env.local 298 | load_dotenv(Path(__file__).parent / ".env.local") 299 | 300 | # Try to load API keys from environment 301 | openai_key = os.environ.get("OPENAI_API_KEY", "") 302 | anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "") 303 | 304 | if not openai_key and not anthropic_key: 305 | print("\n⚠️ No OpenAI or Anthropic API keys found in .env.local.") 306 | print("Please add at least one API key to use AI agents.") 307 | 308 | print("🚀 Starting CUA playground with local macOS VMs...") 309 | print("📝 Edit .env.local to update your API keys") 310 | 311 | # Launch the Gradio UI and open it in the browser 312 | app = create_gradio_ui() 313 | app.launch(share=False, inbrowser=True) 314 | EOF 315 | fi 316 | 317 | echo "☁️ CUA Cloud Sandbox setup complete!" 318 | echo "📝 Edit $DEMO_DIR/.env.local to update your API keys" 319 | echo "🖥️ Start the playground by running: $DEMO_DIR/start_ui.sh" 320 | 321 | # Check if the VM is running (only for local setup) 322 | if [[ "$USE_CLOUD" == "false" ]]; then 323 | echo "🔍 Checking if the macOS CUA VM is running..." 324 | VM_RUNNING=$(lume ls | grep "macos-sequoia-cua" | grep "running" || echo "") 325 | 326 | if [ -z "$VM_RUNNING" ]; then 327 | echo "🚀 Starting the macOS CUA VM in the background..." 328 | lume run macos-sequoia-cua:latest & 329 | # Wait a moment for the VM to initialize 330 | sleep 5 331 | echo "✅ VM started successfully." 332 | else 333 | echo "✅ macOS CUA VM is already running." 334 | fi 335 | fi 336 | 337 | # Ask if the user wants to start the demo now 338 | echo 339 | read -p "Would you like to start the Cua Computer-Use Agent UI now? (y/n) " -n 1 -r 340 | echo 341 | if [[ $REPLY =~ ^[Yy]$ ]]; then 342 | echo "🚀 Starting the Cua Computer-Use Agent UI..." 
343 | echo "" 344 | "$DEMO_DIR/start_ui.sh" 345 | fi 346 | ``` -------------------------------------------------------------------------------- /libs/python/som/som/visualization.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List, Dict, Any, Tuple 2 | import numpy as np 3 | from PIL import Image, ImageDraw, ImageFont 4 | import supervision as sv 5 | import platform 6 | import os 7 | import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class BoxAnnotator: 13 | """Class for drawing bounding boxes and labels on images.""" 14 | 15 | def __init__(self): 16 | """Initialize the box annotator with a color palette.""" 17 | # WCAG 2.1 compliant color palette optimized for accessibility 18 | self.colors = [ 19 | "#2E7D32", # Green 20 | "#C62828", # Red 21 | "#1565C0", # Blue 22 | "#6A1B9A", # Purple 23 | "#EF6C00", # Orange 24 | "#283593", # Indigo 25 | "#4527A0", # Deep Purple 26 | "#00695C", # Teal 27 | "#D84315", # Deep Orange 28 | "#1B5E20", # Dark Green 29 | "#B71C1C", # Dark Red 30 | "#0D47A1", # Dark Blue 31 | "#4A148C", # Dark Purple 32 | "#E65100", # Dark Orange 33 | "#1A237E", # Dark Indigo 34 | "#311B92", # Darker Purple 35 | "#004D40", # Dark Teal 36 | "#BF360C", # Darker Orange 37 | "#33691E", # Darker Green 38 | "#880E4F", # Pink 39 | ] 40 | self.color_index = 0 41 | self.default_font = None 42 | self._initialize_font() 43 | 44 | def _initialize_font(self) -> None: 45 | """Initialize the default font.""" 46 | # Try to load a system font first 47 | system = platform.system() 48 | font_paths = [] 49 | 50 | if system == "Darwin": # macOS 51 | font_paths = [ 52 | "/System/Library/Fonts/Helvetica.ttc", 53 | "/System/Library/Fonts/Arial.ttf", 54 | "/Library/Fonts/Arial.ttf", 55 | ] 56 | elif system == "Linux": 57 | font_paths = [ 58 | "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 59 | "/usr/share/fonts/TTF/DejaVuSans.ttf", 60 | "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf", 61 | ] 62 | else: # Windows 63 | font_paths = ["C:\\Windows\\Fonts\\arial.ttf"] 64 | 65 | # Try each font path 66 | for font_path in font_paths: 67 | if os.path.exists(font_path): 68 | try: 69 | # Test the font with a small size 70 | test_font = ImageFont.truetype(font_path, 12) 71 | # Test if the font can render text 72 | test_font.getbbox("1") 73 | self.default_font = font_path 74 | return 75 | except Exception: 76 | continue 77 | 78 | def _get_next_color(self) -> str: 79 | """Get the next color from the palette.""" 80 | color = self.colors[self.color_index] 81 | self.color_index = (self.color_index + 1) % len(self.colors) 82 | return color 83 | 84 | def _hex_to_rgb(self, hex_color: str) -> Tuple[int, int, int]: 85 | """Convert hex color to RGB tuple.""" 86 | hex_color = hex_color.lstrip("#") 87 | # Create explicit tuple of 3 integers to match the return type 88 | r = int(hex_color[0:2], 16) 89 | g = int(hex_color[2:4], 16) 90 | b = int(hex_color[4:6], 16) 91 | return (r, g, b) 92 | 93 | def draw_boxes( 94 | self, image: Image.Image, detections: List[Dict[str, Any]], draw_config: Dict[str, Any] 95 | ) -> Image.Image: 96 | """Draw bounding boxes and labels on the image.""" 97 | draw = ImageDraw.Draw(image) 98 | 99 | # Create smaller font while keeping contrast 100 | try: 101 | if self.default_font: 102 | font = ImageFont.truetype(self.default_font, size=12) # Reduced from 16 to 12 103 | else: 104 | # If no TrueType font available, use default 105 | font = ImageFont.load_default() 106 | except Exception: 
107 | font = ImageFont.load_default() 108 | 109 | padding = 2 # Reduced padding for smaller overall box 110 | spacing = 1 # Reduced spacing between elements 111 | 112 | # Keep track of used label areas to check for collisions 113 | used_areas = [] 114 | 115 | # Store label information for third pass 116 | labels_to_draw = [] 117 | 118 | # First pass: Initialize used_areas with all bounding boxes 119 | for detection in detections: 120 | box = detection["bbox"] 121 | x1, y1, x2, y2 = [ 122 | int(coord * dim) for coord, dim in zip(box, [image.width, image.height] * 2) 123 | ] 124 | used_areas.append((x1, y1, x2, y2)) 125 | 126 | # Second pass: Draw all bounding boxes 127 | for idx, detection in enumerate(detections, 1): 128 | # Get box coordinates 129 | box = detection["bbox"] 130 | x1, y1, x2, y2 = [ 131 | int(coord * dim) for coord, dim in zip(box, [image.width, image.height] * 2) 132 | ] 133 | 134 | # Get color for this detection 135 | color = self._get_next_color() 136 | rgb_color = self._hex_to_rgb(color) 137 | 138 | # Draw bounding box with original width 139 | draw.rectangle(((x1, y1), (x2, y2)), outline=rgb_color, width=2) 140 | 141 | # Use detection number as label 142 | label = str(idx) 143 | 144 | # Get text dimensions using getbbox 145 | bbox = font.getbbox(label) 146 | text_width = bbox[2] - bbox[0] 147 | text_height = bbox[3] - bbox[1] 148 | 149 | # Create box dimensions with padding 150 | box_width = text_width + (padding * 2) # Removed multiplier for tighter box 151 | box_height = text_height + (padding * 2) # Removed multiplier for tighter box 152 | 153 | def is_inside_bbox(x, y): 154 | """Check if a label box would be inside the bounding box.""" 155 | return x >= x1 and x + box_width <= x2 and y >= y1 and y + box_height <= y2 156 | 157 | # Try different positions until we find one without collision 158 | positions = [ 159 | # Top center (above bbox) 160 | lambda: (x1 + ((x2 - x1) - box_width) // 2, y1 - box_height - spacing), 161 | # Bottom center (below bbox) 162 | lambda: (x1 + ((x2 - x1) - box_width) // 2, y2 + spacing), 163 | # Right center (right of bbox) 164 | lambda: (x2 + spacing, y1 + ((y2 - y1) - box_height) // 2), 165 | # Left center (left of bbox) 166 | lambda: (x1 - box_width - spacing, y1 + ((y2 - y1) - box_height) // 2), 167 | # Top right (outside corner) 168 | lambda: (x2 + spacing, y1 - box_height - spacing), 169 | # Top left (outside corner) 170 | lambda: (x1 - box_width - spacing, y1 - box_height - spacing), 171 | # Bottom right (outside corner) 172 | lambda: (x2 + spacing, y2 + spacing), 173 | # Bottom left (outside corner) 174 | lambda: (x1 - box_width - spacing, y2 + spacing), 175 | ] 176 | 177 | def check_occlusion(x, y): 178 | """Check if a label box occludes any existing ones or is inside bbox.""" 179 | # First check if it's inside the bounding box 180 | if is_inside_bbox(x, y): 181 | return True 182 | 183 | # Then check collision with other labels 184 | new_box = (x, y, x + box_width, y + box_height) 185 | label_width = new_box[2] - new_box[0] 186 | label_height = new_box[3] - new_box[1] 187 | 188 | for used_box in used_areas: 189 | if not ( 190 | new_box[2] < used_box[0] # new box is left of used box 191 | or new_box[0] > used_box[2] # new box is right of used box 192 | or new_box[3] < used_box[1] # new box is above used box 193 | or new_box[1] > used_box[3] # new box is below used box 194 | ): 195 | # Calculate dimensions of the used box 196 | used_box_width = used_box[2] - used_box[0] 197 | used_box_height = used_box[3] - used_box[1] 198 | 199 | 
# Only consider as collision if used box is NOT more than 5x bigger in both dimensions 200 | if not (used_box_width > 5 * label_width and used_box_height > 5 * label_height): 201 | return True 202 | return False 203 | 204 | # Try each position until we find one without collision 205 | label_x = None 206 | label_y = None 207 | 208 | for get_pos in positions: 209 | x, y = get_pos() 210 | # Ensure position is within image bounds 211 | if x < 0 or y < 0 or x + box_width > image.width or y + box_height > image.height: 212 | continue 213 | if not check_occlusion(x, y): 214 | label_x = x 215 | label_y = y 216 | break 217 | 218 | # If all positions collide or are out of bounds, find the best possible position 219 | if label_x is None: 220 | # Try to place it in the nearest valid position outside the bbox 221 | best_pos = positions[0]() # Default to top center 222 | label_x = max(0, min(image.width - box_width, best_pos[0])) 223 | label_y = max(0, min(image.height - box_height, best_pos[1])) 224 | 225 | # Ensure it's not inside the bounding box 226 | if is_inside_bbox(label_x, label_y): 227 | # Force it above the bounding box 228 | label_y = max(0, y1 - box_height - spacing) 229 | 230 | # Add this label area to used areas 231 | if ( 232 | label_x is not None 233 | and label_y is not None 234 | and box_width is not None 235 | and box_height is not None 236 | ): 237 | used_areas.append((label_x, label_y, label_x + box_width, label_y + box_height)) 238 | 239 | # Store label information for second pass 240 | labels_to_draw.append( 241 | { 242 | "label": label, 243 | "x": label_x, 244 | "y": label_y, 245 | "width": box_width, 246 | "height": box_height, 247 | "text_width": text_width, 248 | "text_height": text_height, 249 | "color": rgb_color, 250 | } 251 | ) 252 | 253 | # Third pass: Draw all labels on top 254 | for label_info in labels_to_draw: 255 | # Draw background box with white outline 256 | draw.rectangle( 257 | ( 258 | (label_info["x"] - 1, label_info["y"] - 1), 259 | ( 260 | label_info["x"] + label_info["width"] + 1, 261 | label_info["y"] + label_info["height"] + 1, 262 | ), 263 | ), 264 | outline="white", 265 | width=2, 266 | ) 267 | draw.rectangle( 268 | ( 269 | (label_info["x"], label_info["y"]), 270 | (label_info["x"] + label_info["width"], label_info["y"] + label_info["height"]), 271 | ), 272 | fill=label_info["color"], 273 | ) 274 | 275 | # Center text in box 276 | text_x = label_info["x"] + (label_info["width"] - label_info["text_width"]) // 2 277 | text_y = label_info["y"] + (label_info["height"] - label_info["text_height"]) // 2 278 | 279 | # Draw text with black outline for better visibility 280 | outline_width = 1 281 | for dx in [-outline_width, outline_width]: 282 | for dy in [-outline_width, outline_width]: 283 | draw.text( 284 | (text_x + dx, text_y + dy), label_info["label"], fill="black", font=font 285 | ) 286 | 287 | # Draw the main white text 288 | draw.text((text_x, text_y), label_info["label"], fill=(255, 255, 255), font=font) 289 | 290 | logger.info("Finished drawing all boxes") 291 | return image 292 | ``` -------------------------------------------------------------------------------- /examples/evals/wikipedia_most_linked.txt: -------------------------------------------------------------------------------- ``` 1 | ISBN (identifier) 2 | United States 3 | Main Page 4 | Tilde 5 | Doi (identifier) 6 | Fair use 7 | Association football 8 | Years 9 | Wayback Machine 10 | ISSN (identifier) 11 | India 12 | Wikimedia Foundation 13 | Wikidata 14 | Animal 15 | Taxonomy (biology) 
16 | Australia 17 | France 18 | Eukaryote 19 | IP address 20 | U.S. state 21 | Time zone 22 | City 23 | Copyright 24 | Canada 25 | Town 26 | ASCII 27 | Greek alphabet 28 | Typographic ligature 29 | Diacritical mark 30 | Wikipedia 31 | Germany 32 | Human settlement 33 | Open Tree of Life 34 | IMDb (identifier) 35 | United Kingdom 36 | Catalogue of Life 37 | Insect 38 | Russia 39 | Japan 40 | Italy 41 | Arthropod 42 | Television show 43 | Public domain 44 | INaturalist 45 | Poland 46 | England 47 | PMID (identifier) 48 | Daylight saving time 49 | S2CID (identifier) 50 | China 51 | Encyclopedia of Life 52 | Spain 53 | OCLC (identifier) 54 | Plant 55 | Flickr 56 | Wikispecies 57 | Africa 58 | Song 59 | Record label 60 | Lepidoptera 61 | Iran 62 | English language 63 | Music genre 64 | News aggregator 65 | Web feed 66 | Proxy server 67 | X-Forwarded-For 68 | College football 69 | World War II 70 | Brazil 71 | Sweden 72 | Politics 73 | Olympics 74 | Netherlands 75 | Record producer 76 | California 77 | New York City 78 | Surname 79 | The New York Times 80 | London 81 | New Zealand 82 | PMC (identifier) 83 | Logo 84 | Synonym (taxonomy) 85 | Switzerland 86 | Turkey 87 | Sport 88 | Video game 89 | Architecture 90 | Norway 91 | Bibcode (identifier) 92 | Mexico 93 | Botany 94 | JSTOR (identifier) 95 | Rail transport 96 | Field hockey 97 | Ireland 98 | Scotland 99 | Belgium 100 | South Africa 101 | Common name 102 | Professional sports 103 | Sport governing body 104 | Sport industry 105 | Olympic games 106 | Election 107 | Austria 108 | Ukraine 109 | Anthroponymy 110 | Pakistan 111 | Baseball 112 | Denmark 113 | Christianity 114 | Philippines 115 | Woman 116 | Romania 117 | Czech Republic 118 | Album 119 | Godzilla Minus One 120 | Single (music) 121 | Electoral reform 122 | Nofollow 123 | Basketball 124 | New York (state) 125 | Argentina 126 | Finland 127 | Soviet Union 128 | Greece 129 | Russian language 130 | Historic site 131 | Free content 132 | YouTube 133 | Catholic Church 134 | Hungary 135 | Kingdom Hearts 136 | Beetle 137 | Company 138 | Tetris 139 | Portugal 140 | BioShock 141 | Abandonware 142 | Deus Ex (video game) 143 | 4A Engine 144 | Yoshi's New Island 145 | Kaboom! (video game) 146 | Rain World 147 | Juno (Overwatch) 148 | Crash Team Rumble 149 | Vault 101 150 | Tales of Commons 151 | NHL Hockey 152 | Clutch Gaming 153 | Haseo 154 | Allin Kempthorne 155 | Ilyas El Maliki 156 | Ratalaika Games 157 | 3D mousepad 158 | HaptX 159 | Walid Sultan Midani 160 | Rustler (video game) 161 | Look Outside 162 | Ducks Ahoy! 
163 | Fusion Engine 164 | Cricket 165 | Geography 166 | Chordate 167 | The Guardian 168 | Israel 169 | Billboard (magazine) 170 | Ice hockey 171 | Given name 172 | Chicago 173 | World War I 174 | Pennsylvania 175 | Indonesia 176 | Alma mater 177 | Vascular plant 178 | Amorphea 179 | Wikimedia Commons 180 | Novel 181 | Village 182 | Visual arts 183 | Film poster 184 | Flowering plant 185 | Opisthokont 186 | Obazoa 187 | County seat 188 | Short story 189 | First-class cricket 190 | Law 191 | Europe 192 | University 193 | Croatia 194 | Sport of athletics 195 | Holozoa 196 | Choanozoa 197 | Filozoa 198 | German language 199 | Tennis 200 | Eumetazoa 201 | Serbia 202 | ParaHoxozoa 203 | Thailand 204 | History 205 | Midfielder 206 | Bilateria 207 | Unincorporated area 208 | French language 209 | AllMusic 210 | Astronomy 211 | Nephrozoa 212 | Novella 213 | Ship 214 | Twitter 215 | Character (arts) 216 | College 217 | Malaysia 218 | Conflict of interest 219 | Higher education 220 | IUCN Red List 221 | Rock music 222 | Gastropoda 223 | Creative Commons 224 | Wales 225 | Bulgaria 226 | UTC+2 227 | Paris 228 | Species 229 | Illinois 230 | HTML element 231 | South Korea 232 | BBC 233 | Persian language 234 | Moth 235 | Conservation status 236 | Pop music 237 | Colombia 238 | Wicket 239 | American football 240 | Jazz 241 | World Flora Online 242 | Los Angeles 243 | Songwriter 244 | Hong Kong 245 | Hdl (identifier) 246 | Genus 247 | Spanish language 248 | Egypt 249 | Not out 250 | Slovenia 251 | Chile 252 | Korea 253 | Tropicos 254 | Slovakia 255 | Bishop 256 | Family (biology) 257 | Rugby union 258 | Women's history 259 | Nigeria 260 | College basketball 261 | Sports Reference 262 | Washington, D.C. 263 | GFDL 264 | Afghanistan 265 | Sri Lanka 266 | Newspapers.com 267 | UTC+1 268 | Eudicots 269 | Estonia 270 | Los Angeles Times 271 | Olympedia 272 | Bangladesh 273 | Peru 274 | Singapore 275 | Typographical error 276 | UTC 277 | Virginia 278 | Taiwan 279 | Fast bowling 280 | COVID-19 pandemic 281 | Food 282 | Fish 283 | River 284 | Republic of Ireland 285 | Beer 286 | Caribbean 287 | Michigan 288 | Drink 289 | Chinese language 290 | Business 291 | Leg break 292 | Women's Test cricket 293 | Women's cricket 294 | Innings 295 | New Jersey 296 | Protostome 297 | Spin bowling 298 | Sugar 299 | Underarm bowling 300 | Roger Federer 301 | Googly 302 | Apple 303 | Comics 304 | Cricket Australia XI 305 | Fair and unfair play 306 | Anime 307 | Rafael Nadal 308 | Leander Paes 309 | Kazakhstan 310 | Capital city 311 | Blessed Virgin Mary 312 | Venezuela 313 | Case sensitivity 314 | Arabic language 315 | North America 316 | Texas 317 | Burger King 318 | The Plant List 319 | Justine Henin 320 | Sushi 321 | Angelus 322 | Beef 323 | Sanctification 324 | Cuthbert Tunstall 325 | Bread 326 | Saint Mungo 327 | Incumbent 328 | Americanism (heresy) 329 | Curry 330 | Ensoulment 331 | Associated Press 332 | Adolph John Paschang 333 | French cuisine 334 | Altar Society 335 | UTC-5 336 | Philadelphia 337 | Bill Mallon 338 | Yogurt 339 | Soy sauce 340 | Open Era (tennis) 341 | Belarus 342 | Manga 343 | English Wikipedia 344 | Islam 345 | Trademark 346 | ISO 4 347 | Wisconsin 348 | Lithuania 349 | The Washington Post 350 | Agaricus bisporus 351 | Reptile 352 | Sociology 353 | Organizations 354 | Death 355 | Ham and eggs 356 | Asia 357 | Swimming (sport) 358 | South America 359 | Northern Ireland 360 | Observation.org 361 | European Union 362 | Astronomical object 363 | Georgia (U.S. 
state) 364 | Gmina 365 | Provinces of Iran 366 | Computing 367 | Counties of Iran 368 | Discogs 369 | Mathematics 370 | Powiat 371 | Missouri 372 | Bachelor of Arts 373 | Iran Standard Time 374 | Florida 375 | Bakhsh 376 | Minnesota 377 | Oregon 378 | Nepal 379 | Variety (magazine) 380 | Japanese language 381 | Journalism 382 | Rome 383 | Computer 384 | Ohio 385 | Ontario 386 | Internet Archive 387 | Latvia 388 | Comedy 389 | Azerbaijan 390 | BBC News 391 | Morocco 392 | Ecdysozoa 393 | Print-on-demand 394 | Bengali language 395 | A5 paper 396 | Pedia Press 397 | Education 398 | Mollusca 399 | American Civil War 400 | Berlin 401 | Taxon 402 | Maryland 403 | Panarthropoda 404 | Hebrew language 405 | Toronto 406 | Tactopoda 407 | Episode 408 | Cuba 409 | Country music 410 | Religion 411 | Rotten Tomatoes 412 | Georgia (country) 413 | Classical music 414 | Month 415 | Puerto Rico 416 | GEOnet Names Server 417 | Sydney 418 | The Times 419 | Iraq 420 | Polyphaga 421 | Derivative work 422 | Lisbon 423 | Syria 424 | Ecuador 425 | Uzbekistan 426 | Greek language 427 | Latin 428 | United Nations 429 | Literature 430 | Animation 431 | Physics 432 | Amphibian 433 | Romanize 434 | List of countries 435 | Moscow 436 | Politician 437 | Philosophy 438 | Metacritic 439 | Mammal 440 | Pinyin 441 | Open access 442 | New South Wales 443 | Theatre 444 | Allmusic 445 | Syntax 446 | Women in music 447 | Fly 448 | Colorado 449 | Academic journal 450 | LGBTQ 451 | Seal (emblem) 452 | Rolling Stone 453 | Saudi Arabia 454 | Science fiction 455 | Tweet (social media) 456 | Heavy metal music 457 | Boston 458 | Vietnam 459 | Molecular biology 460 | Facebook 461 | Iceland 462 | Albania 463 | Cycling 464 | Tennessee 465 | Armenia 466 | Massachusetts 467 | Mandibulata 468 | United States Navy 469 | Communes of France 470 | Census 471 | Algeria 472 | United States Army 473 | Wikilink 474 | Pancrustacea 475 | Alternative rock 476 | American English 477 | Radio stations 478 | History of Romania 479 | Endemism 480 | San Francisco 481 | Award 482 | Ghana 483 | Judaism 484 | Alabama 485 | Blog 486 | The Independent 487 | Melbourne 488 | Cantons of France 489 | Lebanon 490 | West Germany 491 | Quotation mark 492 | Regions of France 493 | Chernivtsi Oblast 494 | Tokyo 495 | Italian language 496 | Connecticut 497 | Country 498 | Screenshot 499 | Ghost town 500 | Iran Daylight Time 501 | NatureServe 502 | Mongolia 503 | Cyprus 504 | Northern Bukovina 505 | Rugby league 506 | Northern Bessarabia 507 | State highway 508 | Harvard University 509 | Yorkshire 510 | Pterygota 511 | Slash (punctuation) 512 | Prize 513 | Science 514 | Asian Games 515 | Eastern Time Zone 516 | Myanmar 517 | Nazi Germany 518 | Ottoman Empire 519 | Quebec 520 | Billboard Hot 100 521 | United Arab Emirates 522 | Neoptera 523 | Hexapoda 524 | Least Concern 525 | Type species 526 | EPPO Code 527 | Wikisource 528 | Kyrgyzstan 529 | Allotriocarida 530 | Volleyball 531 | Geology 532 | Second World War 533 | British Columbia 534 | Socialism 535 | Zoology 536 | The Daily Telegraph 537 | Paleontology 538 | Vienna 539 | Dicondylia 540 | BugGuide 541 | United States Senate 542 | Hermit crab 543 | Paraphrase 544 | CNN 545 | Royal Navy 546 | Indian Standard Time 547 | Billboard 200 548 | Kenya 549 | DVD 550 | Sipuncula 551 | Tajikistan 552 | National park 553 | Economics 554 | Heterocyathus 555 | Uruguay 556 | Heteropsammia 557 | Road 558 | Spanish name 559 | Luxembourg 560 | Korean language 561 | UK Singles Chart 562 | Queensland 563 | Montreal 564 | New York Times 565 
| Bolivia 566 | CP/M 567 | Timestamp 568 | Electronic music 569 | INSEE code 570 | ArXiv (identifier) 571 | PubMed 572 | SVG 573 | USA Today 574 | Omnivore 575 | Tunisia 576 | Psychology 577 | ESPN 578 | UEFA 579 | Hawaii 580 | Gastropod 581 | Aliyah 582 | North Carolina 583 | Russian Empire 584 | Tibet 585 | Fungi 586 | Oklahoma 587 | Fauna Europaea 588 | Turkmenistan 589 | British English 590 | The London Gazette 591 | Civil township 592 | Boxing 593 | Barack Obama 594 | Animal Diversity Web 595 | Reuters 596 | Eumetabola 597 | Voter turnout 598 | Transport 599 | False positive 600 | Donald Trump 601 | Kansas 602 | Antarctica 603 | Lake 604 | Ethiopia 605 | Time (magazine) 606 | Marriage 607 | NBC 608 | Beijing 609 | Vertebrate 610 | Czechoslovakia 611 | Protected area 612 | Energy 613 | Poetry 614 | Archaeology 615 | Columbia University 616 | Poverty line 617 | Alaska 618 | Computing platform 619 | British Empire 620 | University of Oxford 621 | Costa Rica 622 | Dublin 623 | A-side and B-side 624 | ZIP code 625 | Actinopterygii 626 | UTC-6 627 | Photoperiodism 628 | Mayor 629 | Sphaeriidae 630 | Animal suicide 631 | Atka mackerel 632 | Starling 633 | Arizona 634 | Entertainment Weekly 635 | Sphaerium beckmani 636 | Junqueira cow 637 | Zaniolepis frenata 638 | Campocraspedon 639 | Zimbabwe 640 | Motorsport 641 | Bird flight 642 | Cnemophilidae 643 | Hinduism 644 | Phalarope 645 | Indiana 646 | Museums 647 | Holometabola 648 | Pytilia 649 | North Macedonia 650 | Malta 651 | Cathartiformes 652 | Darter 653 | Saker falcon 654 | Cathartes 655 | Avian malaria 656 | Coal tit 657 | Magpie duck 658 | Video game developer 659 | Bird bath 660 | Vesper sparrow 661 | Gouldian finch 662 | Debeaking 663 | Vector graphics 664 | Semiplumbeous hawk 665 | Scottish crossbill 666 | Bullfinch 667 | Fregata 668 | Nidicolous 669 | Plushcap 670 | Pallid scops owl 671 | Hip-hop 672 | Blyth's frogmouth 673 | Sunda scops owl 674 | Argus (bird) 675 | Operation Migration 676 | Nik Borrow 677 | Per capita income 678 | Guy Oseary 679 | Madrid 680 | Buddhism 681 | Drainage basin 682 | Sephardic Haredim 683 | Rami Kleinstein 684 | Guy Bavli 685 | David Bar-Hayim 686 | Levin Kipnis 687 | Edna Arbel 688 | Prisoner of Zion 689 | Ayala Procaccia 690 | Nachum Heiman 691 | Zman Tel Aviv 692 | CBS 693 | ARIA Charts 694 | Cucujiformia 695 | Away colours 696 | Regex 697 | 2019 African Games 698 | 1962 Asian Games 699 | 1958 Asian Games 700 | Chemistry 701 | Olympic Games 702 | The Middle Ages 703 | Central Asia 704 | Bengalis 705 | Southeast Asia 706 | Find a Grave 707 | Microsoft Windows 708 | Swing (politics) 709 | White (U.S. 
Census) 710 | Roman Catholic 711 | Maine 712 | The Times of India 713 | Season (sports) 714 | Jamaica 715 | Video game genre 716 | Munich 717 | Asterids 718 | Rosids 719 | Golf 720 | Language 721 | Hangul 722 | Atlanta 723 | Glasgow 724 | UTC+3 725 | Library of Congress 726 | Deuterostome 727 | COVID-19 728 | Video game publisher 729 | Montenegro 730 | ESPNcricinfo 731 | Brand 732 | UTC-4 733 | IGN 734 | Stockholm 735 | Istanbul 736 | NASA 737 | Gnathostomata 738 | Ukrainian language 739 | Human rights 740 | Chicago Tribune 741 | ProQuest 742 | IMDb 743 | River mouth 744 | Hip hop music 745 | Gene 746 | Netflix 747 | Moldova 748 | Barcelona 749 | Paraguay 750 | Olfactores 751 | Labour Party (UK) 752 | United States dollar 753 | Qatar 754 | Photography 755 | Guatemala 756 | Summit 757 | Cold War 758 | Running 759 | First World War 760 | Precipitation 761 | Edinburgh 762 | Amsterdam 763 | Lima 764 | New Eskaton 765 | Computer program 766 | Xinjiang 767 | Women in science 768 | Manhattan 769 | Warsaw 770 | Magazine 771 | Horror film 772 | Deadline Hollywood 773 | Jordan 774 | Aparaglossata 775 | Agriculture 776 | Internet 777 | Prague 778 | The Hindu 779 | Cretaceous 780 | Latino (U.S. Census) 781 | Vietnam War 782 | Music download 783 | Encyclopedia 784 | Chemical compounds 785 | Pittsburgh 786 | Soap opera 787 | Budapest 788 | George W. Bush 789 | Seattle 790 | Extended play 791 | Washington (state) 792 | Listed building 793 | Palestine 794 | LCCN (identifier) 795 | Portland, Oregon 796 | Panama 797 | Plagiarism 798 | Brooklyn 799 | Teleostomi 800 | Manchester 801 | Bird 802 | Mollusk 803 | Automobile 804 | Historic England 805 | Linguistics 806 | Dependent territory 807 | Athens 808 | Civil engineering 809 | Sea snail 810 | Population density 811 | Finance 812 | Disaster management 813 | Tanzania 814 | Jurassic 815 | Districts of Russia 816 | Western Australia 817 | Louisiana 818 | Portuguese language 819 | Anatomy 820 | The Beatles 821 | Tamil language 822 | Milan 823 | Uganda 824 | Natural environment 825 | FIFA 826 | Cameroon 827 | Blu-ray 828 | Mexico City 829 | Chemical formula 830 | Jimmy Wales 831 | Papua New Guinea 832 | Diaphoretickes 833 | UNESCO 834 | Forbes 835 | Technology 836 | Buenos Aires 837 | Vancouver 838 | Dominican Republic 839 | 2007 840 | Species description 841 | East Germany 842 | Folk music 843 | Kentucky 844 | Multimedia 845 | Monocotyledon 846 | Rio de Janeiro 847 | Automated 848 | Hindi 849 | Houston 850 | Google 851 | Devonian 852 | Member of Parliament 853 | Bible 854 | Mumbai 855 | FishBase 856 | African diaspora 857 | Carboniferous 858 | Cambrian 859 | Triassic 860 | Montana 861 | Handball 862 | Ordovician 863 | San Diego 864 | Archive.today 865 | Stanford University 866 | British Army 867 | Middle Ages 868 | Frequency 869 | Ultratop 870 | Permian 871 | Detroit 872 | Earth 873 | Precambrian 874 | Hamburg 875 | Alberta 876 | Tamil Nadu 877 | Madagascar 878 | Lancashire 879 | Guitar 880 | Trade union 881 | Instagram 882 | Engineering 883 | 2006 884 | Silurian 885 | NPR 886 | Railway station 887 | CAS Registry Number 888 | Yemen 889 | Noctuoidea 890 | Fiji 891 | Haiti 892 | Rowing (sport) 893 | New Orleans 894 | NME 895 | Alternative media 896 | North Korea 897 | Microsoft 898 | Jerusalem 899 | Paleogene 900 | Audery Mill Creek 901 | Horse racing 902 | Post town 903 | Piano 904 | Bavaria 905 | Polish language 906 | Horror fiction 907 | Neogene 908 | Kerala 909 | Copenhagen 910 | Google Books 911 | Central Time Zone 912 | Island 913 | Birmingham 914 | 
Anglicanism 915 | Software 916 | Mountain range 917 | Investment 918 | Brussels 919 | Muhammad Ali 920 | Asian (U.S. Census) 921 | Video game culture 922 | Brisbane 923 | Church of England 924 | Kosovo 925 | Bachelor of Science 926 | Molar mass 927 | Arachnid 928 | Own goal 929 | Yale University 930 | Caenogastropoda 931 | Auckland 932 | World Athletics 933 | Trinidad and Tobago 934 | Hanyu Pinyin 935 | Sound bite 936 | Time 937 | El Salvador 938 | Microbiology 939 | Columbia Records 940 | Seoul 941 | Cerambycidae 942 | Maharashtra 943 | Chelicerata 944 | Fungus 945 | Media influence 946 | South Carolina 947 | Radio 948 | Telenovela 949 | FA Cup 950 | Senegal 951 | Internet trolling 952 | Nashville, Tennessee 953 | Demonym 954 | Standard Chinese 955 | Sculpture 956 | Liverpool 957 | Thesis 958 | Bass guitar 959 | Chess 960 | Women artists 961 | Icon (computing) 962 | PubChem 963 | UK Albums Chart 964 | Head coach 965 | Roman Empire 966 | Grand Slam (tennis) 967 | JSmol 968 | Formula One 969 | Biology 970 | Kent 971 | Ancient Rome 972 | Inner Carniola 973 | Oslo 974 | Dutch language 975 | Wingspan 976 | Archaeplastida 977 | MTV 978 | Edvard Ravnikar 979 | ITunes 980 | Feminism 981 | German Empire 982 | Pacific Ocean 983 | Atlantic Ocean 984 | Pharmacology 985 | Track gauge 986 | ChemSpider 987 | Doctor of Philosophy 988 | Regions of England 989 | Districts of England 990 | Christmas 991 | Pavel Golia 992 | Predjama Castle 993 | Overtime (sports) 994 | Forum 995 | Swiss Hitparade 996 | Stumped 997 | Majority 998 | Male 999 | Shanghai 1000 | Siddharta (band) ``` -------------------------------------------------------------------------------- /blog/training-computer-use-models-trajectories-1.md: -------------------------------------------------------------------------------- ```markdown 1 | # Training Computer-Use Models: Creating Human Trajectories with Cua 2 | 3 | *Published on May 1, 2025 by Dillon DuPont* 4 | 5 | In our previous posts, we covered [building your own Computer-Use Operator](build-your-own-operator-on-macos-1) and [using the Agent framework](build-your-own-operator-on-macos-2) to simplify development. Today, we'll focus on a critical aspect of improving computer-use agents and models: gathering high-quality demonstration data using Cua's Computer-Use Interface (CUI) and its Gradio UI to create and share human-generated trajectories. 6 | 7 | Why is this important? Underlying models used by Computer-use agents need examples of how humans interact with computers to learn effectively. By creating a dataset of diverse, well-executed tasks, we can help train better models that understand how to navigate user interfaces and accomplish real tasks. 8 | 9 | <video src="https://github.com/user-attachments/assets/c586d460-3877-4b5f-a736-3248886d2134" controls width="600"></video> 10 | 11 | 12 | ## What You'll Learn 13 | 14 | By the end of this tutorial, you'll be able to: 15 | - Set up the Computer-Use Interface (CUI) with Gradio UI support 16 | - Record your own computer interaction trajectories 17 | - Organize and tag your demonstrations 18 | - Upload your datasets to Hugging Face for community sharing 19 | - Contribute to improving computer-use AI for everyone 20 | 21 | **Prerequisites:** 22 | - macOS Sonoma (14.0) or later 23 | - Python 3.10+ 24 | - Basic familiarity with Python and terminal commands 25 | - A Hugging Face account (for uploading datasets) 26 | 27 | **Estimated Time:** 20-30 minutes 28 | 29 | ## Understanding Human Trajectories 30 | 31 | ### What are Human Trajectories? 
32 | 33 | Human trajectories, in the context of Computer-use AI Agents, are recordings of how humans interact with computer interfaces to complete tasks. These interactions include: 34 | 35 | - Mouse movements, clicks, and scrolls 36 | - Keyboard input 37 | - Changes in the UI state 38 | - Time spent on different elements 39 | 40 | These trajectories serve as examples for AI models to learn from, helping them understand the relationship between: 41 | 1. The visual state of the screen 42 | 2. The user's goal or task 43 | 3. The most appropriate action to take 44 | 45 | ### Why Human Demonstrations Matter 46 | 47 | Unlike synthetic data or rule-based automation, human demonstrations capture the nuanced decision-making that happens during computer interaction: 48 | 49 | - **Natural Pacing**: Humans pause to think, accelerate through familiar patterns, and adjust to unexpected UI changes 50 | - **Error Recovery**: Humans demonstrate how to recover from mistakes or handle unexpected states 51 | - **Context-Sensitive Actions**: The same UI element might be used differently depending on the task context 52 | 53 | By contributing high-quality demonstrations, you're helping to create more capable, human-like computer-use AI systems. 54 | 55 | ## Setting Up Your Environment 56 | 57 | ### Installing the CUI with Gradio Support 58 | 59 | The Computer-Use Interface includes an optional Gradio UI specifically designed to make recording and sharing demonstrations easy. Let's set it up: 60 | 61 | 1. **Create a Python environment** (optional but recommended): 62 | ```bash 63 | # Using conda 64 | conda create -n cua-trajectories python=3.10 65 | conda activate cua-trajectories 66 | 67 | # Using venv 68 | python -m venv cua-trajectories 69 | source cua-trajectories/bin/activate # On macOS/Linux 70 | ``` 71 | 72 | 2. **Install the CUI package with UI support**: 73 | ```bash 74 | pip install "cua-computer[ui]" 75 | ``` 76 | 77 | 3. **Set up your Hugging Face access token**: 78 | Create a `.env` file in your project directory and add your Hugging Face token: 79 | ```bash 80 | echo "HF_TOKEN=your_huggingface_token" > .env 81 | ``` 82 | You can get your token from your [Hugging Face account settings](https://huggingface.co/settings/tokens). 83 | 84 | ### Understanding the Gradio UI 85 | 86 | The Computer-Use Interface Gradio UI provides three main components: 87 | 88 | 1. **Recording Panel**: Captures your screen, mouse, and keyboard activity during demonstrations 89 | 2. **Review Panel**: Allows you to review, tag, and organize your demonstration recordings 90 | 3. **Upload Panel**: Lets you share your demonstrations with the community via Hugging Face 91 | 92 | The UI is designed to make the entire process seamless, from recording to sharing, without requiring deep technical knowledge of the underlying systems. 
93 | 94 | ## Creating Your First Trajectory Dataset 95 | 96 | ### Launching the UI 97 | 98 | To get started, create a simple Python script to launch the Gradio UI: 99 | 100 | ```python 101 | # launch_trajectory_ui.py 102 | from computer.ui.gradio.app import create_gradio_ui 103 | from dotenv import load_dotenv 104 | 105 | # Load your Hugging Face token from .env 106 | load_dotenv('.env') 107 | 108 | # Create and launch the UI 109 | app = create_gradio_ui() 110 | app.launch(share=False) 111 | ``` 112 | 113 | Run this script to start the UI: 114 | 115 | ```bash 116 | python launch_trajectory_ui.py 117 | ``` 118 | 119 | ### Recording a Demonstration 120 | 121 | Let's walk through the process of recording your first demonstration: 122 | 123 | 1. **Start the VM**: Click the "Initialize Computer" button in the UI to initialize a fresh macOS sandbox. This ensures your demonstrations are clean and reproducible. 124 | 2. **Perform a Task**: Complete a simple task like creating a document, organizing files, or searching for information. Natural, everyday tasks make the best demonstrations. 125 | 3. **Review Recording**: Click the "Conversation Logs" or "Function Logs" tabs to review your captured interactions, making sure there is no personal information that you wouldn't want to share. 126 | 4. **Add Metadata**: In the "Save/Share Demonstrations" tab, give your recording a descriptive name (e.g., "Creating a Calendar Event") and add relevant tags (e.g., "productivity", "time-management"). 127 | 5. **Save Your Demonstration**: Click "Save" to store your recording locally. 128 | 129 | <video src="https://github.com/user-attachments/assets/de3c3477-62fe-413c-998d-4063e48de176" controls width="600"></video> 130 | 131 | ### Key Tips for Quality Demonstrations 132 | 133 | To create the most valuable demonstrations: 134 | 135 | - **Start and end at logical points**: Begin with a clear starting state and end when the task is visibly complete 136 | - **Narrate your thought process**: Use the message input to describe what you're trying to do and why 137 | - **Move at a natural pace**: Don't rush or perform actions artificially slowly 138 | - **Include error recovery**: If you make a mistake, keep going and show how to correct it 139 | - **Demonstrate variations**: Record multiple ways to complete the same task 140 | 141 | ## Organizing and Tagging Demonstrations 142 | 143 | Effective tagging and organization make your demonstrations more valuable to researchers and model developers. Consider these tagging strategies: 144 | 145 | ### Task-Based Tags 146 | 147 | Describe what the demonstration accomplishes: 148 | - `web-browsing` 149 | - `document-editing` 150 | - `file-management` 151 | - `email` 152 | - `scheduling` 153 | 154 | ### Application Tags 155 | 156 | Identify the applications used: 157 | - `finder` 158 | - `safari` 159 | - `notes` 160 | - `terminal` 161 | - `calendar` 162 | 163 | ### Complexity Tags 164 | 165 | Indicate the difficulty level: 166 | - `beginner` 167 | - `intermediate` 168 | - `advanced` 169 | - `multi-application` 170 | 171 | ### UI Element Tags 172 | 173 | Highlight specific UI interactions: 174 | - `drag-and-drop` 175 | - `menu-navigation` 176 | - `form-filling` 177 | - `search` 178 | 179 | The Computer-Use Interface UI allows you to apply and manage these tags across all your saved demonstrations, making it easy to create cohesive, well-organized datasets. 
180 | 181 | <video src="https://github.com/user-attachments/assets/5ad1df37-026a-457f-8b49-922ae805faef" controls width="600"></video> 182 | 183 | ## Uploading to Hugging Face 184 | 185 | Sharing your demonstrations helps advance research in computer-use AI. The Gradio UI makes uploading to Hugging Face simple: 186 | 187 | ### Preparing for Upload 188 | 189 | 1. **Review Your Demonstrations**: Use the review panel to ensure all demonstrations are complete and correctly tagged. 190 | 191 | 2. **Select Demonstrations to Upload**: You can upload all demonstrations or filter by specific tags. 192 | 193 | 3. **Configure Dataset Information**: 194 | - **Repository Name**: Format as `{your_username}/{dataset_name}`, e.g., `johndoe/productivity-tasks` 195 | - **Visibility**: Choose `public` to contribute to the community or `private` for personal use 196 | - **License**: Standard licenses like CC-BY or MIT are recommended for public datasets 197 | 198 | ### The Upload Process 199 | 200 | 1. **Click "Upload to Hugging Face"**: This initiates the upload preparation. 201 | 202 | 2. **Review Dataset Summary**: Confirm the number of demonstrations and total size. 203 | 204 | 3. **Confirm Upload**: The UI will show progress as files are transferred. 205 | 206 | 4. **Receive Confirmation**: Once complete, you'll see a link to your new dataset on Hugging Face. 207 | 208 | <video src="https://github.com/user-attachments/assets/c586d460-3877-4b5f-a736-3248886d2134" controls width="600"></video> 209 | 210 | Your uploaded dataset will have a standardized format with the following structure: 211 | 212 | ```json 213 | { 214 | "timestamp": "2025-05-01T09:20:40.594878", 215 | "session_id": "1fe9f0fe-9331-4078-aacd-ec7ffb483b86", 216 | "name": "penguin lemon forest", 217 | "tool_calls": [...], // Detailed interaction records 218 | "messages": [...], // User/assistant messages 219 | "tags": ["highquality", "tasks"], 220 | "images": [...] // Screenshots of each state 221 | } 222 | ``` 223 | 224 | This structured format makes it easy for researchers to analyze patterns across different demonstrations and build better computer-use models. 225 | 226 | ```python 227 | from computer import Computer 228 | 229 | computer = Computer(os_type="macos", display="1024x768", memory="8GB", cpu="4") 230 | try: 231 | await computer.run() 232 | 233 | screenshot = await computer.interface.screenshot() 234 | with open("screenshot.png", "wb") as f: 235 | f.write(screenshot) 236 | 237 | await computer.interface.move_cursor(100, 100) 238 | await computer.interface.left_click() 239 | await computer.interface.right_click(300, 300) 240 | await computer.interface.double_click(400, 400) 241 | 242 | await computer.interface.type("Hello, World!") 243 | await computer.interface.press_key("enter") 244 | 245 | await computer.interface.set_clipboard("Test clipboard") 246 | content = await computer.interface.copy_to_clipboard() 247 | print(f"Clipboard content: {content}") 248 | finally: 249 | await computer.stop() 250 | ``` 251 | 252 | ## Example: Shopping List Demonstration 253 | 254 | Let's walk through a concrete example of creating a valuable demonstration: 255 | 256 | ### Task: Adding Shopping List Items to a Doordash Cart 257 | 258 | 1. **Start Recording**: Begin with a clean desktop and a text file containing a shopping list. 259 | 260 | 2. **Task Execution**: Open the file, read the list, open Safari, navigate to Doordash, and add each item to the cart. 261 | 262 | 3. 
**Narration**: Add messages like "Reading the shopping list" and "Searching for rice on Doordash" to provide context. 263 | 264 | 4. **Completion**: Verify all items are in the cart and end the recording. 265 | 266 | 5. **Tagging**: Add tags like `shopping`, `web-browsing`, `task-completion`, and `multi-step`. 267 | 268 | This type of demonstration is particularly valuable because it showcases real-world task completion requiring multiple applications and context switching. 269 | 270 | ### Exploring Community Datasets 271 | 272 | You can also learn from existing trajectory datasets contributed by the community: 273 | 274 | 1. Visit [Hugging Face Datasets tagged with 'cua'](https://huggingface.co/datasets?other=cua) 275 | 2. Explore different approaches to similar tasks 276 | 3. Download and analyze high-quality demonstrations 277 | 278 | ## Conclusion 279 | 280 | ### Summary 281 | 282 | In this guide, we've covered how to: 283 | - Set up the Computer-Use Interface with Gradio UI 284 | - Record high-quality human demonstrations 285 | - Organize and tag your trajectories 286 | - Share your datasets with the community 287 | 288 | By contributing your own demonstrations, you're helping to build more capable, human-like AI systems that can understand and execute complex computer tasks. 289 | 290 | ### Next Steps 291 | 292 | Now that you know how to create and share trajectories, consider these advanced techniques: 293 | 294 | - Create themed collections around specific productivity workflows 295 | - Collaborate with others to build comprehensive datasets 296 | - Use your datasets to fine-tune your own computer-use models 297 | 298 | ### Resources 299 | 300 | - [Computer-Use Interface GitHub](https://github.com/trycua/cua/tree/main/libs/computer) 301 | - [Hugging Face Datasets Documentation](https://huggingface.co/docs/datasets) 302 | - [Example Dataset: ddupont/test-dataset](https://huggingface.co/datasets/ddupont/test-dataset) 303 | ``` -------------------------------------------------------------------------------- /libs/python/pylume/pylume/pylume.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import sys 3 | import json 4 | import time 5 | import asyncio 6 | import subprocess 7 | from typing import Optional, List, Union, Callable, TypeVar, Any 8 | from functools import wraps 9 | import re 10 | import signal 11 | 12 | from .server import LumeServer 13 | from .client import LumeClient 14 | from .models import ( 15 | VMConfig, 16 | VMStatus, 17 | VMRunOpts, 18 | VMUpdateOpts, 19 | ImageRef, 20 | CloneSpec, 21 | SharedDirectory, 22 | ImageList, 23 | ) 24 | from .exceptions import ( 25 | LumeError, 26 | LumeServerError, 27 | LumeConnectionError, 28 | LumeTimeoutError, 29 | LumeNotFoundError, 30 | LumeConfigError, 31 | LumeVMError, 32 | LumeImageError, 33 | ) 34 | 35 | # Type variable for the decorator 36 | T = TypeVar("T") 37 | 38 | 39 | def ensure_server(func: Callable[..., T]) -> Callable[..., T]: 40 | """Decorator to ensure server is running before executing the method.""" 41 | 42 | @wraps(func) 43 | async def wrapper(self: "PyLume", *args: Any, **kwargs: Any) -> T: 44 | # ensure_running is an async method, so we need to await it 45 | await self.server.ensure_running() 46 | # Initialize client if needed 47 | await self._init_client() 48 | return await func(self, *args, **kwargs) # type: ignore 49 | 50 | return wrapper # type: ignore 51 | 52 | 53 | class PyLume: 54 | def __init__( 55 | self, 56 | debug: bool = False, 57 | 
server_start_timeout: int = 60, 58 | port: Optional[int] = None, 59 | use_existing_server: bool = False, 60 | host: str = "localhost", 61 | ): 62 | """Initialize the async PyLume client. 63 | 64 | Args: 65 | debug: Enable debug logging 66 | auto_start_server: Whether to automatically start the lume server if not running 67 | server_start_timeout: Timeout in seconds to wait for server to start 68 | port: Port number for the lume server. Required when use_existing_server is True. 69 | use_existing_server: If True, will try to connect to an existing server on the specified port 70 | instead of starting a new one. 71 | host: Host to use for connections (e.g., "localhost", "127.0.0.1", "host.docker.internal") 72 | """ 73 | if use_existing_server and port is None: 74 | raise LumeConfigError("Port must be specified when using an existing server") 75 | 76 | self.server = LumeServer( 77 | debug=debug, 78 | server_start_timeout=server_start_timeout, 79 | port=port, 80 | use_existing_server=use_existing_server, 81 | host=host, 82 | ) 83 | self.client = None 84 | 85 | async def __aenter__(self) -> "PyLume": 86 | """Async context manager entry.""" 87 | if self.server.use_existing_server: 88 | # Just ensure base_url is set for existing server 89 | if self.server.requested_port is None: 90 | raise LumeConfigError("Port must be specified when using an existing server") 91 | 92 | if not self.server.base_url: 93 | self.server.port = self.server.requested_port 94 | self.server.base_url = f"http://{self.server.host}:{self.server.port}/lume" 95 | 96 | # Ensure the server is running (will connect to existing or start new as needed) 97 | await self.server.ensure_running() 98 | 99 | # Initialize the client 100 | await self._init_client() 101 | return self 102 | 103 | async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: 104 | """Async context manager exit.""" 105 | if self.client is not None: 106 | await self.client.close() 107 | await self.server.stop() 108 | 109 | async def _init_client(self) -> None: 110 | """Initialize the client if not already initialized.""" 111 | if self.client is None: 112 | if self.server.base_url is None: 113 | raise RuntimeError("Server base URL not set") 114 | self.client = LumeClient(self.server.base_url, debug=self.server.debug) 115 | 116 | def _log_debug(self, message: str, **kwargs) -> None: 117 | """Log debug information if debug mode is enabled.""" 118 | if self.server.debug: 119 | print(f"DEBUG: {message}") 120 | if kwargs: 121 | print(json.dumps(kwargs, indent=2)) 122 | 123 | async def _handle_api_error(self, e: Exception, operation: str) -> None: 124 | """Handle API errors and raise appropriate custom exceptions.""" 125 | if isinstance(e, subprocess.SubprocessError): 126 | raise LumeConnectionError(f"Failed to connect to PyLume server: {str(e)}") 127 | elif isinstance(e, asyncio.TimeoutError): 128 | raise LumeTimeoutError(f"Request timed out: {str(e)}") 129 | 130 | if not hasattr(e, "status") and not isinstance(e, subprocess.CalledProcessError): 131 | raise LumeServerError(f"Unknown error during {operation}: {str(e)}") 132 | 133 | status_code = getattr(e, "status", 500) 134 | response_text = str(e) 135 | 136 | self._log_debug( 137 | f"{operation} request failed", status_code=status_code, response_text=response_text 138 | ) 139 | 140 | if status_code == 404: 141 | raise LumeNotFoundError(f"Resource not found during {operation}") 142 | elif status_code == 400: 143 | raise LumeConfigError(f"Invalid configuration for {operation}: {response_text}") 144 | elif status_code 
>= 500: 145 | raise LumeServerError( 146 | f"Server error during {operation}", 147 | status_code=status_code, 148 | response_text=response_text, 149 | ) 150 | else: 151 | raise LumeServerError( 152 | f"Error during {operation}", status_code=status_code, response_text=response_text 153 | ) 154 | 155 | async def _read_output(self) -> None: 156 | """Read and log server output.""" 157 | try: 158 | while True: 159 | if not self.server.server_process or self.server.server_process.poll() is not None: 160 | self._log_debug("Server process ended") 161 | break 162 | 163 | # Read stdout without blocking 164 | if self.server.server_process.stdout: 165 | while True: 166 | line = self.server.server_process.stdout.readline() 167 | if not line: 168 | break 169 | line = line.strip() 170 | self._log_debug(f"Server stdout: {line}") 171 | if "Server started" in line.decode("utf-8"): 172 | self._log_debug("Detected server started message") 173 | return 174 | 175 | # Read stderr without blocking 176 | if self.server.server_process.stderr: 177 | while True: 178 | line = self.server.server_process.stderr.readline() 179 | if not line: 180 | break 181 | line = line.strip() 182 | self._log_debug(f"Server stderr: {line}") 183 | if "error" in line.decode("utf-8").lower(): 184 | raise RuntimeError(f"Server error: {line}") 185 | 186 | await asyncio.sleep(0.1) # Small delay to prevent CPU spinning 187 | except Exception as e: 188 | self._log_debug(f"Error in output reader: {str(e)}") 189 | raise 190 | 191 | @ensure_server 192 | async def create_vm(self, spec: Union[VMConfig, dict]) -> None: 193 | """Create a VM with the given configuration.""" 194 | # Ensure client is initialized 195 | await self._init_client() 196 | 197 | if isinstance(spec, VMConfig): 198 | spec = spec.model_dump(by_alias=True, exclude_none=True) 199 | 200 | # Suppress optional attribute access errors 201 | self.client.print_curl("POST", "/vms", spec) # type: ignore[attr-defined] 202 | await self.client.post("/vms", spec) # type: ignore[attr-defined] 203 | 204 | @ensure_server 205 | async def run_vm(self, name: str, opts: Optional[Union[VMRunOpts, dict]] = None) -> None: 206 | """Run a VM.""" 207 | if opts is None: 208 | opts = VMRunOpts(no_display=False) # type: ignore[attr-defined] 209 | elif isinstance(opts, dict): 210 | opts = VMRunOpts(**opts) 211 | 212 | payload = opts.model_dump(by_alias=True, exclude_none=True) 213 | self.client.print_curl("POST", f"/vms/{name}/run", payload) # type: ignore[attr-defined] 214 | await self.client.post(f"/vms/{name}/run", payload) # type: ignore[attr-defined] 215 | 216 | @ensure_server 217 | async def list_vms(self) -> List[VMStatus]: 218 | """List all VMs.""" 219 | data = await self.client.get("/vms") # type: ignore[attr-defined] 220 | return [VMStatus.model_validate(vm) for vm in data] 221 | 222 | @ensure_server 223 | async def get_vm(self, name: str) -> VMStatus: 224 | """Get VM details.""" 225 | data = await self.client.get(f"/vms/{name}") # type: ignore[attr-defined] 226 | return VMStatus.model_validate(data) 227 | 228 | @ensure_server 229 | async def update_vm(self, name: str, params: Union[VMUpdateOpts, dict]) -> None: 230 | """Update VM settings.""" 231 | if isinstance(params, dict): 232 | params = VMUpdateOpts(**params) 233 | 234 | payload = params.model_dump(by_alias=True, exclude_none=True) 235 | self.client.print_curl("PATCH", f"/vms/{name}", payload) # type: ignore[attr-defined] 236 | await self.client.patch(f"/vms/{name}", payload) # type: ignore[attr-defined] 237 | 238 | @ensure_server 239 | async 
def stop_vm(self, name: str) -> None: 240 | """Stop a VM.""" 241 | await self.client.post(f"/vms/{name}/stop") # type: ignore[attr-defined] 242 | 243 | @ensure_server 244 | async def delete_vm(self, name: str) -> None: 245 | """Delete a VM.""" 246 | await self.client.delete(f"/vms/{name}") # type: ignore[attr-defined] 247 | 248 | @ensure_server 249 | async def pull_image( 250 | self, spec: Union[ImageRef, dict, str], name: Optional[str] = None 251 | ) -> None: 252 | """Pull a VM image.""" 253 | await self._init_client() 254 | if isinstance(spec, str): 255 | if ":" in spec: 256 | image_str = spec 257 | else: 258 | image_str = f"{spec}:latest" 259 | registry = "ghcr.io" 260 | organization = "trycua" 261 | elif isinstance(spec, dict): 262 | image = spec.get("image", "") 263 | tag = spec.get("tag", "latest") 264 | image_str = f"{image}:{tag}" 265 | registry = spec.get("registry", "ghcr.io") 266 | organization = spec.get("organization", "trycua") 267 | else: 268 | image_str = f"{spec.image}:{spec.tag}" 269 | registry = spec.registry 270 | organization = spec.organization 271 | 272 | payload = { 273 | "image": image_str, 274 | "name": name, 275 | "registry": registry, 276 | "organization": organization, 277 | } 278 | 279 | self.client.print_curl("POST", "/pull", payload) # type: ignore[attr-defined] 280 | await self.client.post("/pull", payload, timeout=300.0) # type: ignore[attr-defined] 281 | 282 | @ensure_server 283 | async def clone_vm(self, name: str, new_name: str) -> None: 284 | """Clone a VM with the given name to a new VM with new_name.""" 285 | config = CloneSpec(name=name, newName=new_name) 286 | self.client.print_curl("POST", "/vms/clone", config.model_dump()) # type: ignore[attr-defined] 287 | await self.client.post("/vms/clone", config.model_dump()) # type: ignore[attr-defined] 288 | 289 | @ensure_server 290 | async def get_latest_ipsw_url(self) -> str: 291 | """Get the latest IPSW URL.""" 292 | await self._init_client() 293 | data = await self.client.get("/ipsw") # type: ignore[attr-defined] 294 | return data["url"] 295 | 296 | @ensure_server 297 | async def get_images(self, organization: Optional[str] = None) -> ImageList: 298 | """Get list of available images.""" 299 | await self._init_client() 300 | params = {"organization": organization} if organization else None 301 | data = await self.client.get("/images", params) # type: ignore[attr-defined] 302 | return ImageList(root=data) 303 | 304 | async def close(self) -> None: 305 | """Close the client and stop the server.""" 306 | if self.client is not None: 307 | await self.client.close() 308 | self.client = None 309 | await asyncio.sleep(1) 310 | await self.server.stop() 311 | 312 | async def _ensure_client(self) -> None: 313 | """Ensure client is initialized.""" 314 | if self.client is None: 315 | await self._init_client() 316 | ``` -------------------------------------------------------------------------------- /libs/python/mcp-server/mcp_server/session_manager.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Session Manager for MCP Server - Handles concurrent client sessions with proper resource isolation. 
3 | 4 | This module provides: 5 | - Per-session computer instance management 6 | - Resource pooling and lifecycle management 7 | - Graceful session cleanup 8 | - Concurrent task execution support 9 | """ 10 | 11 | import asyncio 12 | import logging 13 | import time 14 | import uuid 15 | from typing import Dict, Optional, Any, List, Set 16 | from dataclasses import dataclass, field 17 | from contextlib import asynccontextmanager 18 | import weakref 19 | 20 | logger = logging.getLogger("mcp-server.session_manager") 21 | 22 | @dataclass 23 | class SessionInfo: 24 | """Information about an active session.""" 25 | session_id: str 26 | computer: Any # Computer instance 27 | created_at: float 28 | last_activity: float 29 | active_tasks: Set[str] = field(default_factory=set) 30 | is_shutting_down: bool = False 31 | 32 | class ComputerPool: 33 | """Pool of computer instances for efficient resource management.""" 34 | 35 | def __init__(self, max_size: int = 5, idle_timeout: float = 300.0): 36 | self.max_size = max_size 37 | self.idle_timeout = idle_timeout 38 | self._available: List[Any] = [] 39 | self._in_use: Set[Any] = set() 40 | self._creation_lock = asyncio.Lock() 41 | 42 | async def acquire(self) -> Any: 43 | """Acquire a computer instance from the pool.""" 44 | # Try to get an available instance 45 | if self._available: 46 | computer = self._available.pop() 47 | self._in_use.add(computer) 48 | logger.debug(f"Reusing computer instance from pool") 49 | return computer 50 | 51 | # Check if we can create a new one 52 | async with self._creation_lock: 53 | if len(self._in_use) < self.max_size: 54 | logger.debug("Creating new computer instance") 55 | from computer import Computer 56 | computer = Computer(verbosity=logging.INFO) 57 | await computer.run() 58 | self._in_use.add(computer) 59 | return computer 60 | 61 | # Wait for an instance to become available 62 | logger.debug("Waiting for computer instance to become available") 63 | while not self._available: 64 | await asyncio.sleep(0.1) 65 | 66 | computer = self._available.pop() 67 | self._in_use.add(computer) 68 | return computer 69 | 70 | async def release(self, computer: Any) -> None: 71 | """Release a computer instance back to the pool.""" 72 | if computer in self._in_use: 73 | self._in_use.remove(computer) 74 | self._available.append(computer) 75 | logger.debug("Released computer instance back to pool") 76 | 77 | async def cleanup_idle(self) -> None: 78 | """Clean up idle computer instances.""" 79 | current_time = time.time() 80 | idle_instances = [] 81 | 82 | for computer in self._available[:]: 83 | # Check if computer has been idle too long 84 | # Note: We'd need to track last use time per instance for this 85 | # For now, we'll keep instances in the pool 86 | pass 87 | 88 | async def shutdown(self) -> None: 89 | """Shutdown all computer instances in the pool.""" 90 | logger.info("Shutting down computer pool") 91 | 92 | # Close all available instances 93 | for computer in self._available: 94 | try: 95 | if hasattr(computer, 'close'): 96 | await computer.close() 97 | elif hasattr(computer, 'stop'): 98 | await computer.stop() 99 | except Exception as e: 100 | logger.warning(f"Error closing computer instance: {e}") 101 | 102 | # Close all in-use instances 103 | for computer in self._in_use: 104 | try: 105 | if hasattr(computer, 'close'): 106 | await computer.close() 107 | elif hasattr(computer, 'stop'): 108 | await computer.stop() 109 | except Exception as e: 110 | logger.warning(f"Error closing computer instance: {e}") 111 | 112 | 
self._available.clear() 113 | self._in_use.clear() 114 | 115 | class SessionManager: 116 | """Manages concurrent client sessions with proper resource isolation.""" 117 | 118 | def __init__(self, max_concurrent_sessions: int = 10): 119 | self.max_concurrent_sessions = max_concurrent_sessions 120 | self._sessions: Dict[str, SessionInfo] = {} 121 | self._computer_pool = ComputerPool() 122 | self._session_lock = asyncio.Lock() 123 | self._cleanup_task: Optional[asyncio.Task] = None 124 | self._shutdown_event = asyncio.Event() 125 | 126 | async def start(self) -> None: 127 | """Start the session manager and cleanup task.""" 128 | logger.info("Starting session manager") 129 | self._cleanup_task = asyncio.create_task(self._cleanup_loop()) 130 | 131 | async def stop(self) -> None: 132 | """Stop the session manager and cleanup all resources.""" 133 | logger.info("Stopping session manager") 134 | self._shutdown_event.set() 135 | 136 | if self._cleanup_task: 137 | self._cleanup_task.cancel() 138 | try: 139 | await self._cleanup_task 140 | except asyncio.CancelledError: 141 | pass 142 | 143 | # Force cleanup all sessions 144 | async with self._session_lock: 145 | session_ids = list(self._sessions.keys()) 146 | 147 | for session_id in session_ids: 148 | await self._force_cleanup_session(session_id) 149 | 150 | await self._computer_pool.shutdown() 151 | 152 | @asynccontextmanager 153 | async def get_session(self, session_id: Optional[str] = None) -> Any: 154 | """Get or create a session with proper resource management.""" 155 | if session_id is None: 156 | session_id = str(uuid.uuid4()) 157 | 158 | # Check if session exists and is not shutting down 159 | async with self._session_lock: 160 | if session_id in self._sessions: 161 | session = self._sessions[session_id] 162 | if session.is_shutting_down: 163 | raise RuntimeError(f"Session {session_id} is shutting down") 164 | session.last_activity = time.time() 165 | computer = session.computer 166 | else: 167 | # Create new session 168 | if len(self._sessions) >= self.max_concurrent_sessions: 169 | raise RuntimeError(f"Maximum concurrent sessions ({self.max_concurrent_sessions}) reached") 170 | 171 | computer = await self._computer_pool.acquire() 172 | session = SessionInfo( 173 | session_id=session_id, 174 | computer=computer, 175 | created_at=time.time(), 176 | last_activity=time.time() 177 | ) 178 | self._sessions[session_id] = session 179 | logger.info(f"Created new session: {session_id}") 180 | 181 | try: 182 | yield session 183 | finally: 184 | # Update last activity 185 | async with self._session_lock: 186 | if session_id in self._sessions: 187 | self._sessions[session_id].last_activity = time.time() 188 | 189 | async def register_task(self, session_id: str, task_id: str) -> None: 190 | """Register a task for a session.""" 191 | async with self._session_lock: 192 | if session_id in self._sessions: 193 | self._sessions[session_id].active_tasks.add(task_id) 194 | logger.debug(f"Registered task {task_id} for session {session_id}") 195 | 196 | async def unregister_task(self, session_id: str, task_id: str) -> None: 197 | """Unregister a task from a session.""" 198 | async with self._session_lock: 199 | if session_id in self._sessions: 200 | self._sessions[session_id].active_tasks.discard(task_id) 201 | logger.debug(f"Unregistered task {task_id} from session {session_id}") 202 | 203 | async def cleanup_session(self, session_id: str) -> None: 204 | """Cleanup a specific session.""" 205 | async with self._session_lock: 206 | if session_id not in 
self._sessions: 207 | return 208 | 209 | session = self._sessions[session_id] 210 | 211 | # Check if session has active tasks 212 | if session.active_tasks: 213 | logger.info(f"Session {session_id} has active tasks, marking for shutdown") 214 | session.is_shutting_down = True 215 | return 216 | 217 | # Actually cleanup the session 218 | await self._force_cleanup_session(session_id) 219 | 220 | async def _force_cleanup_session(self, session_id: str) -> None: 221 | """Force cleanup a session regardless of active tasks.""" 222 | async with self._session_lock: 223 | if session_id not in self._sessions: 224 | return 225 | 226 | session = self._sessions[session_id] 227 | logger.info(f"Cleaning up session: {session_id}") 228 | 229 | # Release computer back to pool 230 | await self._computer_pool.release(session.computer) 231 | 232 | # Remove session 233 | del self._sessions[session_id] 234 | 235 | async def _cleanup_loop(self) -> None: 236 | """Background task to cleanup idle sessions.""" 237 | while not self._shutdown_event.is_set(): 238 | try: 239 | await asyncio.sleep(60) # Run cleanup every minute 240 | 241 | current_time = time.time() 242 | idle_timeout = 600.0 # 10 minutes 243 | 244 | async with self._session_lock: 245 | idle_sessions = [] 246 | for session_id, session in self._sessions.items(): 247 | if not session.is_shutting_down and not session.active_tasks: 248 | if current_time - session.last_activity > idle_timeout: 249 | idle_sessions.append(session_id) 250 | 251 | # Cleanup idle sessions 252 | for session_id in idle_sessions: 253 | await self._force_cleanup_session(session_id) 254 | logger.info(f"Cleaned up idle session: {session_id}") 255 | 256 | except asyncio.CancelledError: 257 | break 258 | except Exception as e: 259 | logger.error(f"Error in cleanup loop: {e}") 260 | 261 | def get_session_stats(self) -> Dict[str, Any]: 262 | """Get statistics about active sessions.""" 263 | async def _get_stats(): 264 | async with self._session_lock: 265 | return { 266 | "total_sessions": len(self._sessions), 267 | "max_concurrent": self.max_concurrent_sessions, 268 | "sessions": { 269 | session_id: { 270 | "created_at": session.created_at, 271 | "last_activity": session.last_activity, 272 | "active_tasks": len(session.active_tasks), 273 | "is_shutting_down": session.is_shutting_down 274 | } 275 | for session_id, session in self._sessions.items() 276 | } 277 | } 278 | 279 | # Run in current event loop or create new one 280 | try: 281 | loop = asyncio.get_running_loop() 282 | return asyncio.run_coroutine_threadsafe(_get_stats(), loop).result() 283 | except RuntimeError: 284 | # No event loop running, create a new one 285 | return asyncio.run(_get_stats()) 286 | 287 | # Global session manager instance 288 | _session_manager: Optional[SessionManager] = None 289 | 290 | def get_session_manager() -> SessionManager: 291 | """Get the global session manager instance.""" 292 | global _session_manager 293 | if _session_manager is None: 294 | _session_manager = SessionManager() 295 | return _session_manager 296 | 297 | async def initialize_session_manager() -> None: 298 | """Initialize the global session manager.""" 299 | global _session_manager 300 | if _session_manager is None: 301 | _session_manager = SessionManager() 302 | await _session_manager.start() 303 | return _session_manager 304 | 305 | async def shutdown_session_manager() -> None: 306 | """Shutdown the global session manager.""" 307 | global _session_manager 308 | if _session_manager is not None: 309 | await _session_manager.stop() 310 | 
_session_manager = None 311 | ``` -------------------------------------------------------------------------------- /.github/workflows/pypi-reusable-publish.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: Reusable Package Publish Workflow 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | package_name: 7 | description: "Name of the package (e.g. pylume, computer, agent)" 8 | required: true 9 | type: string 10 | package_dir: 11 | description: "Directory containing the package relative to workspace root (e.g. libs/python/pylume)" 12 | required: true 13 | type: string 14 | version: 15 | description: "Version to publish" 16 | required: true 17 | type: string 18 | is_lume_package: 19 | description: "Whether this package includes the lume binary" 20 | required: false 21 | type: boolean 22 | default: false 23 | base_package_name: 24 | description: "PyPI package name (e.g. pylume, cua-agent)" 25 | required: true 26 | type: string 27 | make_latest: 28 | description: "Whether to mark this release as latest (should only be true for lume)" 29 | required: false 30 | type: boolean 31 | default: false 32 | secrets: 33 | PYPI_TOKEN: 34 | required: true 35 | outputs: 36 | version: 37 | description: "The version that was published" 38 | value: ${{ jobs.build-and-publish.outputs.version }} 39 | 40 | jobs: 41 | build-and-publish: 42 | runs-on: macos-latest 43 | permissions: 44 | contents: write # This permission is needed for creating releases 45 | outputs: 46 | version: ${{ steps.set-version.outputs.version }} 47 | steps: 48 | - uses: actions/checkout@v4 49 | with: 50 | fetch-depth: 0 # Full history for release creation 51 | 52 | - name: Set up Python 53 | uses: actions/setup-python@v4 54 | with: 55 | python-version: "3.11" 56 | 57 | - name: Create root pdm.lock file 58 | run: | 59 | # Create an empty pdm.lock file in the root 60 | touch pdm.lock 61 | 62 | - name: Install PDM 63 | uses: pdm-project/setup-pdm@v3 64 | with: 65 | python-version: "3.11" 66 | cache: true 67 | 68 | - name: Set version 69 | id: set-version 70 | run: | 71 | echo "VERSION=${{ inputs.version }}" >> $GITHUB_ENV 72 | echo "version=${{ inputs.version }}" >> $GITHUB_OUTPUT 73 | 74 | - name: Verify version consistency 75 | run: | 76 | # Install toml parser 77 | pip install toml 78 | 79 | # Verify version matches using script (exits with error if mismatch) 80 | python ${GITHUB_WORKSPACE}/.github/scripts/get_pyproject_version.py \ 81 | ${{ inputs.package_dir }}/pyproject.toml \ 82 | ${{ inputs.version }} 83 | 84 | - name: Initialize PDM in package directory 85 | run: | 86 | # Make sure we're working with a properly initialized PDM project 87 | cd ${{ inputs.package_dir }} 88 | 89 | # Create pdm.lock if it doesn't exist 90 | if [ ! -f "pdm.lock" ]; then 91 | echo "No pdm.lock found, initializing PDM project..." 92 | pdm lock 93 | fi 94 | 95 | # Conditional step for lume binary download (only for pylume package) 96 | - name: Download and setup lume binary 97 | if: inputs.is_lume_package 98 | run: | 99 | # Create a temporary directory for extraction 100 | mkdir -p temp_lume 101 | 102 | # Download the latest lume release directly 103 | echo "Downloading latest lume version..." 
104 | curl -sL "https://github.com/trycua/lume/releases/latest/download/lume.tar.gz" -o temp_lume/lume.tar.gz 105 | 106 | # Extract the tar file (ignore ownership and suppress warnings) 107 | cd temp_lume && tar --no-same-owner -xzf lume.tar.gz 108 | 109 | # Make the binary executable 110 | chmod +x lume 111 | 112 | # Copy the lume binary to the correct location in the pylume package 113 | mkdir -p "${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pylume" 114 | cp lume "${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pylume/lume" 115 | 116 | # Verify the binary exists and is executable 117 | test -x "${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pylume/lume" || { echo "lume binary not found or not executable"; exit 1; } 118 | 119 | # Get the version from the downloaded binary for reference 120 | LUME_VERSION=$(./lume --version | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' || echo "unknown") 121 | echo "Using lume version: $LUME_VERSION" 122 | 123 | # Cleanup 124 | cd "${GITHUB_WORKSPACE}" && rm -rf temp_lume 125 | 126 | # Save the lume version for reference 127 | echo "LUME_VERSION=${LUME_VERSION}" >> $GITHUB_ENV 128 | 129 | - name: Build and publish 130 | env: 131 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 132 | run: | 133 | cd ${{ inputs.package_dir }} 134 | # Build with PDM 135 | pdm build 136 | 137 | # For pylume package, verify the binary is in the wheel 138 | if [ "${{ inputs.is_lume_package }}" = "true" ]; then 139 | python -m pip install wheel 140 | wheel unpack dist/*.whl --dest temp_wheel 141 | echo "Listing contents of wheel directory:" 142 | find temp_wheel -type f 143 | test -f temp_wheel/pylume-*/pylume/lume || { echo "lume binary not found in wheel"; exit 1; } 144 | rm -rf temp_wheel 145 | echo "Publishing ${{ inputs.base_package_name }} ${VERSION} with lume ${LUME_VERSION}" 146 | else 147 | echo "Publishing ${{ inputs.base_package_name }} ${VERSION}" 148 | fi 149 | 150 | # Install and use twine directly instead of PDM publish 151 | echo "Installing twine for direct publishing..." 152 | pip install twine 153 | 154 | echo "Publishing to PyPI using twine..." 155 | TWINE_USERNAME="__token__" TWINE_PASSWORD="$PYPI_TOKEN" python -m twine upload dist/* 156 | 157 | # Save the wheel file path for the release 158 | WHEEL_FILE=$(ls dist/*.whl | head -1) 159 | echo "WHEEL_FILE=${WHEEL_FILE}" >> $GITHUB_ENV 160 | 161 | - name: Prepare Simple Release Notes 162 | if: startsWith(github.ref, 'refs/tags/') 163 | run: | 164 | # Create release notes based on package type 165 | echo "# ${{ inputs.base_package_name }} v${VERSION}" > release_notes.md 166 | echo "" >> release_notes.md 167 | 168 | if [ "${{ inputs.package_name }}" = "pylume" ]; then 169 | echo "## Python SDK for lume - run macOS and Linux VMs on Apple Silicon" >> release_notes.md 170 | echo "" >> release_notes.md 171 | echo "This package provides Python bindings for the lume virtualization tool." 
>> release_notes.md 172 | echo "" >> release_notes.md 173 | echo "## Dependencies" >> release_notes.md 174 | echo "* lume binary: v${LUME_VERSION}" >> release_notes.md 175 | elif [ "${{ inputs.package_name }}" = "computer" ]; then 176 | echo "## Computer control library for the Computer Universal Automation (CUA) project" >> release_notes.md 177 | echo "" >> release_notes.md 178 | echo "## Dependencies" >> release_notes.md 179 | echo "* pylume: ${PYLUME_VERSION:-latest}" >> release_notes.md 180 | elif [ "${{ inputs.package_name }}" = "agent" ]; then 181 | echo "## Dependencies" >> release_notes.md 182 | echo "* cua-computer: ${COMPUTER_VERSION:-latest}" >> release_notes.md 183 | echo "* cua-som: ${SOM_VERSION:-latest}" >> release_notes.md 184 | echo "" >> release_notes.md 185 | echo "## Installation Options" >> release_notes.md 186 | echo "" >> release_notes.md 187 | echo "### Basic installation with Anthropic" >> release_notes.md 188 | echo '```bash' >> release_notes.md 189 | echo "pip install cua-agent[anthropic]==${VERSION}" >> release_notes.md 190 | echo '```' >> release_notes.md 191 | echo "" >> release_notes.md 192 | echo "### With SOM (recommended)" >> release_notes.md 193 | echo '```bash' >> release_notes.md 194 | echo "pip install cua-agent[som]==${VERSION}" >> release_notes.md 195 | echo '```' >> release_notes.md 196 | echo "" >> release_notes.md 197 | echo "### All features" >> release_notes.md 198 | echo '```bash' >> release_notes.md 199 | echo "pip install cua-agent[all]==${VERSION}" >> release_notes.md 200 | echo '```' >> release_notes.md 201 | elif [ "${{ inputs.package_name }}" = "som" ]; then 202 | echo "## Computer Vision and OCR library for detecting and analyzing UI elements" >> release_notes.md 203 | echo "" >> release_notes.md 204 | echo "This package provides enhanced UI understanding capabilities through computer vision and OCR." >> release_notes.md 205 | elif [ "${{ inputs.package_name }}" = "computer-server" ]; then 206 | echo "## Computer Server for the Computer Universal Automation (CUA) project" >> release_notes.md 207 | echo "" >> release_notes.md 208 | echo "A FastAPI-based server implementation for computer control." >> release_notes.md 209 | echo "" >> release_notes.md 210 | echo "## Dependencies" >> release_notes.md 211 | echo "* cua-computer: ${COMPUTER_VERSION:-latest}" >> release_notes.md 212 | echo "" >> release_notes.md 213 | echo "## Usage" >> release_notes.md 214 | echo '```bash' >> release_notes.md 215 | echo "# Run the server" >> release_notes.md 216 | echo "cua-computer-server" >> release_notes.md 217 | echo '```' >> release_notes.md 218 | elif [ "${{ inputs.package_name }}" = "mcp-server" ]; then 219 | echo "## MCP Server for the Computer-Use Agent (CUA)" >> release_notes.md 220 | echo "" >> release_notes.md 221 | echo "This package provides MCP (Model Context Protocol) integration for CUA agents, allowing them to be used with Claude Desktop, Cursor, and other MCP clients." 
>> release_notes.md 222 | echo "" >> release_notes.md 223 | echo "## Dependencies" >> release_notes.md 224 | echo "* cua-computer: ${COMPUTER_VERSION:-latest}" >> release_notes.md 225 | echo "* cua-agent: ${AGENT_VERSION:-latest}" >> release_notes.md 226 | echo "" >> release_notes.md 227 | echo "## Usage" >> release_notes.md 228 | echo '```bash' >> release_notes.md 229 | echo "# Run the MCP server directly" >> release_notes.md 230 | echo "cua-mcp-server" >> release_notes.md 231 | echo '```' >> release_notes.md 232 | echo "" >> release_notes.md 233 | echo "## Claude Desktop Integration" >> release_notes.md 234 | echo "Add to your Claude Desktop configuration (~/.config/claude-desktop/claude_desktop_config.json or OS-specific location):" >> release_notes.md 235 | echo '```json' >> release_notes.md 236 | echo '"mcpServers": {' >> release_notes.md 237 | echo ' "cua-agent": {' >> release_notes.md 238 | echo ' "command": "cua-mcp-server",' >> release_notes.md 239 | echo ' "args": [],' >> release_notes.md 240 | echo ' "env": {' >> release_notes.md 241 | echo ' "CUA_AGENT_LOOP": "OMNI",' >> release_notes.md 242 | echo ' "CUA_MODEL_PROVIDER": "ANTHROPIC",' >> release_notes.md 243 | echo ' "CUA_MODEL_NAME": "claude-3-opus-20240229",' >> release_notes.md 244 | echo ' "ANTHROPIC_API_KEY": "your-api-key",' >> release_notes.md 245 | echo ' "PYTHONIOENCODING": "utf-8"' >> release_notes.md 246 | echo ' }' >> release_notes.md 247 | echo ' }' >> release_notes.md 248 | echo '}' >> release_notes.md 249 | echo '```' >> release_notes.md 250 | fi 251 | 252 | # Add installation section if not agent (which has its own installation section) 253 | if [ "${{ inputs.package_name }}" != "agent" ]; then 254 | echo "" >> release_notes.md 255 | echo "## Installation" >> release_notes.md 256 | echo '```bash' >> release_notes.md 257 | echo "pip install ${{ inputs.base_package_name }}==${VERSION}" >> release_notes.md 258 | echo '```' >> release_notes.md 259 | fi 260 | 261 | echo "Release notes created:" 262 | cat release_notes.md 263 | 264 | - name: Create GitHub Release 265 | uses: softprops/action-gh-release@v2 266 | if: startsWith(github.ref, 'refs/tags/') 267 | with: 268 | name: "${{ inputs.base_package_name }} v${{ env.VERSION }}" 269 | body_path: release_notes.md 270 | files: ${{ inputs.package_dir }}/${{ env.WHEEL_FILE }} 271 | draft: false 272 | prerelease: false 273 | make_latest: ${{ inputs.package_name == 'lume' }} 274 | env: 275 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 276 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/loops/composed_grounded.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Composed-grounded agent loop implementation that combines grounding and thinking models. 3 | Uses a two-stage approach: grounding model for element detection, thinking model for reasoning. 
4 | """ 5 | 6 | import uuid 7 | import asyncio 8 | import json 9 | import base64 10 | from typing import Dict, List, Any, Optional, Tuple 11 | from io import BytesIO 12 | from PIL import Image 13 | import litellm 14 | 15 | from ..decorators import register_agent 16 | from ..types import Messages, AgentResponse, Tools, AgentCapability 17 | from ..loops.base import AsyncAgentConfig 18 | from ..responses import ( 19 | convert_computer_calls_xy2desc, 20 | convert_responses_items_to_completion_messages, 21 | convert_completion_messages_to_responses_items, 22 | convert_computer_calls_desc2xy, 23 | get_all_element_descriptions 24 | ) 25 | from ..agent import find_agent_config 26 | 27 | GROUNDED_COMPUTER_TOOL_SCHEMA = { 28 | "type": "function", 29 | "function": { 30 | "name": "computer", 31 | "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').", 32 | "parameters": { 33 | "type": "object", 34 | "properties": { 35 | "action": { 36 | "type": "string", 37 | "enum": [ 38 | "screenshot", 39 | "click", 40 | "double_click", 41 | "drag", 42 | "type", 43 | "keypress", 44 | "scroll", 45 | "move", 46 | "wait", 47 | "get_current_url", 48 | "get_dimensions", 49 | "get_environment" 50 | ], 51 | "description": "The action to perform (required for all actions)" 52 | }, 53 | "element_description": { 54 | "type": "string", 55 | "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)" 56 | }, 57 | "start_element_description": { 58 | "type": "string", 59 | "description": "Description of the element to start dragging from (required for drag action)" 60 | }, 61 | "end_element_description": { 62 | "type": "string", 63 | "description": "Description of the element to drag to (required for drag action)" 64 | }, 65 | "text": { 66 | "type": "string", 67 | "description": "The text to type (required for type action)" 68 | }, 69 | "keys": { 70 | "type": "array", 71 | "items": { 72 | "type": "string" 73 | }, 74 | "description": "Key(s) to press (required for keypress action)" 75 | }, 76 | "button": { 77 | "type": "string", 78 | "enum": [ 79 | "left", 80 | "right", 81 | "wheel", 82 | "back", 83 | "forward" 84 | ], 85 | "description": "The mouse button to use for click action (required for click and double_click action)", 86 | }, 87 | "scroll_x": { 88 | "type": "integer", 89 | "description": "Horizontal scroll amount for scroll action (required for scroll action)", 90 | }, 91 | "scroll_y": { 92 | "type": "integer", 93 | "description": "Vertical scroll amount for scroll action (required for scroll action)", 94 | }, 95 | }, 96 | "required": [ 97 | "action" 98 | ] 99 | } 100 | } 101 | } 102 | 103 | def _prepare_tools_for_grounded(tool_schemas: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 104 | """Prepare tools for grounded API format""" 105 | grounded_tools = [] 106 | 107 | for schema in tool_schemas: 108 | if schema["type"] == "computer": 109 | grounded_tools.append(GROUNDED_COMPUTER_TOOL_SCHEMA) 110 | else: 111 | grounded_tools.append(schema) 112 | 113 | return grounded_tools 114 | 115 | def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]: 116 | """Get the last computer call output image from messages.""" 117 | for message in reversed(messages): 118 | if (isinstance(message, dict) and 119 | 
message.get("type") == "computer_call_output" and 120 | isinstance(message.get("output"), dict) and 121 | message["output"].get("type") == "input_image"): 122 | image_url = message["output"].get("image_url", "") 123 | if image_url.startswith("data:image/png;base64,"): 124 | return image_url.split(",", 1)[1] 125 | return None 126 | 127 | 128 | @register_agent(r".*\+.*", priority=1) 129 | class ComposedGroundedConfig(AsyncAgentConfig): 130 | """ 131 | Composed-grounded agent configuration that uses both grounding and thinking models. 132 | 133 | The model parameter should be in format: "grounding_model+thinking_model" 134 | e.g., "huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro" 135 | """ 136 | 137 | def __init__(self): 138 | self.desc2xy: Dict[str, Tuple[float, float]] = {} 139 | 140 | async def predict_step( 141 | self, 142 | messages: List[Dict[str, Any]], 143 | model: str, 144 | tools: Optional[List[Dict[str, Any]]] = None, 145 | max_retries: Optional[int] = None, 146 | stream: bool = False, 147 | computer_handler=None, 148 | use_prompt_caching: Optional[bool] = False, 149 | _on_api_start=None, 150 | _on_api_end=None, 151 | _on_usage=None, 152 | _on_screenshot=None, 153 | **kwargs 154 | ) -> Dict[str, Any]: 155 | """ 156 | Composed-grounded predict step implementation. 157 | 158 | Process: 159 | 0. Store last computer call image, if none then take a screenshot 160 | 1. Convert computer calls from xy to descriptions 161 | 2. Convert responses items to completion messages 162 | 3. Call thinking model with litellm.acompletion 163 | 4. Convert completion messages to responses items 164 | 5. Get all element descriptions and populate desc2xy mapping 165 | 6. Convert computer calls from descriptions back to xy coordinates 166 | 7. Return output and usage 167 | """ 168 | # Parse the composed model 169 | if "+" not in model: 170 | raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}") 171 | grounding_model, thinking_model = model.split("+", 1) 172 | 173 | pre_output_items = [] 174 | 175 | # Step 0: Store last computer call image, if none then take a screenshot 176 | last_image_b64 = get_last_computer_call_image(messages) 177 | if last_image_b64 is None: 178 | # Take a screenshot 179 | screenshot_b64 = await computer_handler.screenshot() # type: ignore 180 | if screenshot_b64: 181 | 182 | call_id = uuid.uuid4().hex 183 | pre_output_items += [ 184 | { 185 | "type": "message", 186 | "role": "assistant", 187 | "content": [ 188 | { 189 | "type": "output_text", 190 | "text": "Taking a screenshot to see the current computer screen." 
191 | } 192 | ] 193 | }, 194 | { 195 | "action": { 196 | "type": "screenshot" 197 | }, 198 | "call_id": call_id, 199 | "status": "completed", 200 | "type": "computer_call" 201 | }, 202 | { 203 | "type": "computer_call_output", 204 | "call_id": call_id, 205 | "output": { 206 | "type": "input_image", 207 | "image_url": f"data:image/png;base64,{screenshot_b64}" 208 | } 209 | }, 210 | ] 211 | last_image_b64 = screenshot_b64 212 | 213 | # Call screenshot callback if provided 214 | if _on_screenshot: 215 | await _on_screenshot(screenshot_b64) 216 | 217 | tool_schemas = _prepare_tools_for_grounded(tools) # type: ignore 218 | 219 | # Step 1: Convert computer calls from xy to descriptions 220 | input_messages = messages + pre_output_items 221 | messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy) 222 | 223 | # Step 2: Convert responses items to completion messages 224 | completion_messages = convert_responses_items_to_completion_messages( 225 | messages_with_descriptions, 226 | allow_images_in_tool_results=False 227 | ) 228 | 229 | # Step 3: Call thinking model with litellm.acompletion 230 | api_kwargs = { 231 | "model": thinking_model, 232 | "messages": completion_messages, 233 | "tools": tool_schemas, 234 | "max_retries": max_retries, 235 | "stream": stream, 236 | **kwargs 237 | } 238 | 239 | if use_prompt_caching: 240 | api_kwargs["use_prompt_caching"] = use_prompt_caching 241 | 242 | # Call API start hook 243 | if _on_api_start: 244 | await _on_api_start(api_kwargs) 245 | 246 | # Make the completion call 247 | response = await litellm.acompletion(**api_kwargs) 248 | 249 | # Call API end hook 250 | if _on_api_end: 251 | await _on_api_end(api_kwargs, response) 252 | 253 | # Extract usage information 254 | usage = { 255 | **response.usage.model_dump(), # type: ignore 256 | "response_cost": response._hidden_params.get("response_cost", 0.0), 257 | } 258 | if _on_usage: 259 | await _on_usage(usage) 260 | 261 | # Step 4: Convert completion messages back to responses items format 262 | response_dict = response.model_dump() # type: ignore 263 | choice_messages = [choice["message"] for choice in response_dict["choices"]] 264 | thinking_output_items = [] 265 | 266 | for choice_message in choice_messages: 267 | thinking_output_items.extend(convert_completion_messages_to_responses_items([choice_message])) 268 | 269 | # Step 5: Get all element descriptions and populate desc2xy mapping 270 | element_descriptions = get_all_element_descriptions(thinking_output_items) 271 | 272 | if element_descriptions and last_image_b64: 273 | # Use grounding model to predict coordinates for each description 274 | grounding_agent_conf = find_agent_config(grounding_model) 275 | if grounding_agent_conf: 276 | grounding_agent = grounding_agent_conf.agent_class() 277 | 278 | for desc in element_descriptions: 279 | for _ in range(3): # try 3 times 280 | coords = await grounding_agent.predict_click( 281 | model=grounding_model, 282 | image_b64=last_image_b64, 283 | instruction=desc 284 | ) 285 | if coords: 286 | self.desc2xy[desc] = coords 287 | break 288 | 289 | # Step 6: Convert computer calls from descriptions back to xy coordinates 290 | final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy) 291 | 292 | # Step 7: Return output and usage 293 | return { 294 | "output": pre_output_items + final_output_items, 295 | "usage": usage 296 | } 297 | 298 | async def predict_click( 299 | self, 300 | model: str, 301 | image_b64: str, 302 | instruction: str, 303 | **kwargs 
304 | ) -> Optional[Tuple[int, int]]: 305 | """ 306 | Predict click coordinates using the grounding model. 307 | 308 | For composed models, uses only the grounding model part for click prediction. 309 | """ 310 | # Parse the composed model to get grounding model 311 | if "+" not in model: 312 | raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}") 313 | grounding_model, thinking_model = model.split("+", 1) 314 | 315 | # Find and use the grounding agent 316 | grounding_agent_conf = find_agent_config(grounding_model) 317 | if grounding_agent_conf: 318 | grounding_agent = grounding_agent_conf.agent_class() 319 | return await grounding_agent.predict_click( 320 | model=grounding_model, 321 | image_b64=image_b64, 322 | instruction=instruction, 323 | **kwargs 324 | ) 325 | 326 | return None 327 | 328 | def get_capabilities(self) -> List[AgentCapability]: 329 | """Return the capabilities supported by this agent.""" 330 | return ["click", "step"] 331 | ``` -------------------------------------------------------------------------------- /.github/scripts/tests/test_get_pyproject_version.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Comprehensive tests for get_pyproject_version.py script using unittest. 3 | 4 | This test suite covers: 5 | - Version matching validation 6 | - Error handling for missing versions 7 | - Invalid input handling 8 | - File not found scenarios 9 | - Malformed TOML handling 10 | """ 11 | 12 | import sys 13 | import unittest 14 | import tempfile 15 | from pathlib import Path 16 | from io import StringIO 17 | from unittest.mock import patch 18 | 19 | # Add parent directory to path to import the module 20 | sys.path.insert(0, str(Path(__file__).parent.parent)) 21 | 22 | # Import after path is modified 23 | import get_pyproject_version 24 | 25 | 26 | class TestGetPyprojectVersion(unittest.TestCase): 27 | """Test suite for get_pyproject_version.py functionality.""" 28 | 29 | def setUp(self): 30 | """Reset sys.argv before each test.""" 31 | self.original_argv = sys.argv.copy() 32 | 33 | def tearDown(self): 34 | """Restore sys.argv after each test.""" 35 | sys.argv = self.original_argv 36 | 37 | def create_pyproject_toml(self, version: str) -> Path: 38 | """Helper to create a temporary pyproject.toml file with a given version.""" 39 | temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False) 40 | temp_file.write(f""" 41 | [project] 42 | name = "test-project" 43 | version = "{version}" 44 | description = "A test project" 45 | """) 46 | temp_file.close() 47 | return Path(temp_file.name) 48 | 49 | def create_pyproject_toml_no_version(self) -> Path: 50 | """Helper to create a pyproject.toml without a version field.""" 51 | temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False) 52 | temp_file.write(""" 53 | [project] 54 | name = "test-project" 55 | description = "A test project without version" 56 | """) 57 | temp_file.close() 58 | return Path(temp_file.name) 59 | 60 | def create_pyproject_toml_no_project(self) -> Path: 61 | """Helper to create a pyproject.toml without a project section.""" 62 | temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False) 63 | temp_file.write(""" 64 | [tool.poetry] 65 | name = "test-project" 66 | version = "1.0.0" 67 | """) 68 | temp_file.close() 69 | return Path(temp_file.name) 70 | 71 | def create_malformed_toml(self) -> Path: 72 | """Helper to create a malformed TOML file.""" 73 | 
temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False) 74 | temp_file.write(""" 75 | [project 76 | name = "test-project 77 | version = "1.0.0" 78 | """) 79 | temp_file.close() 80 | return Path(temp_file.name) 81 | 82 | # Test: Successful version match 83 | def test_matching_versions(self): 84 | """Test that matching versions result in success.""" 85 | pyproject_file = self.create_pyproject_toml("1.2.3") 86 | 87 | try: 88 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3'] 89 | 90 | # Capture stdout 91 | captured_output = StringIO() 92 | with patch('sys.stdout', captured_output): 93 | with self.assertRaises(SystemExit) as cm: 94 | get_pyproject_version.main() 95 | 96 | self.assertEqual(cm.exception.code, 0) 97 | self.assertIn("✅ Version consistency check passed: 1.2.3", captured_output.getvalue()) 98 | finally: 99 | pyproject_file.unlink() 100 | 101 | # Test: Version mismatch 102 | def test_version_mismatch(self): 103 | """Test that mismatched versions result in failure with appropriate error message.""" 104 | pyproject_file = self.create_pyproject_toml("1.2.3") 105 | 106 | try: 107 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.4'] 108 | 109 | # Capture stderr 110 | captured_error = StringIO() 111 | with patch('sys.stderr', captured_error): 112 | with self.assertRaises(SystemExit) as cm: 113 | get_pyproject_version.main() 114 | 115 | self.assertEqual(cm.exception.code, 1) 116 | error_output = captured_error.getvalue() 117 | self.assertIn("❌ Version mismatch detected!", error_output) 118 | self.assertIn("pyproject.toml version: 1.2.3", error_output) 119 | self.assertIn("Expected version: 1.2.4", error_output) 120 | self.assertIn("Please update pyproject.toml to version 1.2.4", error_output) 121 | finally: 122 | pyproject_file.unlink() 123 | 124 | # Test: Missing version in pyproject.toml 125 | def test_missing_version_field(self): 126 | """Test handling of pyproject.toml without a version field.""" 127 | pyproject_file = self.create_pyproject_toml_no_version() 128 | 129 | try: 130 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0'] 131 | 132 | captured_error = StringIO() 133 | with patch('sys.stderr', captured_error): 134 | with self.assertRaises(SystemExit) as cm: 135 | get_pyproject_version.main() 136 | 137 | self.assertEqual(cm.exception.code, 1) 138 | self.assertIn("❌ ERROR: No version found in pyproject.toml", captured_error.getvalue()) 139 | finally: 140 | pyproject_file.unlink() 141 | 142 | # Test: Missing project section 143 | def test_missing_project_section(self): 144 | """Test handling of pyproject.toml without a project section.""" 145 | pyproject_file = self.create_pyproject_toml_no_project() 146 | 147 | try: 148 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0'] 149 | 150 | captured_error = StringIO() 151 | with patch('sys.stderr', captured_error): 152 | with self.assertRaises(SystemExit) as cm: 153 | get_pyproject_version.main() 154 | 155 | self.assertEqual(cm.exception.code, 1) 156 | self.assertIn("❌ ERROR: No version found in pyproject.toml", captured_error.getvalue()) 157 | finally: 158 | pyproject_file.unlink() 159 | 160 | # Test: File not found 161 | def test_file_not_found(self): 162 | """Test handling of non-existent pyproject.toml file.""" 163 | sys.argv = ['get_pyproject_version.py', '/nonexistent/pyproject.toml', '1.0.0'] 164 | 165 | with self.assertRaises(SystemExit) as cm: 166 | get_pyproject_version.main() 167 | 168 | self.assertEqual(cm.exception.code, 1) 
169 | 170 | # Test: Malformed TOML 171 | def test_malformed_toml(self): 172 | """Test handling of malformed TOML file.""" 173 | pyproject_file = self.create_malformed_toml() 174 | 175 | try: 176 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0'] 177 | 178 | with self.assertRaises(SystemExit) as cm: 179 | get_pyproject_version.main() 180 | 181 | self.assertEqual(cm.exception.code, 1) 182 | finally: 183 | pyproject_file.unlink() 184 | 185 | # Test: Incorrect number of arguments - too few 186 | def test_too_few_arguments(self): 187 | """Test that providing too few arguments results in usage error.""" 188 | sys.argv = ['get_pyproject_version.py', 'pyproject.toml'] 189 | 190 | captured_error = StringIO() 191 | with patch('sys.stderr', captured_error): 192 | with self.assertRaises(SystemExit) as cm: 193 | get_pyproject_version.main() 194 | 195 | self.assertEqual(cm.exception.code, 1) 196 | self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>", 197 | captured_error.getvalue()) 198 | 199 | # Test: Incorrect number of arguments - too many 200 | def test_too_many_arguments(self): 201 | """Test that providing too many arguments results in usage error.""" 202 | sys.argv = ['get_pyproject_version.py', 'pyproject.toml', '1.0.0', 'extra'] 203 | 204 | captured_error = StringIO() 205 | with patch('sys.stderr', captured_error): 206 | with self.assertRaises(SystemExit) as cm: 207 | get_pyproject_version.main() 208 | 209 | self.assertEqual(cm.exception.code, 1) 210 | self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>", 211 | captured_error.getvalue()) 212 | 213 | # Test: No arguments 214 | def test_no_arguments(self): 215 | """Test that providing no arguments results in usage error.""" 216 | sys.argv = ['get_pyproject_version.py'] 217 | 218 | captured_error = StringIO() 219 | with patch('sys.stderr', captured_error): 220 | with self.assertRaises(SystemExit) as cm: 221 | get_pyproject_version.main() 222 | 223 | self.assertEqual(cm.exception.code, 1) 224 | self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>", 225 | captured_error.getvalue()) 226 | 227 | # Test: Version with pre-release tags 228 | def test_version_with_prerelease_tags(self): 229 | """Test matching versions with pre-release tags like alpha, beta, rc.""" 230 | pyproject_file = self.create_pyproject_toml("1.2.3-rc.1") 231 | 232 | try: 233 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3-rc.1'] 234 | 235 | captured_output = StringIO() 236 | with patch('sys.stdout', captured_output): 237 | with self.assertRaises(SystemExit) as cm: 238 | get_pyproject_version.main() 239 | 240 | self.assertEqual(cm.exception.code, 0) 241 | self.assertIn("✅ Version consistency check passed: 1.2.3-rc.1", captured_output.getvalue()) 242 | finally: 243 | pyproject_file.unlink() 244 | 245 | # Test: Version with build metadata 246 | def test_version_with_build_metadata(self): 247 | """Test matching versions with build metadata.""" 248 | pyproject_file = self.create_pyproject_toml("1.2.3+build.123") 249 | 250 | try: 251 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3+build.123'] 252 | 253 | captured_output = StringIO() 254 | with patch('sys.stdout', captured_output): 255 | with self.assertRaises(SystemExit) as cm: 256 | get_pyproject_version.main() 257 | 258 | self.assertEqual(cm.exception.code, 0) 259 | self.assertIn("✅ Version consistency check passed: 1.2.3+build.123", captured_output.getvalue()) 
260 | finally: 261 | pyproject_file.unlink() 262 | 263 | # Test: Various semantic version formats 264 | def test_semantic_version_0_0_1(self): 265 | """Test semantic version 0.0.1.""" 266 | self._test_version_format("0.0.1") 267 | 268 | def test_semantic_version_1_0_0(self): 269 | """Test semantic version 1.0.0.""" 270 | self._test_version_format("1.0.0") 271 | 272 | def test_semantic_version_10_20_30(self): 273 | """Test semantic version 10.20.30.""" 274 | self._test_version_format("10.20.30") 275 | 276 | def test_semantic_version_alpha(self): 277 | """Test semantic version with alpha tag.""" 278 | self._test_version_format("1.2.3-alpha") 279 | 280 | def test_semantic_version_beta(self): 281 | """Test semantic version with beta tag.""" 282 | self._test_version_format("1.2.3-beta.1") 283 | 284 | def test_semantic_version_rc_with_build(self): 285 | """Test semantic version with rc and build metadata.""" 286 | self._test_version_format("1.2.3-rc.1+build.456") 287 | 288 | def _test_version_format(self, version: str): 289 | """Helper method to test various semantic version formats.""" 290 | pyproject_file = self.create_pyproject_toml(version) 291 | 292 | try: 293 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), version] 294 | 295 | captured_output = StringIO() 296 | with patch('sys.stdout', captured_output): 297 | with self.assertRaises(SystemExit) as cm: 298 | get_pyproject_version.main() 299 | 300 | self.assertEqual(cm.exception.code, 0) 301 | self.assertIn(f"✅ Version consistency check passed: {version}", captured_output.getvalue()) 302 | finally: 303 | pyproject_file.unlink() 304 | 305 | # Test: Empty version string 306 | def test_empty_version_string(self): 307 | """Test handling of empty version string.""" 308 | pyproject_file = self.create_pyproject_toml("") 309 | 310 | try: 311 | sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0'] 312 | 313 | captured_error = StringIO() 314 | with patch('sys.stderr', captured_error): 315 | with self.assertRaises(SystemExit) as cm: 316 | get_pyproject_version.main() 317 | 318 | self.assertEqual(cm.exception.code, 1) 319 | # Empty string is falsy, so it should trigger error 320 | self.assertIn("❌", captured_error.getvalue()) 321 | finally: 322 | pyproject_file.unlink() 323 | 324 | 325 | class TestSuiteInfo(unittest.TestCase): 326 | """Test suite metadata.""" 327 | 328 | def test_suite_info(self): 329 | """Display test suite information.""" 330 | print("\n" + "="*70) 331 | print("Test Suite: get_pyproject_version.py") 332 | print("Framework: unittest (Python built-in)") 333 | print("TOML Library: tomllib (Python 3.11+ built-in)") 334 | print("="*70) 335 | self.assertTrue(True) 336 | 337 | 338 | if __name__ == '__main__': 339 | # Run tests with verbose output 340 | unittest.main(verbosity=2) 341 | ``` -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/watchdog.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Watchdog module for monitoring the Computer API server health. 3 | Unix/Linux only - provides process management and restart capabilities. 
4 | """ 5 | 6 | import asyncio 7 | import fcntl 8 | import json 9 | import logging 10 | import os 11 | import platform 12 | import subprocess 13 | import sys 14 | import time 15 | import websockets 16 | from typing import Optional 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def instance_already_running(label="watchdog"): 22 | """ 23 | Detect if an an instance with the label is already running, globally 24 | at the operating system level. 25 | 26 | Using `os.open` ensures that the file pointer won't be closed 27 | by Python's garbage collector after the function's scope is exited. 28 | 29 | The lock will be released when the program exits, or could be 30 | released if the file pointer were closed. 31 | """ 32 | 33 | lock_file_pointer = os.open(f"/tmp/instance_{label}.lock", os.O_WRONLY | os.O_CREAT) 34 | 35 | try: 36 | fcntl.lockf(lock_file_pointer, fcntl.LOCK_EX | fcntl.LOCK_NB) 37 | already_running = False 38 | except IOError: 39 | already_running = True 40 | 41 | return already_running 42 | 43 | 44 | class Watchdog: 45 | """Watchdog class to monitor server health via WebSocket connection. 46 | Unix/Linux only - provides restart capabilities. 47 | """ 48 | 49 | def __init__(self, cli_args: Optional[dict] = None, ping_interval: int = 30): 50 | """ 51 | Initialize the watchdog. 52 | 53 | Args: 54 | cli_args: Dictionary of CLI arguments to replicate when restarting 55 | ping_interval: Interval between ping checks in seconds 56 | """ 57 | # Check if running on Unix/Linux 58 | if platform.system() not in ['Linux', 'Darwin']: 59 | raise RuntimeError("Watchdog is only supported on Unix/Linux systems") 60 | 61 | # Store CLI arguments for restart 62 | self.cli_args = cli_args or {} 63 | self.host = self.cli_args.get('host', 'localhost') 64 | self.port = self.cli_args.get('port', 8000) 65 | self.ping_interval = ping_interval 66 | self.container_name = os.environ.get("CONTAINER_NAME") 67 | self.running = False 68 | self.restart_enabled = True 69 | 70 | @property 71 | def ws_uri(self) -> str: 72 | """Get the WebSocket URI using the current IP address. 73 | 74 | Returns: 75 | WebSocket URI for the Computer API Server 76 | """ 77 | ip_address = "localhost" if not self.container_name else f"{self.container_name}.containers.cloud.trycua.com" 78 | protocol = "wss" if self.container_name else "ws" 79 | port = "8443" if self.container_name else "8000" 80 | return f"{protocol}://{ip_address}:{port}/ws" 81 | 82 | async def ping(self) -> bool: 83 | """ 84 | Test connection to the WebSocket endpoint. 85 | 86 | Returns: 87 | True if connection successful, False otherwise 88 | """ 89 | try: 90 | # Create a simple ping message 91 | ping_message = { 92 | "command": "get_screen_size", 93 | "params": {} 94 | } 95 | 96 | # Try to connect to the WebSocket 97 | async with websockets.connect( 98 | self.ws_uri, 99 | max_size=1024 * 1024 * 10 # 10MB limit to match server 100 | ) as websocket: 101 | # Send ping message 102 | await websocket.send(json.dumps(ping_message)) 103 | 104 | # Wait for any response or just close 105 | try: 106 | response = await asyncio.wait_for(websocket.recv(), timeout=5) 107 | logger.debug(f"Ping response received: {response[:100]}...") 108 | return True 109 | except asyncio.TimeoutError: 110 | return False 111 | except Exception as e: 112 | logger.warning(f"Ping failed: {e}") 113 | return False 114 | 115 | def kill_processes_on_port(self, port: int) -> bool: 116 | """ 117 | Kill any processes using the specified port. 
118 | 119 | Args: 120 | port: Port number to check and kill processes on 121 | 122 | Returns: 123 | True if processes were killed or none found, False on error 124 | """ 125 | try: 126 | # Find processes using the port 127 | result = subprocess.run( 128 | ["lsof", "-ti", f":{port}"], 129 | capture_output=True, 130 | text=True, 131 | timeout=10 132 | ) 133 | 134 | if result.returncode == 0 and result.stdout.strip(): 135 | pids = result.stdout.strip().split('\n') 136 | logger.info(f"Found {len(pids)} processes using port {port}: {pids}") 137 | 138 | # Kill each process 139 | for pid in pids: 140 | if pid.strip(): 141 | try: 142 | subprocess.run(["kill", "-9", pid.strip()], timeout=5) 143 | logger.info(f"Killed process {pid}") 144 | except subprocess.TimeoutExpired: 145 | logger.warning(f"Timeout killing process {pid}") 146 | except Exception as e: 147 | logger.warning(f"Error killing process {pid}: {e}") 148 | 149 | return True 150 | else: 151 | logger.debug(f"No processes found using port {port}") 152 | return True 153 | 154 | except subprocess.TimeoutExpired: 155 | logger.error(f"Timeout finding processes on port {port}") 156 | return False 157 | except Exception as e: 158 | logger.error(f"Error finding processes on port {port}: {e}") 159 | return False 160 | 161 | def restart_server(self) -> bool: 162 | """ 163 | Attempt to restart the server by killing existing processes and starting new one. 164 | 165 | Returns: 166 | True if restart was attempted, False on error 167 | """ 168 | if not self.restart_enabled: 169 | logger.info("Server restart is disabled") 170 | return False 171 | 172 | try: 173 | logger.info("Attempting to restart server...") 174 | 175 | # Kill processes on the port 176 | port_to_kill = 8443 if self.container_name else self.port 177 | if not self.kill_processes_on_port(port_to_kill): 178 | logger.error("Failed to kill processes on port, restart aborted") 179 | return False 180 | 181 | # Wait a moment for processes to die 182 | time.sleep(2) 183 | 184 | # Try to restart the server 185 | # In container mode, we can't easily restart, so just log 186 | if self.container_name: 187 | logger.warning("Container mode detected - cannot restart server automatically") 188 | logger.warning("Container orchestrator should handle restart") 189 | return False 190 | else: 191 | # For local mode, try to restart the CLI 192 | logger.info("Attempting to restart local server...") 193 | 194 | # Get the current Python executable and script 195 | python_exe = sys.executable 196 | 197 | # Try to find the CLI module 198 | try: 199 | # Build command with all original CLI arguments 200 | cmd = [python_exe, "-m", "computer_server.cli"] 201 | 202 | # Add all CLI arguments except watchdog-related ones 203 | for key, value in self.cli_args.items(): 204 | if key in ['watchdog', 'watchdog_interval', 'no_restart']: 205 | continue # Skip watchdog args to avoid recursive watchdog 206 | 207 | # Convert underscores to hyphens for CLI args 208 | arg_name = f"--{key.replace('_', '-')}" 209 | 210 | if isinstance(value, bool): 211 | if value: # Only add flag if True 212 | cmd.append(arg_name) 213 | else: 214 | cmd.extend([arg_name, str(value)]) 215 | 216 | logger.info(f"Starting server with command: {' '.join(cmd)}") 217 | 218 | # Start process in background 219 | subprocess.Popen( 220 | cmd, 221 | stdout=subprocess.DEVNULL, 222 | stderr=subprocess.DEVNULL, 223 | start_new_session=True 224 | ) 225 | 226 | logger.info("Server restart initiated") 227 | return True 228 | 229 | except Exception as e: 230 | 
logger.error(f"Failed to restart server: {e}") 231 | return False 232 | 233 | except Exception as e: 234 | logger.error(f"Error during server restart: {e}") 235 | return False 236 | 237 | async def start_monitoring(self) -> None: 238 | """Start the watchdog monitoring loop.""" 239 | self.running = True 240 | logger.info(f"Starting watchdog monitoring for {self.ws_uri}") 241 | logger.info(f"Ping interval: {self.ping_interval} seconds") 242 | if self.container_name: 243 | logger.info(f"Container mode detected: {self.container_name}") 244 | 245 | consecutive_failures = 0 246 | max_failures = 3 247 | 248 | while self.running: 249 | try: 250 | success = await self.ping() 251 | 252 | if success: 253 | if consecutive_failures > 0: 254 | logger.info("Server connection restored") 255 | consecutive_failures = 0 256 | logger.debug("Ping successful") 257 | else: 258 | consecutive_failures += 1 259 | logger.warning(f"Ping failed ({consecutive_failures}/{max_failures})") 260 | 261 | if consecutive_failures >= max_failures: 262 | logger.error(f"Server appears to be down after {max_failures} consecutive failures") 263 | 264 | # Attempt to restart the server 265 | if self.restart_enabled: 266 | logger.info("Attempting automatic server restart...") 267 | restart_success = self.restart_server() 268 | 269 | if restart_success: 270 | logger.info("Server restart initiated, waiting before next ping...") 271 | # Wait longer after restart attempt 272 | await asyncio.sleep(self.ping_interval * 2) 273 | consecutive_failures = 0 # Reset counter after restart attempt 274 | else: 275 | logger.error("Server restart failed") 276 | else: 277 | logger.warning("Automatic restart is disabled") 278 | 279 | # Wait for next ping interval 280 | await asyncio.sleep(self.ping_interval) 281 | 282 | except asyncio.CancelledError: 283 | logger.info("Watchdog monitoring cancelled") 284 | break 285 | except Exception as e: 286 | logger.error(f"Unexpected error in watchdog loop: {e}") 287 | await asyncio.sleep(self.ping_interval) 288 | 289 | def stop_monitoring(self) -> None: 290 | """Stop the watchdog monitoring.""" 291 | self.running = False 292 | logger.info("Stopping watchdog monitoring") 293 | 294 | 295 | async def run_watchdog(cli_args: Optional[dict] = None, ping_interval: int = 30) -> None: 296 | """ 297 | Run the watchdog monitoring. 
298 | 299 | Args: 300 | cli_args: Dictionary of CLI arguments to replicate when restarting 301 | ping_interval: Interval between ping checks in seconds 302 | """ 303 | watchdog = Watchdog(cli_args=cli_args, ping_interval=ping_interval) 304 | 305 | try: 306 | await watchdog.start_monitoring() 307 | except KeyboardInterrupt: 308 | logger.info("Watchdog stopped by user") 309 | finally: 310 | watchdog.stop_monitoring() 311 | 312 | 313 | if __name__ == "__main__": 314 | # For testing the watchdog standalone 315 | import argparse 316 | 317 | parser = argparse.ArgumentParser(description="Run Computer API server watchdog") 318 | parser.add_argument("--host", default="localhost", help="Server host to monitor") 319 | parser.add_argument("--port", type=int, default=8000, help="Server port to monitor") 320 | parser.add_argument("--ping-interval", type=int, default=30, help="Ping interval in seconds") 321 | 322 | args = parser.parse_args() 323 | 324 | logging.basicConfig( 325 | level=logging.INFO, 326 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 327 | ) 328 | 329 | cli_args = { 330 | 'host': args.host, 331 | 'port': args.port 332 | } 333 | asyncio.run(run_watchdog(cli_args, args.ping_interval)) 334 | ```
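The watchdog module above exposes `run_watchdog` for standalone monitoring. Below is a minimal sketch of embedding it in another process, assuming the package is importable as `computer_server` (per the path `libs/python/computer-server/computer_server/watchdog.py`); the host, port, and interval values are illustrative, and the watchdog only runs on Linux/macOS per the platform check in `Watchdog.__init__`.

```python
import asyncio
import logging

# Assumed import path, derived from the file location shown above
from computer_server.watchdog import run_watchdog

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# cli_args mirrors the arguments the watchdog would reuse if it has to
# restart the local server; ping_interval controls how often the /ws
# endpoint is probed.
asyncio.run(
    run_watchdog(
        cli_args={"host": "localhost", "port": 8000},
        ping_interval=15,
    )
)
```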
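For `composed_grounded.py`, the model string is split on `+` and only the grounding half is used for click prediction. A sketch of calling `predict_click` directly follows, assuming the `agent` package is installed and the grounding model named below is registered locally; the screenshot path and element description are placeholders.

```python
import asyncio
import base64

# Assumed import path based on libs/python/agent/agent/loops/composed_grounded.py
from agent.loops.composed_grounded import ComposedGroundedConfig


async def locate_element() -> None:
    # predict_click expects a base64-encoded screenshot
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    config = ComposedGroundedConfig()
    coords = await config.predict_click(
        # grounding model + thinking model; only the part before "+" is used here
        model="huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro",
        image_b64=image_b64,
        instruction="red submit button",
    )
    print(coords)  # (x, y) tuple, or None if no grounding agent config matched


asyncio.run(locate_element())
```

In a full `predict_step` run, the same grounding model fills the `desc2xy` cache so element descriptions emitted by the thinking model can be mapped back to screen coordinates.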
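The test suite for `.github/scripts/get_pyproject_version.py` pins down the script's observable behaviour: argument count, `tomllib` parsing, exit codes, and the success and error messages. The following is a minimal reconstruction consistent with those assertions, for readers who only have this page; the shipped script may differ in wording and structure.

```python
import sys
import tomllib  # Python 3.11+ built-in, as noted in the test suite metadata


def main() -> None:
    # Usage check: exactly two positional arguments are expected
    if len(sys.argv) != 3:
        print(
            "Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
            file=sys.stderr,
        )
        sys.exit(1)

    pyproject_path, expected_version = sys.argv[1], sys.argv[2]

    try:
        with open(pyproject_path, "rb") as f:
            data = tomllib.load(f)
    except (OSError, tomllib.TOMLDecodeError) as exc:
        # Covers both missing files and malformed TOML
        print(f"❌ ERROR: Could not read {pyproject_path}: {exc}", file=sys.stderr)
        sys.exit(1)

    version = data.get("project", {}).get("version")
    if not version:
        print("❌ ERROR: No version found in pyproject.toml", file=sys.stderr)
        sys.exit(1)

    if version != expected_version:
        print("❌ Version mismatch detected!", file=sys.stderr)
        print(f"  pyproject.toml version: {version}", file=sys.stderr)
        print(f"  Expected version: {expected_version}", file=sys.stderr)
        print(f"  Please update pyproject.toml to version {expected_version}", file=sys.stderr)
        sys.exit(1)

    print(f"✅ Version consistency check passed: {version}")
    sys.exit(0)


if __name__ == "__main__":
    main()
```

Invoked as `python get_pyproject_version.py path/to/pyproject.toml 1.2.3` (the usage string asserted in the tests), a non-zero exit fails the CI job on any version drift.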