# Directory Structure

```
├── .gitignore
├── .python-version
├── assets
│   └── web-ui.png
├── Dockerfile
├── LICENSE
├── pyproject.toml
├── README.md
├── smithery.yaml
├── src
│   └── mcp_server_browser_use
│       ├── __init__.py
│       ├── agent
│       │   ├── __init__.py
│       │   ├── custom_agent.py
│       │   ├── custom_massage_manager.py
│       │   ├── custom_prompts.py
│       │   └── custom_views.py
│       ├── browser
│       │   ├── __init__.py
│       │   ├── custom_browser.py
│       │   └── custom_context.py
│       ├── controller
│       │   ├── __init__.py
│       │   └── custom_controller.py
│       ├── server.py
│       └── utils
│           ├── __init__.py
│           ├── agent_state.py
│           ├── deep_research.py
│           ├── llm.py
│           └── utils.py
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------

```
3.11
```

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

```
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,python
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,macos,python

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### macOS Patch ###
# iCloud generated files
*.icloud

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,python

# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)

agent_history.gif
trace.json
recording.mp4
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
<img src="./assets/web-ui.png" alt="Browser Use Web UI" width="full"/>

<br/>

# browser-use MCP server
[Documentation](https://docs.browser-use.com)
[License: MIT](LICENSE)

> **Project Note**: This MCP server implementation builds upon the [browser-use/web-ui](https://github.com/browser-use/web-ui) foundation. Core browser automation logic and configuration patterns are adapted from the original project.

AI-driven browser automation server implementing the Model Context Protocol (MCP) for natural language browser control.

<a href="https://glama.ai/mcp/servers/dz6dy5hw59"><img width="380" height="200" src="https://glama.ai/mcp/servers/dz6dy5hw59/badge" alt="Browser-Use Server MCP server" /></a>

## Features

- 🧠 **MCP Integration** - Full protocol implementation for AI agent communication
- 🌐 **Browser Automation** - Page navigation, form filling, and element interaction
- 👁️ **Visual Understanding** - Screenshot analysis and vision-based interactions
- 🔄 **State Persistence** - Maintain browser sessions between tasks
- 🔌 **Multi-LLM Support** - OpenAI, Anthropic, Azure OpenAI, DeepSeek, Gemini, Mistral, Ollama, and OpenRouter integration

## Quick Start

### Prerequisites

- Python 3.11 or higher
- uv (fast Python package installer)
- Chrome/Chromium browser

### Installation

#### Claude Desktop

On macOS: `~/Library/Application\ Support/Claude/claude_desktop_config.json`
On Windows: `%APPDATA%/Claude/claude_desktop_config.json`

```json
"mcpServers": {
  "browser-use": {
    "command": "uvx",
    "args": [
      "mcp-server-browser-use"
    ],
    "env": {
      "OPENROUTER_API_KEY": "",
      "OPENROUTER_ENDPOINT": "https://openrouter.ai/api/v1",
      "OPENAI_ENDPOINT": "https://api.openai.com/v1",
      "OPENAI_API_KEY": "",
      "ANTHROPIC_ENDPOINT": "https://api.anthropic.com",
      "ANTHROPIC_API_KEY": "",
      "GOOGLE_API_KEY": "",
      "AZURE_OPENAI_ENDPOINT": "",
      "AZURE_OPENAI_API_KEY": "",
      "DEEPSEEK_ENDPOINT": "https://api.deepseek.com",
      "DEEPSEEK_API_KEY": "",
      "MISTRAL_API_KEY": "",
      "MISTRAL_ENDPOINT": "https://api.mistral.ai/v1",
      "OLLAMA_ENDPOINT": "http://localhost:11434",
      "ANONYMIZED_TELEMETRY": "true",
      "BROWSER_USE_LOGGING_LEVEL": "info",
"CHROME_PATH": "", 63 | "CHROME_USER_DATA": "", 64 | "CHROME_DEBUGGING_PORT": "9222", 65 | "CHROME_DEBUGGING_HOST": "localhost", 66 | "CHROME_PERSISTENT_SESSION": "false", 67 | "BROWSER_HEADLESS": "false", 68 | "BROWSER_DISABLE_SECURITY": "false", 69 | "BROWSER_WINDOW_WIDTH": "1280", 70 | "BROWSER_WINDOW_HEIGHT": "720", 71 | "BROWSER_TRACE_PATH": "trace.json", 72 | "BROWSER_RECORDING_PATH": "recording.mp4", 73 | "RESOLUTION": "1920x1080x24", 74 | "RESOLUTION_WIDTH": "1920", 75 | "RESOLUTION_HEIGHT": "1080", 76 | "VNC_PASSWORD": "youvncpassword", 77 | "MCP_MODEL_PROVIDER": "anthropic", 78 | "MCP_MODEL_NAME": "claude-3-5-sonnet-20241022", 79 | "MCP_TEMPERATURE": "0.3", 80 | "MCP_MAX_STEPS": "30", 81 | "MCP_USE_VISION": "true", 82 | "MCP_MAX_ACTIONS_PER_STEP": "5", 83 | "MCP_TOOL_CALL_IN_CONTENT": "true" 84 | } 85 | } 86 | ``` 87 | 88 | ### Local Development 89 | 90 | ```json 91 | "browser-use": { 92 | "command": "uv", 93 | "args": [ 94 | "--directory", 95 | "/path/to/mcp-browser-use", 96 | "run", 97 | "mcp-server-browser-use" 98 | ], 99 | "env": { 100 | ... 101 | } 102 | } 103 | ``` 104 | 105 | ## Development 106 | 107 | ```bash 108 | # Install dev dependencies 109 | uv sync 110 | 111 | # Run with debugger 112 | npx @modelcontextprotocol/inspector uv --directory . run mcp-server-browser-use 113 | ``` 114 | 115 | ## Troubleshooting 116 | 117 | - **Browser Conflicts**: Close all Chrome instances before starting. 118 | - **API Errors**: Verify API keys in environment variables match your LLM provider. 119 | - **Vision Support**: Ensure `MCP_USE_VISION=true` for screenshot analysis. 120 | 121 | ## Provider Configuration 122 | 123 | The server supports multiple LLM providers through environment variables. Here are the available options for `MCP_MODEL_PROVIDER`: 124 | 125 | | Provider | Value | Required Env Variables | 126 | |----------|--------|----------------------| 127 | | Anthropic | `anthropic` | `ANTHROPIC_API_KEY`<br>`ANTHROPIC_ENDPOINT` (optional) | 128 | | OpenAI | `openai` | `OPENAI_API_KEY`<br>`OPENAI_ENDPOINT` (optional) | 129 | | Azure OpenAI | `azure_openai` | `AZURE_OPENAI_API_KEY`<br>`AZURE_OPENAI_ENDPOINT` | 130 | | DeepSeek | `deepseek` | `DEEPSEEK_API_KEY`<br>`DEEPSEEK_ENDPOINT` (optional) | 131 | | Gemini | `gemini` | `GOOGLE_API_KEY` | 132 | | Mistral | `mistral` | `MISTRAL_API_KEY`<br>`MISTRAL_ENDPOINT` (optional) | 133 | | Ollama | `ollama` | `OLLAMA_ENDPOINT` (optional, defaults to localhost:11434) | 134 | | OpenRouter | `openrouter` | `OPENROUTER_API_KEY`<br>`OPENROUTER_ENDPOINT` (optional) | 135 | 136 | ### Notes: 137 | - For endpoints marked as optional, default values will be used if not specified 138 | - Temperature can be configured using `MCP_TEMPERATURE` (default: 0.3) 139 | - Model can be specified using `MCP_MODEL_NAME` 140 | - For Ollama models, additional context settings like `num_ctx` and `num_predict` are configurable 141 | 142 | ## Credits 143 | 144 | This project extends the [browser-use/web-ui](https://github.com/browser-use/web-ui) under MIT License. Special thanks to the original authors for their browser automation framework. 145 | 146 | ## License 147 | 148 | MIT - See [LICENSE](LICENSE) for details. 

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/agent/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/browser/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/controller/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/utils/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/__init__.py:
--------------------------------------------------------------------------------

```python
"""MCP server for browser-use"""

from mcp_server_browser_use.server import app, main


__all__ = ["app", "main"]
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/browser/custom_context.py:
--------------------------------------------------------------------------------

```python
import json
import logging
import os

from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from playwright.async_api import Browser as PlaywrightBrowser
from playwright.async_api import BrowserContext as PlaywrightBrowserContext

logger = logging.getLogger(__name__)


class CustomBrowserContext(BrowserContext):
    def __init__(
        self,
        browser: "Browser",
        config: BrowserContextConfig = BrowserContextConfig()
    ):
        super(CustomBrowserContext, self).__init__(browser=browser, config=config)
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/utils/agent_state.py:
--------------------------------------------------------------------------------

```python
import asyncio

class AgentState:
    _instance = None

    def __init__(self):
        if not hasattr(self, '_stop_requested'):
            self._stop_requested = asyncio.Event()
            self.last_valid_state = None  # store the last valid browser state

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(AgentState, cls).__new__(cls)
        return cls._instance

    def request_stop(self):
        self._stop_requested.set()

    def clear_stop(self):
        self._stop_requested.clear()
        self.last_valid_state = None

    def is_stop_requested(self):
        return self._stop_requested.is_set()

    def set_last_valid_state(self, state):
        self.last_valid_state = state

    def get_last_valid_state(self):
        return self.last_valid_state
```

--------------------------------------------------------------------------------
/smithery.yaml:
--------------------------------------------------------------------------------

```yaml
startCommand:
  type: stdio
  configSchema:
    type: object
    required:
      - OPENROUTER_API_KEY
    properties:
      OPENROUTER_API_KEY:
        type: string
        description: "The API key for OpenRouter."
      MCP_MODEL_NAME:
        type: string
        description: "The model to use on OpenRouter (default: openai/o3-mini-high)."
      BROWSER_HEADLESS:
        type: string
        description: "Set to 'true' to run the browser in headless mode (default: 'false')."
  commandFunction: |-
    (config) => ({
      command: 'mcp-server-browser-use',
      args: [],
      env: {
        OPENROUTER_API_KEY: config.OPENROUTER_API_KEY || '',
        MCP_MODEL_NAME: config.MCP_MODEL_NAME || 'openai/o3-mini-high',
        BROWSER_HEADLESS: config.BROWSER_HEADLESS || 'false',
        PORT: '8000'
      }
    })
```

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------

```dockerfile
# Use the official Python 3.13 slim image as base
FROM python:3.13-slim

# Install system dependencies: Chromium, its driver, Xvfb for headless operation, and ca-certificates
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    chromium \
    chromium-driver \
    xvfb \
    ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Copy project files into the container
COPY . /app

# Install uv (fast Python package installer) using pip
RUN pip install --upgrade pip && pip install uv

# Install project dependencies
RUN pip install .

# Expose the port that the server listens on (8000, matching the PORT set in smithery.yaml)
EXPOSE 8000

# Set environment variables as needed (adjust as necessary)
ENV BROWSER_HEADLESS="true" \
    BROWSER_USE_LOGGING_LEVEL="info"

# Install the Playwright browser binaries
RUN playwright install

# Set the entrypoint to the command that Smithery expects
ENTRYPOINT ["uv", "run", "src/mcp_server_browser_use/server.py"]

# Default command if no arguments are provided
CMD ["--help"]
```

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------

```toml
[project]
name = "mcp_server_browser_use"
version = "0.1.3"
description = "MCP server for browser-use"
readme = "README.md"
requires-python = ">=3.11"
authors = [
    { name = "Igor Tarasenko" },
    { name = "Martin Jakobsson" },
]
license = { text = "MIT" }
classifiers = [
    "Development Status :: 4 - Beta",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.11",
    "Operating System :: OS Independent",
]

dependencies = [
    "langchain>=0.3.14",
    "langchain-openai>=0.2.14",
    "pydantic>=2.10.5",
    "fastapi>=0.115.6",
    "uvicorn>=0.22.0",
    "openai>=1.59.5",
    "python-dotenv>=1.0.1",
    "pyperclip>=1.9.0",
    "langchain-ollama>=0.2.2",
    "instructor>=1.7.2",
    "json-repair>=0.35.0",
    "langchain-mistralai>=0.2.6",
    "fastmcp>=0.4.1",
    "browser-use==0.1.29",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/mcp_server_browser_use"]

[project.scripts]
mcp-server-browser-use = "mcp_server_browser_use.server:main"
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/agent/custom_views.py:
--------------------------------------------------------------------------------

```python
from dataclasses import dataclass
from typing import Type

from browser_use.agent.views import AgentOutput
from browser_use.controller.registry.views import ActionModel
from pydantic import BaseModel, ConfigDict, Field, create_model


@dataclass
class CustomAgentStepInfo:
    step_number: int
    max_steps: int
    task: str
    add_infos: str
    memory: str
    task_progress: str
    future_plans: str


class CustomAgentBrain(BaseModel):
    """Current state of the agent"""

    prev_action_evaluation: str
    important_contents: str
    task_progress: str
    future_plans: str
    thought: str
    summary: str


class CustomAgentOutput(AgentOutput):
    """Output model for agent

    @dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    current_state: CustomAgentBrain
    action: list[ActionModel]

    @staticmethod
    def type_with_custom_actions(
        custom_actions: Type[ActionModel],
    ) -> Type["CustomAgentOutput"]:
        """Extend actions with custom actions"""
        return create_model(
            "CustomAgentOutput",
            __base__=CustomAgentOutput,
            action=(
                list[custom_actions],
                Field(...),
            ),  # Properly annotated field with no default
            __module__=CustomAgentOutput.__module__,
        )
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/controller/custom_controller.py:
--------------------------------------------------------------------------------

```python
import pdb

import pyperclip
from typing import Optional, Type
from pydantic import BaseModel
from browser_use.agent.views import ActionResult
from browser_use.browser.context import BrowserContext
from browser_use.controller.service import Controller, DoneAction
from main_content_extractor import MainContentExtractor
from browser_use.controller.views import (
    ClickElementAction,
    DoneAction,
    ExtractPageContentAction,
    GoToUrlAction,
    InputTextAction,
    OpenTabAction,
    ScrollAction,
    SearchGoogleAction,
    SendKeysAction,
    SwitchTabAction,
)
import logging

logger = logging.getLogger(__name__)


class CustomController(Controller):
    def __init__(self, exclude_actions: list[str] = [],
                 output_model: Optional[Type[BaseModel]] = None
                 ):
        super().__init__(exclude_actions=exclude_actions, output_model=output_model)
        self._register_custom_actions()

    def _register_custom_actions(self):
        """Register all custom browser actions"""

        @self.registry.action("Copy text to clipboard")
        def copy_to_clipboard(text: str):
            pyperclip.copy(text)
            return ActionResult(extracted_content=text)

        @self.registry.action("Paste text from clipboard", requires_browser=True)
        async def paste_from_clipboard(browser: BrowserContext):
            text = pyperclip.paste()
            # send text to browser
            page = await browser.get_current_page()
            await page.keyboard.type(text)

            return ActionResult(extracted_content=text)

        @self.registry.action(
            'Extract page content to get the pure text or markdown with links if include_links is set to true',
            param_model=ExtractPageContentAction,
            requires_browser=True,
        )
        async def extract_content(params: ExtractPageContentAction, browser: BrowserContext):
            page = await browser.get_current_page()
            # use the Jina Reader proxy to fetch a clean rendering of the page
            url = page.url
            jina_url = f"https://r.jina.ai/{url}"
            await page.goto(jina_url)
            output_format = 'markdown' if params.include_links else 'text'
            content = MainContentExtractor.extract(  # type: ignore
                html=await page.content(),
                output_format=output_format,
            )
            # go back to the original url
            await page.go_back()
            msg = f'📄 Extracted page content as {output_format}\n: {content}\n'
            logger.info(msg)
            return ActionResult(extracted_content=msg)
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/browser/custom_browser.py:
--------------------------------------------------------------------------------

```python
import asyncio
import pdb

from playwright.async_api import Browser as PlaywrightBrowser
from playwright.async_api import (
    BrowserContext as PlaywrightBrowserContext,
)
from playwright.async_api import (
    Playwright,
    async_playwright,
)
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext, BrowserContextConfig
import logging

from .custom_context import CustomBrowserContext

logger = logging.getLogger(__name__)

class CustomBrowser(Browser):

    async def new_context(
        self,
        config: BrowserContextConfig = BrowserContextConfig()
    ) -> CustomBrowserContext:
        return CustomBrowserContext(config=config, browser=self)

    async def _setup_browser_with_instance(self, playwright: Playwright) -> PlaywrightBrowser:
        """Sets up and returns a Playwright Browser instance with anti-detection measures."""
        if not self.config.chrome_instance_path:
            raise ValueError('Chrome instance path is required')
        import subprocess

        import requests

        try:
            # Check if browser is already running
            response = requests.get('http://localhost:9222/json/version', timeout=2)
            if response.status_code == 200:
                logger.info('Reusing existing Chrome instance')
                browser = await playwright.chromium.connect_over_cdp(
                    endpoint_url='http://localhost:9222',
                    timeout=20000,  # 20 second timeout for connection
                )
                return browser
        except requests.ConnectionError:
            logger.debug('No existing Chrome instance found, starting a new one')

        # Start a new Chrome instance
        subprocess.Popen(
            [
                self.config.chrome_instance_path,
                '--remote-debugging-port=9222',
            ] + self.config.extra_chromium_args,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        # Poll the debugging endpoint in case the browser has not started yet
        for _ in range(10):
            try:
                response = requests.get('http://localhost:9222/json/version', timeout=2)
                if response.status_code == 200:
                    break
            except requests.ConnectionError:
                pass
            await asyncio.sleep(1)

        # Attempt to connect again after starting a new instance
        try:
            browser = await playwright.chromium.connect_over_cdp(
                endpoint_url='http://localhost:9222',
                timeout=20000,  # 20 second timeout for connection
            )
            return browser
        except Exception as e:
            logger.error(f'Failed to start a new Chrome instance: {str(e)}')
            raise RuntimeError(
                'To start Chrome in debug mode, close all existing Chrome instances and try again; otherwise we cannot connect to the instance.'
            )
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/utils/llm.py:
--------------------------------------------------------------------------------

```python
from openai import OpenAI
import pdb
from langchain_openai import ChatOpenAI
from langchain_core.globals import get_llm_cache
from langchain_core.language_models.base import (
    BaseLanguageModel,
    LangSmithParams,
    LanguageModelInput,
)
from langchain_core.load import dumpd, dumps
from langchain_core.messages import (
    AIMessage,
    SystemMessage,
    AnyMessage,
    BaseMessage,
    BaseMessageChunk,
    HumanMessage,
    convert_to_messages,
    message_chunk_to_message,
)
from langchain_core.outputs import (
    ChatGeneration,
    ChatGenerationChunk,
    ChatResult,
    LLMResult,
    RunInfo,
)
from langchain_ollama import ChatOllama
from langchain_core.output_parsers.base import OutputParserLike
from langchain_core.runnables import Runnable, RunnableConfig
from langchain_core.tools import BaseTool

from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    Optional,
    Union,
    cast,
)

class DeepSeekR1ChatOpenAI(ChatOpenAI):

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.client = OpenAI(
            base_url=kwargs.get("base_url"), api_key=kwargs.get("api_key")
        )

    async def ainvoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        message_history = []
        for input_ in input:
            if isinstance(input_, SystemMessage):
                message_history.append({"role": "system", "content": input_.content})
            elif isinstance(input_, AIMessage):
                message_history.append({"role": "assistant", "content": input_.content})
            else:
                message_history.append({"role": "user", "content": input_.content})

        response = self.client.chat.completions.create(
            model=self.model_name, messages=message_history
        )

        reasoning_content = response.choices[0].message.reasoning_content
        content = response.choices[0].message.content
        return AIMessage(content=content, reasoning_content=reasoning_content)

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        message_history = []
        for input_ in input:
            if isinstance(input_, SystemMessage):
                message_history.append({"role": "system", "content": input_.content})
            elif isinstance(input_, AIMessage):
                message_history.append({"role": "assistant", "content": input_.content})
            else:
                message_history.append({"role": "user", "content": input_.content})

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=message_history
        )

        reasoning_content = response.choices[0].message.reasoning_content
        content = response.choices[0].message.content
        return AIMessage(content=content, reasoning_content=reasoning_content)

class DeepSeekR1ChatOllama(ChatOllama):

    async def ainvoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        org_ai_message = await super().ainvoke(input=input)
        org_content = org_ai_message.content
        # split the DeepSeek R1 output into its <think> reasoning and the final answer
        reasoning_content = org_content.split("</think>")[0].replace("<think>", "")
        content = org_content.split("</think>")[1]
        if "**JSON Response:**" in content:
            content = content.split("**JSON Response:**")[-1]
        return AIMessage(content=content, reasoning_content=reasoning_content)

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        org_ai_message = super().invoke(input=input)
        org_content = org_ai_message.content
        reasoning_content = org_content.split("</think>")[0].replace("<think>", "")
        content = org_content.split("</think>")[1]
        if "**JSON Response:**" in content:
            content = content.split("**JSON Response:**")[-1]
        return AIMessage(content=content, reasoning_content=reasoning_content)
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/agent/custom_massage_manager.py:
--------------------------------------------------------------------------------

```python
from __future__ import annotations

import logging
from typing import List, Optional, Type

from browser_use.agent.message_manager.service import MessageManager
from browser_use.agent.message_manager.views import MessageHistory
from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.views import ActionResult, AgentStepInfo, ActionModel
from browser_use.browser.views import BrowserState
from langchain_core.language_models import BaseChatModel
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    ToolMessage
)
from langchain_openai import ChatOpenAI
from ..utils.llm import DeepSeekR1ChatOpenAI
from .custom_prompts import CustomAgentMessagePrompt

logger = logging.getLogger(__name__)


class CustomMassageManager(MessageManager):
    def __init__(
        self,
        llm: BaseChatModel,
        task: str,
        action_descriptions: str,
        system_prompt_class: Type[SystemPrompt],
        agent_prompt_class: Type[AgentMessagePrompt],
        max_input_tokens: int = 128000,
        estimated_characters_per_token: int = 3,
        image_tokens: int = 800,
        include_attributes: list[str] = [],
        max_error_length: int = 400,
        max_actions_per_step: int = 10,
        message_context: Optional[str] = None
    ):
        super().__init__(
            llm=llm,
            task=task,
            action_descriptions=action_descriptions,
            system_prompt_class=system_prompt_class,
            max_input_tokens=max_input_tokens,
            estimated_characters_per_token=estimated_characters_per_token,
            image_tokens=image_tokens,
            include_attributes=include_attributes,
            max_error_length=max_error_length,
            max_actions_per_step=max_actions_per_step,
            message_context=message_context
        )
        self.agent_prompt_class = agent_prompt_class
        # Custom: Move Task info to state_message
        self.history = MessageHistory()
        self._add_message_with_tokens(self.system_prompt)

        if self.message_context:
            context_message = HumanMessage(content=self.message_context)
            self._add_message_with_tokens(context_message)

    def cut_messages(self):
        """Trim the message history in place until it fits within max_input_tokens"""
        diff = self.history.total_tokens - self.max_input_tokens
        min_message_len = 2 if self.message_context is not None else 1

        while diff > 0 and len(self.history.messages) > min_message_len:
            self.history.remove_message(min_message_len)  # always remove the oldest message
            diff = self.history.total_tokens - self.max_input_tokens

    def add_state_message(
        self,
        state: BrowserState,
        actions: Optional[List[ActionModel]] = None,
        result: Optional[List[ActionResult]] = None,
        step_info: Optional[AgentStepInfo] = None,
    ) -> None:
        """Add browser state as human message"""
        # otherwise add state message and result to next message (which will not stay in memory)
        state_message = self.agent_prompt_class(
            state,
            actions,
            result,
            include_attributes=self.include_attributes,
            max_error_length=self.max_error_length,
            step_info=step_info,
        ).get_user_message()
        self._add_message_with_tokens(state_message)

    def _count_text_tokens(self, text: str) -> int:
        if isinstance(self.llm, (ChatOpenAI, ChatAnthropic, DeepSeekR1ChatOpenAI)):
            try:
                tokens = self.llm.get_num_tokens(text)
            except Exception:
                tokens = (
                    len(text) // self.estimated_characters_per_token
                )  # Rough estimate if no tokenizer available
        else:
            tokens = (
                len(text) // self.estimated_characters_per_token
            )  # Rough estimate if no tokenizer available
        return tokens

    def _remove_state_message_by_index(self, remove_ind=-1) -> None:
        """Remove the nth-from-last state (Human) message from history"""
        i = len(self.history.messages) - 1
        remove_cnt = 0
        while i >= 0:
            if isinstance(self.history.messages[i].message, HumanMessage):
                remove_cnt += 1
            if remove_cnt == abs(remove_ind):
                self.history.remove_message(i)
                break
            i -= 1
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/server.py:
--------------------------------------------------------------------------------

```python
import asyncio
import os
import sys
import traceback
from typing import List, Optional

import logging
logging.getLogger().addHandler(logging.NullHandler())
logging.getLogger().propagate = False

from mcp_server_browser_use.agent.custom_prompts import (
    CustomAgentMessagePrompt,
    CustomSystemPrompt,
)

from browser_use import BrowserConfig
from browser_use.browser.context import BrowserContextConfig, BrowserContextWindowSize
from fastmcp.server import FastMCP
from mcp.types import TextContent

from mcp_server_browser_use.agent.custom_agent import CustomAgent
from mcp_server_browser_use.browser.custom_browser import CustomBrowser
from mcp_server_browser_use.controller.custom_controller import CustomController
from mcp_server_browser_use.utils import utils
from mcp_server_browser_use.utils.agent_state import AgentState

# Global references for single "running agent" approach
_global_agent = None
_global_browser = None
_global_browser_context = None
_global_agent_state = AgentState()

app = FastMCP("mcp_server_browser_use")


def get_env_bool(key: str, default: bool = False) -> bool:
    """Get boolean value from environment variable."""
    return os.getenv(key, str(default)).lower() in ("true", "1", "yes")


async def _safe_cleanup():
    """Safely clean up browser resources"""
    global _global_browser, _global_agent_state, _global_browser_context, _global_agent

    try:
        if _global_agent_state:
            try:
                _global_agent_state.request_stop()  # synchronous: it just sets the stop event
            except Exception:
                pass

        if _global_browser_context:
            try:
                await _global_browser_context.close()
            except Exception:
                pass

        if _global_browser:
            try:
                await _global_browser.close()
            except Exception:
                pass

    except Exception as e:
        # Log the error, but don't re-raise
        print(f"Error during cleanup: {e}", file=sys.stderr)
    finally:
        # Reset global variables
        _global_browser = None
        _global_browser_context = None
        _global_agent_state = AgentState()
        _global_agent = None


@app.tool()
async def run_browser_agent(task: str, add_infos: str = "") -> str:
    """Handle run-browser-agent tool calls."""
    global _global_agent, _global_browser, _global_browser_context, _global_agent_state

    try:
        # Clear any previous agent stop signals
        _global_agent_state.clear_stop()

        # Get browser configuration
        headless = get_env_bool("BROWSER_HEADLESS", True)
        disable_security = get_env_bool("BROWSER_DISABLE_SECURITY", False)
        window_w = int(os.getenv("BROWSER_WINDOW_WIDTH", "1280"))
        window_h = int(os.getenv("BROWSER_WINDOW_HEIGHT", "720"))

        # Get agent configuration
        model_provider = os.getenv("MCP_MODEL_PROVIDER", "openrouter")
        model_name = os.getenv("MCP_MODEL_NAME", "openai/o3-mini-high")
        temperature = float(os.getenv("MCP_TEMPERATURE", "0.7"))
        max_steps = int(os.getenv("MCP_MAX_STEPS", "100"))
        use_vision = get_env_bool("MCP_USE_VISION", True)
        max_actions_per_step = int(os.getenv("MCP_MAX_ACTIONS_PER_STEP", "5"))
        tool_calling_method = os.getenv("MCP_TOOL_CALLING_METHOD", "auto")

        # Configure browser window size
        extra_chromium_args = [f"--window-size={window_w},{window_h}"]

        # Initialize browser if needed
        if not _global_browser:
            _global_browser = CustomBrowser(
                config=BrowserConfig(
                    headless=headless,
                    disable_security=disable_security,
                    extra_chromium_args=extra_chromium_args,
                )
            )

        # Initialize browser context if needed
        if not _global_browser_context:
            _global_browser_context = await _global_browser.new_context(
                config=BrowserContextConfig(
                    trace_path=os.getenv("BROWSER_TRACE_PATH"),
                    save_recording_path=os.getenv("BROWSER_RECORDING_PATH"),
                    no_viewport=False,
                    browser_window_size=BrowserContextWindowSize(
                        width=window_w, height=window_h
                    ),
                )
            )

        # Prepare LLM
        llm = utils.get_llm_model(
            provider=model_provider, model_name=model_name, temperature=temperature
        )

        # Create controller and agent
        controller = CustomController()
        _global_agent = CustomAgent(
            task=task,
            add_infos=add_infos,
            use_vision=use_vision,
            llm=llm,
            browser=_global_browser,
            browser_context=_global_browser_context,
            controller=controller,
            system_prompt_class=CustomSystemPrompt,
            agent_prompt_class=CustomAgentMessagePrompt,
            max_actions_per_step=max_actions_per_step,
            agent_state=_global_agent_state,
            tool_calling_method=tool_calling_method,
        )

        # Run agent with improved error handling
        try:
            history = await _global_agent.run(max_steps=max_steps)
            final_result = (
                history.final_result()
                or f"No final result. Possibly incomplete. {history}"
            )
            return final_result
        except asyncio.CancelledError:
            return "Task was cancelled"
        except Exception as e:
            logging.error(f"Agent run error: {str(e)}\n{traceback.format_exc()}")
            return f"Error during task execution: {str(e)}"

    except Exception as e:
        logging.error(f"run-browser-agent error: {str(e)}\n{traceback.format_exc()}")
        return f"Error during task execution: {str(e)}"

    finally:
        asyncio.create_task(_safe_cleanup())


def main():
    app.run()


if __name__ == "__main__":
    main()
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/utils/utils.py:
--------------------------------------------------------------------------------

```python
import base64
import os
import time
from pathlib import Path
from typing import Dict, Optional
import requests

from langchain_anthropic import ChatAnthropic
from langchain_mistralai import ChatMistralAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama import ChatOllama
from langchain_openai import AzureChatOpenAI, ChatOpenAI

from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama

def get_llm_model(provider: str, **kwargs):
    """
    Get an LLM model instance for the given provider.
    :param provider: provider name
    :param kwargs: model options such as model_name, temperature, base_url, api_key
    :return: a configured chat model
    """
    if provider not in ["ollama"]:
        env_var = (
            "GOOGLE_API_KEY" if provider == "gemini" else f"{provider.upper()}_API_KEY"
        )
        api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
        if not api_key:
            raise ValueError(f"API key for {provider} is not set")
        kwargs["api_key"] = api_key

    if provider == "anthropic":
        if not kwargs.get("base_url", ""):
            base_url = "https://api.anthropic.com"
        else:
            base_url = kwargs.get("base_url")

        return ChatAnthropic(
            model_name=kwargs.get("model_name", "claude-3-5-sonnet-20240620"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "mistral":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1")
        else:
            base_url = kwargs.get("base_url")
        if not kwargs.get("api_key", ""):
            api_key = os.getenv("MISTRAL_API_KEY", "")
        else:
            api_key = kwargs.get("api_key")

        return ChatMistralAI(
            model=kwargs.get("model_name", "mistral-large-latest"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "openai":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
        else:
            base_url = kwargs.get("base_url")

        return ChatOpenAI(
model=kwargs.get("model_name", "gpt-4o"), 68 | temperature=kwargs.get("temperature", 0.0), 69 | base_url=base_url, 70 | api_key=api_key, 71 | ) 72 | elif provider == "deepseek": 73 | if not kwargs.get("base_url", ""): 74 | base_url = os.getenv("DEEPSEEK_ENDPOINT", "") 75 | else: 76 | base_url = kwargs.get("base_url") 77 | 78 | if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner": 79 | return DeepSeekR1ChatOpenAI( 80 | model=kwargs.get("model_name", "deepseek-reasoner"), 81 | temperature=kwargs.get("temperature", 0.0), 82 | base_url=base_url, 83 | api_key=api_key, 84 | ) 85 | else: 86 | return ChatOpenAI( 87 | model=kwargs.get("model_name", "deepseek-chat"), 88 | temperature=kwargs.get("temperature", 0.0), 89 | base_url=base_url, 90 | api_key=api_key, 91 | ) 92 | elif provider == "gemini": 93 | return ChatGoogleGenerativeAI( 94 | model=kwargs.get("model_name", "gemini-2.0-flash-exp"), 95 | temperature=kwargs.get("temperature", 0.0), 96 | google_api_key=api_key, 97 | ) 98 | elif provider == "ollama": 99 | if not kwargs.get("base_url", ""): 100 | base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434") 101 | else: 102 | base_url = kwargs.get("base_url") 103 | 104 | if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"): 105 | return DeepSeekR1ChatOllama( 106 | model=kwargs.get("model_name", "deepseek-r1:14b"), 107 | temperature=kwargs.get("temperature", 0.0), 108 | num_ctx=kwargs.get("num_ctx", 32000), 109 | base_url=base_url, 110 | ) 111 | else: 112 | return ChatOllama( 113 | model=kwargs.get("model_name", "qwen2.5:7b"), 114 | temperature=kwargs.get("temperature", 0.0), 115 | num_ctx=kwargs.get("num_ctx", 32000), 116 | num_predict=kwargs.get("num_predict", 1024), 117 | base_url=base_url, 118 | ) 119 | elif provider == "azure_openai": 120 | if not kwargs.get("base_url", ""): 121 | base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "") 122 | else: 123 | base_url = kwargs.get("base_url") 124 | return AzureChatOpenAI( 125 | model=kwargs.get("model_name", "gpt-4o"), 126 | temperature=kwargs.get("temperature", 0.0), 127 | api_version="2024-05-01-preview", 128 | azure_endpoint=base_url, 129 | api_key=api_key, 130 | ) 131 | elif provider == "openrouter": 132 | if not kwargs.get("base_url", ""): 133 | base_url = os.getenv("OPENROUTER_ENDPOINT", "") 134 | else: 135 | base_url = kwargs.get("base_url") 136 | 137 | model_name = kwargs.get("model_name", "openai/o3-mini-high") 138 | if "r1" in model_name or "aion" in model_name: 139 | return DeepSeekR1ChatOpenAI( 140 | model=model_name, 141 | temperature=kwargs.get("temperature", 0.0), 142 | base_url=base_url, 143 | api_key=api_key, 144 | ) 145 | else: 146 | return ChatOpenAI( 147 | model=model_name, 148 | temperature=kwargs.get("temperature", 0.0), 149 | base_url=base_url, 150 | api_key=api_key, 151 | ) 152 | else: 153 | raise ValueError(f"Unsupported provider: {provider}") 154 | 155 | 156 | def encode_image(img_path): 157 | if not img_path: 158 | return None 159 | with open(img_path, "rb") as fin: 160 | image_data = base64.b64encode(fin.read()).decode("utf-8") 161 | return image_data 162 | 163 | 164 | def get_latest_files( 165 | directory: str, file_types: list = [".webm", ".zip"] 166 | ) -> Dict[str, Optional[str]]: 167 | """Get the latest recording and trace files""" 168 | latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types} 169 | 170 | if not os.path.exists(directory): 171 | os.makedirs(directory, exist_ok=True) 172 | return latest_files 173 | 174 | for file_type in file_types: 175 | try: 176 | matches = 
            if matches:
                latest = max(matches, key=lambda p: p.stat().st_mtime)
                # Only return files that are complete (not being written)
                if time.time() - latest.stat().st_mtime > 1.0:
                    latest_files[file_type] = str(latest)
        except Exception as e:
            print(f"Error getting latest {file_type} file: {e}")

    return latest_files


async def capture_screenshot(browser_context):
    """Capture and encode a screenshot"""
    # Extract the Playwright browser instance owned by the context
    playwright_browser = (
        browser_context.browser.playwright_browser
    )

    # Check if the browser instance is valid and if an existing context can be reused
    if playwright_browser and playwright_browser.contexts:
        playwright_context = playwright_browser.contexts[0]
    else:
        return None

    # Access pages in the context
    pages = None
    if playwright_context:
        pages = playwright_context.pages

    # Use an existing page, preferring one with a loaded URL
    if pages:
        active_page = pages[0]
        for page in pages:
            if page.url != "about:blank":
                active_page = page
    else:
        return None

    # Take screenshot
    try:
        screenshot = await active_page.screenshot(type="jpeg", quality=75, scale="css")
        encoded = base64.b64encode(screenshot).decode("utf-8")
        return encoded
    except Exception as e:
        return None
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/agent/custom_prompts.py:
--------------------------------------------------------------------------------

```python
import pdb
from typing import List, Optional

from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.views import ActionResult, ActionModel
from browser_use.browser.views import BrowserState
from langchain_core.messages import HumanMessage, SystemMessage
from datetime import datetime

from .custom_views import CustomAgentStepInfo


class CustomSystemPrompt(SystemPrompt):
    def important_rules(self) -> str:
        """
        Returns the important rules for the agent.
        """
        text = r"""
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{
    "current_state": {
        "prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions were successful as intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened, like new suggestions in an input field. Briefly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed', you should reflect on this during your thought.",
        "important_contents": "Output important contents closely related to the user's instruction on the current page. If there are none, output an empty string ''.",
        "task_progress": "Task Progress is a general summary of the current contents that have been completed. Just summarize the contents that have actually been completed based on the content at the current step and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button.
 Please return string type, not a list.",
        "future_plans": "Based on the user's request and the current state, outline the remaining steps needed to complete the task. This should be a concise list of actions yet to be performed, such as: 1. Select a date. 2. Choose a specific time slot. 3. Confirm booking. Please return string type, not a list.",
        "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of prev_action_evaluation is 'Failed', please reflect and output your reflection here.",
        "summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
    },
    "action": [
        * actions in sequences, please refer to **Common action sequences**. Each output action MUST be formatted as: \{action_name\: action_params\}*
    ]
}

2. ACTIONS: You can specify multiple actions to be executed in sequence.

   Common action sequences:
   - Form filling: [
       {"input_text": {"index": 1, "text": "username"}},
       {"input_text": {"index": 2, "text": "password"}},
       {"click_element": {"index": 3}}
     ]
   - Navigation and extraction: [
       {"go_to_url": {"url": "https://example.com"}},
       {"extract_page_content": {}}
     ]


3. ELEMENT INTERACTION:
   - Only use indexes that exist in the provided element list
   - Each element has a unique index number (e.g., "33[:]<button>")
   - Elements marked with "_[:]" are non-interactive (for context only)

4. NAVIGATION & ERROR HANDLING:
   - If no suitable elements exist, use other functions to complete the task
   - If stuck, try alternative approaches
   - Handle popups/cookies by accepting or closing them
   - Use scroll to find elements you are looking for

5. TASK COMPLETION:
   - If you think all the requirements of the user's instruction have been completed and no further operation is required, output the **Done** action to terminate the operation process.
   - Don't hallucinate actions.
   - If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
   - If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
   - Note that you must verify if you've truly fulfilled the user's request by examining the actual page content, not just by looking at the actions you output but also whether each action executed successfully. Pay particular attention when errors occur during action execution.

6. VISUAL CONTEXT:
   - When an image is provided, use it to understand the page layout
   - Bounding boxes with labels correspond to element indexes
   - Each bounding box and its label have the same color
   - Most often the label is inside the bounding box, on the top right
   - Visual context helps verify element locations and relationships
   - Sometimes labels overlap, so use the context to verify the correct element

7. Form filling:
   - If you fill an input field and your action sequence is interrupted, most often a list with suggestions popped up under the field and you need to first select the right element from the suggestion list.

8. ACTION SEQUENCING:
   - Actions are executed in the order they appear in the list
   - Each action should logically follow from the previous one
   - If the page changes after an action, the sequence is interrupted and you get the new state.
   - If content only disappears the sequence continues.
   - Only provide the action sequence until you think the page will change.
   - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
   - Only use multiple actions if it makes sense.
"""
        text += f"   - use maximum {self.max_actions_per_step} actions per sequence"
        return text

    def input_format(self) -> str:
        return """
INPUT STRUCTURE:
1. Task: The user's instructions you need to complete.
2. Hints(Optional): Some hints to help you complete the user's instructions.
3. Memory: Important contents are recorded during historical operations for use in subsequent operations.
4. Current URL: The webpage you're currently on
5. Available Tabs: List of open browser tabs
6. Interactive Elements: List in the format:
   index[:]<element_type>element_text</element_type>
   - index: Numeric identifier for interaction
   - element_type: HTML element type (button, input, etc.)
   - element_text: Visible text or element description

Example:
33[:]<button>Submit Form</button>
_[:] Non-interactive text


Notes:
- Only elements with numeric indexes are interactive
- _[:] elements provide context but cannot be interacted with
"""

    def get_system_message(self) -> SystemMessage:
        """
        Get the system prompt for the agent.

        Returns:
            str: Formatted system prompt
        """
        AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
1. Analyze the provided webpage elements and structure
2. Plan a sequence of actions to accomplish the given task
3. Your final result MUST be valid JSON in the **RESPONSE FORMAT** described, containing your action sequence and state assessment. No extra explanatory content is needed.

{self.input_format()}

{self.important_rules()}

Functions:
{self.default_action_description}

Each action in the sequence must be valid.""" 133 | return SystemMessage(content=AGENT_PROMPT) 134 | 135 | 136 | class CustomAgentMessagePrompt(AgentMessagePrompt): 137 | def __init__( 138 | self, 139 | state: BrowserState, 140 | actions: Optional[List[ActionModel]] = None, 141 | result: Optional[List[ActionResult]] = None, 142 | include_attributes: list[str] = [], 143 | max_error_length: int = 400, 144 | step_info: Optional[CustomAgentStepInfo] = None, 145 | ): 146 | super(CustomAgentMessagePrompt, self).__init__(state=state, 147 | result=result, 148 | include_attributes=include_attributes, 149 | max_error_length=max_error_length, 150 | step_info=step_info 151 | ) 152 | self.actions = actions 153 | 154 | def get_user_message(self) -> HumanMessage: 155 | if self.step_info: 156 | step_info_description = f'Current step: {self.step_info.step_number}/{self.step_info.max_steps}\n' 157 | else: 158 | step_info_description = '' 159 | 160 | time_str = datetime.now().strftime("%Y-%m-%d %H:%M") 161 | step_info_description += f"Current date and time: {time_str}" 162 | 163 | elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes) 164 | 165 | has_content_above = (self.state.pixels_above or 0) > 0 166 | has_content_below = (self.state.pixels_below or 0) > 0 167 | 168 | if elements_text != '': 169 | if has_content_above: 170 | elements_text = ( 171 | f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}' 172 | ) 173 | else: 174 | elements_text = f'[Start of page]\n{elements_text}' 175 | if has_content_below: 176 | elements_text = ( 177 | f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...' 178 | ) 179 | else: 180 | elements_text = f'{elements_text}\n[End of page]' 181 | else: 182 | elements_text = 'empty page' 183 | 184 | state_description = f""" 185 | {step_info_description} 186 | 1. Task: {self.step_info.task}. 187 | 2. Hints (Optional): 188 | {self.step_info.add_infos} 189 | 3. Memory: 190 | {self.step_info.memory} 191 | 4. Current url: {self.state.url} 192 | 5. Available tabs: 193 | {self.state.tabs} 194 | 6. 
Interactive elements: 195 | {elements_text} 196 | """ 197 | 198 | if self.actions and self.result: 199 | state_description += "\n **Previous Actions** \n" 200 | state_description += f'Previous step: {self.step_info.step_number-1}/{self.step_info.max_steps} \n' 201 | for i, result in enumerate(self.result): 202 | action = self.actions[i] 203 | state_description += f"Previous action {i + 1}/{len(self.result)}: {action.model_dump_json(exclude_unset=True)}\n" 204 | if result.include_in_memory: 205 | if result.extracted_content: 206 | state_description += f"Result of previous action {i + 1}/{len(self.result)}: {result.extracted_content}\n" 207 | if result.error: 208 | # only use the last max_error_length characters of the error 209 | error = result.error[-self.max_error_length:] 210 | state_description += ( 211 | f"Error of previous action {i + 1}/{len(self.result)}: ...{error}\n" 212 | ) 213 | 214 | if self.state.screenshot: 215 | # Format message for vision model 216 | return HumanMessage( 217 | content=[ 218 | {"type": "text", "text": state_description}, 219 | { 220 | "type": "image_url", 221 | "image_url": { 222 | "url": f"data:image/png;base64,{self.state.screenshot}" 223 | }, 224 | }, 225 | ] 226 | ) 227 | 228 | return HumanMessage(content=state_description) ``` -------------------------------------------------------------------------------- /src/mcp_server_browser_use/utils/deep_research.py: -------------------------------------------------------------------------------- ```python 1 | import pdb 2 | 3 | from dotenv import load_dotenv 4 | 5 | load_dotenv() 6 | import asyncio 7 | import os 8 | import sys 9 | import logging 10 | from pprint import pprint 11 | from uuid import uuid4 12 | from mcp_server_browser_use.utils import utils 13 | from mcp_server_browser_use.agent.custom_agent import CustomAgent 14 | import json 15 | from browser_use.agent.service import Agent 16 | from browser_use.browser.browser import BrowserConfig, Browser 17 | from langchain.schema import SystemMessage, HumanMessage 18 | from json_repair import repair_json 19 | from mcp_server_browser_use.agent.custom_prompts import ( 20 | CustomSystemPrompt, 21 | CustomAgentMessagePrompt, 22 | ) 23 | from mcp_server_browser_use.controller.custom_controller import CustomController 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | async def deep_research(task, llm, **kwargs): 29 | task_id = str(uuid4()) 30 | save_dir = kwargs.get("save_dir", f"./tmp/deep_research/{task_id}") 31 | logger.info(f"Save Deep Research at: {save_dir}") 32 | os.makedirs(save_dir, exist_ok=True) 33 | 34 | # max query num per iteration 35 | max_query_num = kwargs.get("max_query_num", 3) 36 | search_system_prompt = f""" 37 | You are a **Deep Researcher**, an AI agent specializing in in-depth information gathering and research using a web browser with **automated execution capabilities**. Your expertise lies in formulating comprehensive research plans and executing them meticulously to fulfill complex user requests. You will analyze user instructions, devise a detailed research plan, and determine the necessary search queries to gather the required information. 38 | 39 | **Your Task:** 40 | 41 | Given a user's research topic, you will: 42 | 43 | 1. **Develop a Research Plan:** Outline the key aspects and subtopics that need to be investigated to thoroughly address the user's request. This plan should be a high-level overview of the research direction. 44 | 2. 
**Generate Search Queries:** Based on your research plan, generate a list of specific search queries to be executed in a web browser. These queries should be designed to efficiently gather relevant information for each aspect of your plan. 45 | 46 | **Output Format:** 47 | 48 | Your output will be a JSON object with the following structure: 49 | 50 | ```json 51 | {{ 52 | "plan": "A concise, high-level research plan outlining the key areas to investigate.", 53 | "queries": [ 54 | "search query 1", 55 | "search query 2", 56 | //... up to a maximum of {max_query_num} search queries 57 | ] 58 | }} 59 | ``` 60 | 61 | **Important:** 62 | 63 | * Limit your output to a **maximum of {max_query_num}** search queries. 64 | * Craft search queries that help the automated agent find the needed information. Consider what keywords are most likely to lead to useful results. 65 | * If you have gathered all the information you need and no further search queries are required, output an empty queries list: `[]` 66 | * Make sure the output search queries differ from the history queries. 67 | 68 | **Inputs:** 69 | 70 | 1. **User Instruction:** The original instruction given by the user. 71 | 2. **Previous Queries:** The history of queries already executed. 72 | 3. **Previous Search Results:** Textual data gathered from prior search queries. If there are no previous search results, this string will be empty. 73 | """ 74 | search_messages = [SystemMessage(content=search_system_prompt)] 75 | 76 | record_system_prompt = """ 77 | You are an expert information recorder. Your role is to process user instructions, current search results, and previously recorded information to extract, summarize, and record new, useful information that helps fulfill the user's request. Your output will be a JSON formatted list, where each element represents a piece of extracted information and follows the structure: `{"url": "source_url", "title": "source_title", "summary_content": "concise_summary", "thinking": "reasoning"}`. 78 | 79 | **Important Considerations:** 80 | 81 | 1. **Minimize Information Loss:** While concise, prioritize retaining important details and nuances from the sources. Aim for a summary that captures the essence of the information without over-simplification. **Crucially, ensure you preserve key data and figures within the `summary_content`. This is essential for later stages, such as generating tables and reports.** 82 | 83 | 2. **Avoid Redundancy:** Do not record information that is already present in the Previous Recorded Information. Check for semantic similarity, not just exact matches. However, if the same information is expressed differently in a new source and this variation adds valuable context or clarity, it should be included. 84 | 85 | 3. **Source Information:** Extract and include the source title and URL for each piece of information summarized. This is crucial for verification and context. **The Current Search Results are provided in a specific format, where each item starts with "Title:", followed by the title, then "URL Source:", followed by the URL, and finally "Markdown Content:", followed by the content. Please extract the title and URL from this structure.** If a piece of information cannot be attributed to a specific source from the provided search results, use `"url": "unknown"` and `"title": "unknown"`. 86 | 87 | 4. **Thinking and Report Structure:** For each extracted piece of information, add a `"thinking"` key. 
This field should contain your assessment of how this information could be used in a report, which section it might belong to (e.g., introduction, background, analysis, conclusion, specific subtopics), and any other relevant thoughts about its significance or connection to other information. 88 | 89 | **Output Format:** 90 | 91 | Provide your output as a JSON formatted list. Each item in the list must adhere to the following format: 92 | 93 | ```json 94 | [ 95 | { 96 | "url": "source_url_1", 97 | "title": "source_title_1", 98 | "summary_content": "Concise summary of content. Remember to include key data and figures here.", 99 | "thinking": "This could be used in the introduction to set the context. It also relates to the section on the history of the topic." 100 | }, 101 | // ... more entries 102 | { 103 | "url": "unknown", 104 | "title": "unknown", 105 | "summary_content": "concise_summary_of_content_without_clear_source", 106 | "thinking": "This might be useful background information, but I need to verify its accuracy. Could be used in the methodology section to explain how data was collected." 107 | } 108 | ] 109 | ``` 110 | 111 | **Inputs:** 112 | 113 | 1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking. 114 | 2. **Previous Recorded Information:** Textual data gathered and recorded from previous searches and processing, represented as a single text string. 115 | 3. **Current Search Results:** Textual data gathered from the most recent search query. 116 | """ 117 | record_messages = [SystemMessage(content=record_system_prompt)] 118 | 119 | browser = Browser( 120 | config=BrowserConfig( 121 | disable_security=True, 122 | headless=kwargs.get( 123 | "headless", False 124 | ), # Set to False to see browser actions 125 | ) 126 | ) 127 | controller = CustomController() 128 | 129 | search_iteration = 0 130 | max_search_iterations = kwargs.get( 131 | "max_search_iterations", 10 132 | ) # Limit search iterations to prevent infinite loop 133 | use_vision = kwargs.get("use_vision", False) 134 | 135 | history_query = [] 136 | history_infos = [] 137 | try: 138 | while search_iteration < max_search_iterations: 139 | search_iteration += 1 140 | logger.info(f"Start search iteration {search_iteration}...") 141 | history_query_ = json.dumps(history_query, indent=4) 142 | history_infos_ = json.dumps(history_infos, indent=4) 143 | query_prompt = f"This is search {search_iteration} of {max_search_iterations} maximum searches allowed.\n User Instruction:{task} \n Previous Queries:\n {history_query_} \n Previous Search Results:\n {history_infos_}\n" 144 | search_messages.append(HumanMessage(content=query_prompt)) 145 | ai_query_msg = llm.invoke(search_messages[:1] + search_messages[1:][-1:])  # system prompt + latest human message only 146 | search_messages.append(ai_query_msg) 147 | if hasattr(ai_query_msg, "reasoning_content"): 148 | logger.info("🤯 Start Search Deep Thinking: ") 149 | logger.info(ai_query_msg.reasoning_content) 150 | logger.info("🤯 End Search Deep Thinking") 151 | ai_query_content = ai_query_msg.content.replace("```json", "").replace( 152 | "```", "" 153 | ) 154 | ai_query_content = repair_json(ai_query_content) 155 | ai_query_content = json.loads(ai_query_content) 156 | query_plan = ai_query_content["plan"] 157 | logger.info(f"Current Iteration {search_iteration} Planning:") 158 | logger.info(query_plan) 159 | query_tasks = ai_query_content["queries"] 160 | if not query_tasks: 161 | break 162 | else: 163 | 
history_query.extend(query_tasks) 164 | logger.info("Query tasks:") 165 | logger.info(query_tasks) 166 | 167 | # 2. Perform web search and auto-execute 168 | # Parallel browser-use agents 169 | add_infos = ( 170 | "1. Please click on the most relevant link to get information and go deeper, instead of just staying on the search page. \n" 171 | "2. When opening a PDF file, please remember to extract the content using extract_content instead of simply opening it for the user to view." 172 | ) 173 | agents = [ 174 | CustomAgent( 175 | task=task, 176 | llm=llm, 177 | add_infos=add_infos, 178 | browser=browser, 179 | use_vision=use_vision, 180 | system_prompt_class=CustomSystemPrompt, 181 | agent_prompt_class=CustomAgentMessagePrompt, 182 | max_actions_per_step=5, 183 | controller=controller, 184 | ) 185 | for task in query_tasks 186 | ] 187 | query_results = await asyncio.gather( 188 | *[agent.run(max_steps=kwargs.get("max_steps", 10)) for agent in agents] 189 | ) 190 | 191 | # 3. Summarize Search Result 192 | query_result_dir = os.path.join(save_dir, "query_results") 193 | os.makedirs(query_result_dir, exist_ok=True) 194 | for i in range(len(query_tasks)): 195 | query_result = query_results[i].final_result() 196 | query_save_path = os.path.join( 197 | query_result_dir, f"{search_iteration}-{i}.md" 198 | ) 199 | logger.info(f"Saving query: {query_tasks[i]} at {query_save_path}") 200 | with open(query_save_path, "w", encoding="utf-8") as fw: 201 | fw.write(f"Query: {query_tasks[i]}\n") 202 | fw.write(query_result or "")  # final_result() may return None 203 | history_infos_ = json.dumps(history_infos, indent=4) 204 | record_prompt = f"User Instruction:{task}. \nPrevious Recorded Information:\n {history_infos_} \n Current Search Results: {query_result}\n "  # history_infos_ is already a JSON string 205 | record_messages.append(HumanMessage(content=record_prompt)) 206 | ai_record_msg = llm.invoke(record_messages[:1] + record_messages[-1:]) 207 | record_messages.append(ai_record_msg) 208 | if hasattr(ai_record_msg, "reasoning_content"): 209 | logger.info("🤯 Start Record Deep Thinking: ") 210 | logger.info(ai_record_msg.reasoning_content) 211 | logger.info("🤯 End Record Deep Thinking") 212 | record_content = ai_record_msg.content 213 | record_content = repair_json(record_content) 214 | new_record_infos = json.loads(record_content) 215 | history_infos.extend(new_record_infos) 216 | 217 | logger.info("\nFinished searching, generating report...") 218 | 219 | # 4. Generate the report in Markdown 220 | writer_system_prompt = """ 221 | You are a **Deep Researcher** and a professional report writer tasked with creating polished, high-quality reports that fully meet the user's needs, based on the user's instructions and the relevant information provided. You will write the report using Markdown format, ensuring it is both informative and visually appealing. 222 | 223 | **Specific Instructions:** 224 | 225 | * **Structure for Impact:** The report must have a clear, logical, and impactful structure. Begin with a compelling introduction that immediately grabs the reader's attention. Develop well-structured body paragraphs that flow smoothly and logically, and conclude with a concise and memorable conclusion that summarizes key takeaways and leaves a lasting impression. 226 | * **Engaging and Vivid Language:** Employ precise, vivid, and descriptive language to make the report captivating and enjoyable to read. Use stylistic techniques to enhance engagement. 
Tailor your tone, vocabulary, and writing style to perfectly suit the subject matter and the intended audience to maximize impact and readability. 227 | * **Accuracy, Credibility, and Citations:** Ensure that all information presented is meticulously accurate, rigorously truthful, and robustly supported by the available data. **Cite sources exclusively using bracketed sequential numbers within the text (e.g., [1], [2], etc.). If no references are used, omit citations entirely.** These numbers must correspond to a numbered list of references at the end of the report. 228 | * **Publication-Ready Formatting:** Adhere strictly to Markdown formatting for excellent readability and a clean, highly professional visual appearance. Pay close attention to formatting details like headings, lists, emphasis, and spacing to optimize the visual presentation and reader experience. The report should be ready for immediate publication upon completion, requiring minimal to no further editing for style or format. 229 | * **Conciseness and Clarity (Unless Specified Otherwise):** When the user does not provide a specific length, prioritize concise and to-the-point writing, maximizing information density while maintaining clarity. 230 | * **Data-Driven Comparisons with Tables:** **When appropriate and beneficial for enhancing clarity and impact, present data comparisons in well-structured Markdown tables. This is especially encouraged when dealing with numerical data or when a visual comparison can significantly improve the reader's understanding.** 231 | * **Length Adherence:** When the user specifies a length constraint, meticulously stay within reasonable bounds of that specification, ensuring the content is appropriately scaled without sacrificing quality or completeness. 232 | * **Comprehensive Instruction Following:** Pay meticulous attention to all details and nuances provided in the user instructions. Strive to fulfill every aspect of the user's request with the highest degree of accuracy and attention to detail, creating a report that not only meets but exceeds expectations for quality and professionalism. 233 | * **Reference List Formatting:** The reference list at the end must be formatted as follows: 234 | `[1] Title (URL, if available)` 235 | **Each reference must be separated by a blank line to ensure proper spacing.** For example: 236 | 237 | ``` 238 | [1] Title 1 (URL1, if available) 239 | 240 | [2] Title 2 (URL2, if available) 241 | ``` 242 | **Furthermore, ensure that the reference list is free of duplicates. Each unique source should be listed only once, regardless of how many times it is cited in the text.** 243 | * **ABSOLUTE FINAL OUTPUT RESTRICTION:** **Your output must contain ONLY the finished, publication-ready Markdown report. Do not include ANY extraneous text, phrases, preambles, meta-commentary, or markdown code indicators (e.g., "```markdown```"). The report should begin directly with the title and introductory paragraph, and end directly after the conclusion and the reference list (if applicable).** **Your response will be deemed a failure if this instruction is not followed precisely.** 244 | 245 | **Inputs:** 246 | 247 | 1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking. 248 | 2. **Search Information:** Information gathered from the search queries. 
249 | """ 250 | 251 | history_infos_ = json.dumps(history_infos, indent=4) 252 | record_json_path = os.path.join(save_dir, "record_infos.json") 253 | logger.info(f"save All recorded information at {record_json_path}") 254 | with open(record_json_path, "w") as fw: 255 | json.dump(history_infos, fw, indent=4) 256 | report_prompt = ( 257 | f"User Instruction:{task} \n Search Information:\n {history_infos_}" 258 | ) 259 | report_messages = [ 260 | SystemMessage(content=writer_system_prompt), 261 | HumanMessage(content=report_prompt), 262 | ] # New context for report generation 263 | ai_report_msg = llm.invoke(report_messages) 264 | if hasattr(ai_report_msg, "reasoning_content"): 265 | logger.info("🤯 Start Report Deep Thinking: ") 266 | logger.info(ai_report_msg.reasoning_content) 267 | logger.info("🤯 End Report Deep Thinking") 268 | report_content = ai_report_msg.content 269 | 270 | report_file_path = os.path.join(save_dir, "final_report.md") 271 | with open(report_file_path, "w", encoding="utf-8") as f: 272 | f.write(report_content) 273 | logger.info(f"Save Report at: {report_file_path}") 274 | return report_content, report_file_path 275 | 276 | except Exception as e: 277 | logger.error(f"Deep research Error: {e}") 278 | return "", None 279 | finally: 280 | if browser: 281 | await browser.close() 282 | logger.info("Browser closed.") 283 | ``` -------------------------------------------------------------------------------- /src/mcp_server_browser_use/agent/custom_agent.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import logging 3 | import pdb 4 | import traceback 5 | from typing import Optional, Type, List, Dict, Any, Callable 6 | from PIL import Image, ImageDraw, ImageFont 7 | import os 8 | import base64 9 | import io 10 | import platform 11 | from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt 12 | from browser_use.agent.service import Agent 13 | from browser_use.agent.views import ( 14 | ActionResult, 15 | ActionModel, 16 | AgentHistoryList, 17 | AgentOutput, 18 | AgentHistory, 19 | ) 20 | from browser_use.browser.browser import Browser 21 | from browser_use.browser.context import BrowserContext 22 | from browser_use.browser.views import BrowserStateHistory 23 | from browser_use.controller.service import Controller 24 | from browser_use.telemetry.views import ( 25 | AgentEndTelemetryEvent, 26 | AgentRunTelemetryEvent, 27 | AgentStepTelemetryEvent, 28 | ) 29 | from browser_use.utils import time_execution_async 30 | from langchain_core.language_models.chat_models import BaseChatModel 31 | from langchain_core.messages import ( 32 | BaseMessage, 33 | ) 34 | from json_repair import repair_json 35 | from mcp_server_browser_use.utils.agent_state import AgentState 36 | 37 | from .custom_massage_manager import CustomMassageManager 38 | from .custom_views import CustomAgentOutput, CustomAgentStepInfo 39 | 40 | logger = logging.getLogger(__name__) 41 | 42 | 43 | class CustomAgent(Agent): 44 | def __init__( 45 | self, 46 | task: str, 47 | llm: BaseChatModel, 48 | add_infos: str = "", 49 | browser: Browser | None = None, 50 | browser_context: BrowserContext | None = None, 51 | controller: Controller = Controller(), 52 | use_vision: bool = True, 53 | save_conversation_path: Optional[str] = None, 54 | max_failures: int = 5, 55 | retry_delay: int = 10, 56 | system_prompt_class: Type[SystemPrompt] = SystemPrompt, 57 | agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt, 58 | max_input_tokens: int = 128000, 
59 | validate_output: bool = False, 60 | include_attributes: list[str] = [ 61 | "title", 62 | "type", 63 | "name", 64 | "role", 65 | "tabindex", 66 | "aria-label", 67 | "placeholder", 68 | "value", 69 | "alt", 70 | "aria-expanded", 71 | ], 72 | max_error_length: int = 400, 73 | max_actions_per_step: int = 10, 74 | tool_call_in_content: bool = True, 75 | agent_state: Optional[AgentState] = None, 76 | initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None, 77 | # Cloud Callbacks 78 | register_new_step_callback: ( 79 | Callable[["BrowserState", "AgentOutput", int], None] | None 80 | ) = None, 81 | register_done_callback: Callable[["AgentHistoryList"], None] | None = None, 82 | tool_calling_method: Optional[str] = "auto", 83 | ): 84 | super().__init__( 85 | task=task, 86 | llm=llm, 87 | browser=browser, 88 | browser_context=browser_context, 89 | controller=controller, 90 | use_vision=use_vision, 91 | save_conversation_path=save_conversation_path, 92 | max_failures=max_failures, 93 | retry_delay=retry_delay, 94 | system_prompt_class=system_prompt_class, 95 | max_input_tokens=max_input_tokens, 96 | validate_output=validate_output, 97 | include_attributes=include_attributes, 98 | max_error_length=max_error_length, 99 | max_actions_per_step=max_actions_per_step, 100 | tool_call_in_content=tool_call_in_content, 101 | initial_actions=initial_actions, 102 | register_new_step_callback=register_new_step_callback, 103 | register_done_callback=register_done_callback, 104 | tool_calling_method=tool_calling_method, 105 | ) 106 | if self.model_name in ["deepseek-reasoner"] or "deepseek-r1" in self.model_name: 107 | # deepseek-reasoner does not support function calling 108 | self.use_deepseek_r1 = True 109 | # deepseek-reasoner only supports a 64000-token context 110 | self.max_input_tokens = 64000 111 | else: 112 | self.use_deepseek_r1 = False 113 | 114 | # record last actions 115 | self._last_actions = None 116 | # record extracted content 117 | self.extracted_content = "" 118 | # additional task info supplied by the caller 119 | self.add_infos = add_infos 120 | # agent_state used to honor stop requests 121 | self.agent_state = agent_state 122 | self.agent_prompt_class = agent_prompt_class 123 | self.message_manager = CustomMassageManager( 124 | llm=self.llm, 125 | task=self.task, 126 | action_descriptions=self.controller.registry.get_prompt_description(), 127 | system_prompt_class=self.system_prompt_class, 128 | agent_prompt_class=agent_prompt_class, 129 | max_input_tokens=self.max_input_tokens, 130 | include_attributes=self.include_attributes, 131 | max_error_length=self.max_error_length, 132 | max_actions_per_step=self.max_actions_per_step, 133 | ) 134 | 135 | def _setup_action_models(self) -> None: 136 | """Setup dynamic action models from controller's registry""" 137 | # Get the dynamic action model from controller's registry 138 | self.ActionModel = self.controller.registry.create_action_model() 139 | # Create output model with the dynamic actions 140 | self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel) 141 | 142 | def _log_response(self, response: CustomAgentOutput) -> None: 143 | """Log the model's response""" 144 | if "Success" in response.current_state.prev_action_evaluation: 145 | emoji = "✅" 146 | elif "Failed" in response.current_state.prev_action_evaluation: 147 | emoji = "❌" 148 | else: 149 | emoji = "🤷" 150 | 151 | logger.info(f"{emoji} Eval: {response.current_state.prev_action_evaluation}") 152 | logger.info(f"🧠 New Memory: {response.current_state.important_contents}") 153 | logger.info(f"⏳ Task Progress: 
\n{response.current_state.task_progress}") 154 | logger.info(f"📋 Future Plans: \n{response.current_state.future_plans}") 155 | logger.info(f"🤔 Thought: {response.current_state.thought}") 156 | logger.info(f"🎯 Summary: {response.current_state.summary}") 157 | for i, action in enumerate(response.action): 158 | logger.info( 159 | f"🛠️ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}" 160 | ) 161 | 162 | def update_step_info( 163 | self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None 164 | ): 165 | """ 166 | update step info 167 | """ 168 | if step_info is None: 169 | return 170 | 171 | step_info.step_number += 1 172 | important_contents = model_output.current_state.important_contents 173 | if ( 174 | important_contents 175 | and "None" not in important_contents 176 | and important_contents not in step_info.memory 177 | ): 178 | step_info.memory += important_contents + "\n" 179 | 180 | task_progress = model_output.current_state.task_progress 181 | if task_progress and "None" not in task_progress: 182 | step_info.task_progress = task_progress 183 | 184 | future_plans = model_output.current_state.future_plans 185 | if future_plans and "None" not in future_plans: 186 | step_info.future_plans = future_plans 187 | 188 | @time_execution_async("--get_next_action") 189 | async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput: 190 | """Get next action from LLM based on current state""" 191 | messages_to_process = ( 192 | self.message_manager.merge_successive_human_messages(input_messages) 193 | if self.use_deepseek_r1 194 | else input_messages 195 | ) 196 | 197 | ai_message = self.llm.invoke(messages_to_process) 198 | self.message_manager._add_message_with_tokens(ai_message) 199 | 200 | if self.use_deepseek_r1: 201 | logger.info("🤯 Start Deep Thinking: ") 202 | logger.info(ai_message.reasoning_content) 203 | logger.info("🤯 End Deep Thinking") 204 | 205 | if isinstance(ai_message.content, list): 206 | ai_content = ai_message.content[0] 207 | else: 208 | ai_content = ai_message.content 209 | 210 | ai_content = ai_content.replace("```json", "").replace("```", "") 211 | ai_content = repair_json(ai_content) 212 | parsed_json = json.loads(ai_content) 213 | parsed: AgentOutput = self.AgentOutput(**parsed_json) 214 | 215 | if parsed is None: 216 | logger.debug(ai_message.content) 217 | raise ValueError("Could not parse response.") 218 | 219 | # Limit actions to maximum allowed per step 220 | parsed.action = parsed.action[: self.max_actions_per_step] 221 | self._log_response(parsed) 222 | self.n_steps += 1 223 | 224 | return parsed 225 | 226 | @time_execution_async("--step") 227 | async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None: 228 | """Execute one step of the task""" 229 | logger.info(f"\n📍 Step {self.n_steps}") 230 | state = None 231 | model_output = None 232 | result: list[ActionResult] = [] 233 | 234 | try: 235 | state = await self.browser_context.get_state(use_vision=self.use_vision) 236 | self.message_manager.add_state_message( 237 | state, self._last_actions, self._last_result, step_info 238 | ) 239 | input_messages = self.message_manager.get_messages() 240 | try: 241 | model_output = await self.get_next_action(input_messages) 242 | if self.register_new_step_callback: 243 | self.register_new_step_callback(state, model_output, self.n_steps) 244 | self.update_step_info(model_output, step_info) 245 | logger.info(f"🧠 All Memory: \n{step_info.memory}") 246 | 
self._save_conversation(input_messages, model_output) 247 | if self.model_name != "deepseek-reasoner": 248 | # remove the previous state message 249 | self.message_manager._remove_state_message_by_index(-1) 250 | except Exception as e: 251 | # model call failed, remove last state message from history 252 | self.message_manager._remove_state_message_by_index(-1) 253 | raise e 254 | 255 | actions: list[ActionModel] = model_output.action 256 | result: list[ActionResult] = await self.controller.multi_act( 257 | actions, self.browser_context 258 | ) 259 | if len(result) != len(actions): 260 | # The page likely changed mid-sequence; surface this to the LLM 261 | for ri in range(len(result), len(actions)): 262 | result.append( 263 | ActionResult( 264 | extracted_content=None, 265 | include_in_memory=True, 266 | error=f"{actions[ri].model_dump_json(exclude_unset=True)} failed to execute. \ 267 | Something new appeared after action {actions[len(result) - 1].model_dump_json(exclude_unset=True)}", 268 | is_done=False, 269 | ) 270 | ) 271 | if len(actions) == 0: 272 | # TODO: fix no action case 273 | result = [ 274 | ActionResult( 275 | is_done=True, 276 | extracted_content=step_info.memory, 277 | include_in_memory=True, 278 | ) 279 | ] 280 | for ret_ in result: 281 | if ret_.extracted_content and "Extracted page" in ret_.extracted_content: 282 | # record every extracted page 283 | self.extracted_content += ret_.extracted_content 284 | self._last_result = result 285 | self._last_actions = actions 286 | if len(result) > 0 and result[-1].is_done: 287 | if not self.extracted_content: 288 | self.extracted_content = step_info.memory 289 | result[-1].extracted_content = self.extracted_content 290 | logger.info(f"📄 Result: {result[-1].extracted_content}") 291 | 292 | self.consecutive_failures = 0 293 | 294 | except Exception as e: 295 | result = await self._handle_step_error(e) 296 | self._last_result = result 297 | 298 | finally: 299 | actions = ( 300 | [a.model_dump(exclude_unset=True) for a in model_output.action] 301 | if model_output 302 | else [] 303 | ) 304 | self.telemetry.capture( 305 | AgentStepTelemetryEvent( 306 | agent_id=self.agent_id, 307 | step=self.n_steps, 308 | actions=actions, 309 | consecutive_failures=self.consecutive_failures, 310 | step_error=( 311 | [r.error for r in result if r.error] 312 | if result 313 | else ["No result"] 314 | ), 315 | ) 316 | ) 317 | if not result: 318 | return 319 | 320 | if state: 321 | self._make_history_item(model_output, state, result) 322 | 323 | async def run(self, max_steps: int = 100) -> AgentHistoryList: 324 | """Execute the task with maximum number of steps""" 325 | try: 326 | self._log_agent_run() 327 | 328 | # Execute initial actions if provided 329 | if self.initial_actions: 330 | result = await self.controller.multi_act( 331 | self.initial_actions, 332 | self.browser_context, 333 | check_for_new_elements=False, 334 | ) 335 | self._last_result = result 336 | 337 | step_info = CustomAgentStepInfo( 338 | task=self.task, 339 | add_infos=self.add_infos, 340 | step_number=1, 341 | max_steps=max_steps, 342 | memory="", 343 | task_progress="", 344 | future_plans="", 345 | ) 346 | 347 | for step in range(max_steps): 348 | # 1) Check if stop requested 349 | if self.agent_state and self.agent_state.is_stop_requested(): 350 | logger.info("🛑 Stop requested by user") 351 | self._create_stop_history_item() 352 | break 353 | 354 | # 2) Store last valid state before step 355 | if self.browser_context and self.agent_state: 356 | state = await self.browser_context.get_state( 357 | 
use_vision=self.use_vision 358 | ) 359 | self.agent_state.set_last_valid_state(state) 360 | 361 | if self._too_many_failures(): 362 | break 363 | 364 | # 3) Do the step 365 | await self.step(step_info) 366 | 367 | if self.history.is_done(): 368 | if ( 369 | self.validate_output and step < max_steps - 1 370 | ): # if last step, we don't need to validate 371 | if not await self._validate_output(): 372 | continue 373 | 374 | logger.info("✅ Task completed successfully") 375 | break 376 | else: 377 | logger.info("❌ Failed to complete the task within the maximum number of steps") 378 | if not self.extracted_content: 379 | self.history.history[-1].result[ 380 | -1 381 | ].extracted_content = step_info.memory 382 | else: 383 | self.history.history[-1].result[ 384 | -1 385 | ].extracted_content = self.extracted_content 386 | 387 | return self.history 388 | 389 | finally: 390 | self.telemetry.capture( 391 | AgentEndTelemetryEvent( 392 | agent_id=self.agent_id, 393 | success=self.history.is_done(), 394 | steps=self.n_steps, 395 | max_steps_reached=self.n_steps >= max_steps, 396 | errors=self.history.errors(), 397 | ) 398 | ) 399 | 400 | if not self.injected_browser_context: 401 | await self.browser_context.close() 402 | 403 | if not self.injected_browser and self.browser: 404 | await self.browser.close() 405 | 406 | if self.generate_gif: 407 | output_path: str = "agent_history.gif" 408 | if isinstance(self.generate_gif, str): 409 | output_path = self.generate_gif 410 | 411 | self.create_history_gif(output_path=output_path) 412 | 413 | def _create_stop_history_item(self): 414 | """Create a history item for when the agent is stopped.""" 415 | try: 416 | # Attempt to retrieve the last valid state from agent_state 417 | state = None 418 | if self.agent_state: 419 | last_state = self.agent_state.get_last_valid_state() 420 | if last_state: 421 | # Convert to BrowserStateHistory 422 | state = BrowserStateHistory( 423 | url=getattr(last_state, "url", ""), 424 | title=getattr(last_state, "title", ""), 425 | tabs=getattr(last_state, "tabs", []), 426 | interacted_element=[None], 427 | screenshot=getattr(last_state, "screenshot", None), 428 | ) 429 | else: 430 | state = self._create_empty_state() 431 | else: 432 | state = self._create_empty_state() 433 | 434 | # Create a final item in the agent history indicating done 435 | stop_history = AgentHistory( 436 | model_output=None, 437 | state=state, 438 | result=[ActionResult(extracted_content=None, error=None, is_done=True)], 439 | ) 440 | self.history.history.append(stop_history) 441 | 442 | except Exception as e: 443 | logger.error(f"Error creating stop history item: {e}") 444 | # Create empty state as fallback 445 | state = self._create_empty_state() 446 | stop_history = AgentHistory( 447 | model_output=None, 448 | state=state, 449 | result=[ActionResult(extracted_content=None, error=None, is_done=True)], 450 | ) 451 | self.history.history.append(stop_history) 452 | 453 | def _convert_to_browser_state_history(self, browser_state): 454 | return BrowserStateHistory( 455 | url=getattr(browser_state, "url", ""), 456 | title=getattr(browser_state, "title", ""), 457 | tabs=getattr(browser_state, "tabs", []), 458 | interacted_element=[None], 459 | screenshot=getattr(browser_state, "screenshot", None), 460 | ) 461 | 462 | def _create_empty_state(self): 463 | return BrowserStateHistory( 464 | url="", title="", tabs=[], interacted_element=[None], screenshot=None 465 | ) 466 | 467 | def create_history_gif( 468 | self, 469 | output_path: str = "agent_history.gif", 470 | duration: int = 3000, 471 
| show_goals: bool = True, 472 | show_task: bool = True, 473 | show_logo: bool = False, 474 | font_size: int = 40, 475 | title_font_size: int = 56, 476 | goal_font_size: int = 44, 477 | margin: int = 40, 478 | line_spacing: float = 1.5, 479 | ) -> None: 480 | """Create a GIF from the agent's history with overlaid task and goal text.""" 481 | if not self.history.history: 482 | logger.warning("No history to create GIF from") 483 | return 484 | 485 | images = [] 486 | # if history is empty or first screenshot is None, we can't create a gif 487 | if not self.history.history or not self.history.history[0].state.screenshot: 488 | logger.warning("No history or first screenshot to create GIF from") 489 | return 490 | 491 | # Try to load nicer fonts 492 | try: 493 | # Try different font options in order of preference 494 | font_options = ["Helvetica", "Arial", "DejaVuSans", "Verdana"] 495 | font_loaded = False 496 | 497 | for font_name in font_options: 498 | try: 499 | if platform.system() == "Windows": 500 | # Need to specify the absolute font path on Windows 501 | font_name = os.path.join( 502 | os.getenv("WIN_FONT_DIR", "C:\\Windows\\Fonts"), 503 | font_name + ".ttf", 504 | ) 505 | regular_font = ImageFont.truetype(font_name, font_size) 506 | title_font = ImageFont.truetype(font_name, title_font_size) 507 | goal_font = ImageFont.truetype(font_name, goal_font_size) 508 | font_loaded = True 509 | break 510 | except OSError: 511 | continue 512 | 513 | if not font_loaded: 514 | raise OSError("No preferred fonts found") 515 | 516 | except OSError: 517 | regular_font = ImageFont.load_default() 518 | title_font = ImageFont.load_default() 519 | 520 | goal_font = regular_font 521 | 522 | # Load logo if requested 523 | logo = None 524 | if show_logo: 525 | try: 526 | logo = Image.open("./static/browser-use.png") 527 | # Resize logo to be small (150px tall, preserving aspect ratio) 528 | logo_height = 150 529 | aspect_ratio = logo.width / logo.height 530 | logo_width = int(logo_height * aspect_ratio) 531 | logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS) 532 | except Exception as e: 533 | logger.warning(f"Could not load logo: {e}") 534 | 535 | # Create task frame if requested 536 | if show_task and self.task: 537 | task_frame = self._create_task_frame( 538 | self.task, 539 | self.history.history[0].state.screenshot, 540 | title_font, 541 | regular_font, 542 | logo, 543 | line_spacing, 544 | ) 545 | images.append(task_frame) 546 | 547 | # Process each history item 548 | for i, item in enumerate(self.history.history, 1): 549 | if not item.state.screenshot: 550 | continue 551 | 552 | # Convert base64 screenshot to PIL Image 553 | img_data = base64.b64decode(item.state.screenshot) 554 | image = Image.open(io.BytesIO(img_data)) 555 | 556 | if show_goals and item.model_output: 557 | image = self._add_overlay_to_image( 558 | image=image, 559 | step_number=i, 560 | goal_text=item.model_output.current_state.thought, 561 | regular_font=regular_font, 562 | title_font=title_font, 563 | margin=margin, 564 | logo=logo, 565 | ) 566 | 567 | images.append(image) 568 | 569 | if images: 570 | # Save the GIF 571 | images[0].save( 572 | output_path, 573 | save_all=True, 574 | append_images=images[1:], 575 | duration=duration, 576 | loop=0, 577 | optimize=False, 578 | ) 579 | logger.info(f"Created GIF at {output_path}") 580 | else: 581 | logger.warning("No images found in history to create GIF") 582 | ```
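For orientation, here is a minimal usage sketch of the `deep_research` entry point shown above: it hands a LangChain chat model to `deep_research`, which plans queries, fans out `CustomAgent` instances over a shared `Browser`, and writes `final_report.md` under `save_dir`. This is a hypothetical example, not a file from this repository; the `ChatOpenAI` model name and the task string are assumptions, and any LangChain chat model exposing `.invoke()` should work, since that is all `deep_research` calls on the `llm` it receives.

```python
# Hypothetical usage sketch (not part of this repo). Assumes langchain_openai
# is installed and OPENAI_API_KEY is set; the model and task are illustrative.
import asyncio

from langchain_openai import ChatOpenAI

from mcp_server_browser_use.utils.deep_research import deep_research


async def main() -> None:
    # deep_research only calls .invoke() on the model, so any LangChain
    # BaseChatModel should work here.
    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    report_md, report_path = await deep_research(
        "Survey recent work on LLM-driven browser automation",
        llm,
        headless=True,            # run the browser without a visible window
        max_query_num=3,          # search queries generated per iteration
        max_search_iterations=4,  # upper bound on plan/search/record loops
        max_steps=10,             # per-agent browser step budget
        use_vision=False,         # skip screenshot input for text-only models
    )
    if report_path:
        print(f"Report written to {report_path}")


if __name__ == "__main__":
    asyncio.run(main())
```

Note that `deep_research` closes the `Browser` it creates in its own `finally` block, so the caller does not need to manage the browser lifecycle.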