# Directory Structure

```
├── .gitignore
├── .python-version
├── assets
│   └── web-ui.png
├── Dockerfile
├── LICENSE
├── pyproject.toml
├── README.md
├── smithery.yaml
├── src
│   └── mcp_server_browser_use
│       ├── __init__.py
│       ├── agent
│       │   ├── __init__.py
│       │   ├── custom_agent.py
│       │   ├── custom_massage_manager.py
│       │   ├── custom_prompts.py
│       │   └── custom_views.py
│       ├── browser
│       │   ├── __init__.py
│       │   ├── custom_browser.py
│       │   └── custom_context.py
│       ├── controller
│       │   ├── __init__.py
│       │   └── custom_controller.py
│       ├── server.py
│       └── utils
│           ├── __init__.py
│           ├── agent_state.py
│           ├── deep_research.py
│           ├── llm.py
│           └── utils.py
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------

```
3.11
```

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

```
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,python
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,macos,python

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### macOS Patch ###
# iCloud generated files
*.icloud

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,python

# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)

agent_history.gif
trace.json
recording.mp4
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
<img src="./assets/web-ui.png" alt="Browser Use Web UI" width="full"/>

<br/>

# browser-use MCP server
[Documentation](https://docs.browser-use.com)
[License: MIT](LICENSE)

> **Project Note**: This MCP server implementation builds upon the [browser-use/web-ui](https://github.com/browser-use/web-ui) foundation. Core browser automation logic and configuration patterns are adapted from the original project.

AI-driven browser automation server implementing the Model Context Protocol (MCP) for natural language browser control.

<a href="https://glama.ai/mcp/servers/dz6dy5hw59"><img width="380" height="200" src="https://glama.ai/mcp/servers/dz6dy5hw59/badge" alt="Browser-Use Server MCP server" /></a>

## Features

- 🧠 **MCP Integration** - Full protocol implementation for AI agent communication
- 🌐 **Browser Automation** - Page navigation, form filling, and element interaction
- 👁️ **Visual Understanding** - Screenshot analysis and vision-based interactions
- 🔄 **State Persistence** - Maintain browser sessions between tasks
- 🔌 **Multi-LLM Support** - OpenAI, Anthropic, Azure OpenAI, DeepSeek, Gemini, Mistral, Ollama, and OpenRouter integration

## Quick Start

### Prerequisites

- Python 3.11 or higher
- uv (fast Python package installer)
- Chrome/Chromium browser

### Installation

#### Claude Desktop

On macOS: `~/Library/Application\ Support/Claude/claude_desktop_config.json`
On Windows: `%APPDATA%/Claude/claude_desktop_config.json`

```json
"mcpServers": {
  "browser-use": {
    "command": "uvx",
    "args": [
      "mcp-server-browser-use"
    ],
    "env": {
      "OPENROUTER_API_KEY": "",
      "OPENROUTER_ENDPOINT": "https://openrouter.ai/api/v1",
      "OPENAI_ENDPOINT": "https://api.openai.com/v1",
      "OPENAI_API_KEY": "",
      "ANTHROPIC_ENDPOINT": "https://api.anthropic.com",
      "ANTHROPIC_API_KEY": "",
      "GOOGLE_API_KEY": "",
      "AZURE_OPENAI_ENDPOINT": "",
      "AZURE_OPENAI_API_KEY": "",
      "DEEPSEEK_ENDPOINT": "https://api.deepseek.com",
      "DEEPSEEK_API_KEY": "",
      "MISTRAL_API_KEY": "",
      "MISTRAL_ENDPOINT": "https://api.mistral.ai/v1",
      "OLLAMA_ENDPOINT": "http://localhost:11434",
      "ANONYMIZED_TELEMETRY": "true",
      "BROWSER_USE_LOGGING_LEVEL": "info",
"CHROME_PATH": "", 63 | "CHROME_USER_DATA": "", 64 | "CHROME_DEBUGGING_PORT": "9222", 65 | "CHROME_DEBUGGING_HOST": "localhost", 66 | "CHROME_PERSISTENT_SESSION": "false", 67 | "BROWSER_HEADLESS": "false", 68 | "BROWSER_DISABLE_SECURITY": "false", 69 | "BROWSER_WINDOW_WIDTH": "1280", 70 | "BROWSER_WINDOW_HEIGHT": "720", 71 | "BROWSER_TRACE_PATH": "trace.json", 72 | "BROWSER_RECORDING_PATH": "recording.mp4", 73 | "RESOLUTION": "1920x1080x24", 74 | "RESOLUTION_WIDTH": "1920", 75 | "RESOLUTION_HEIGHT": "1080", 76 | "VNC_PASSWORD": "youvncpassword", 77 | "MCP_MODEL_PROVIDER": "anthropic", 78 | "MCP_MODEL_NAME": "claude-3-5-sonnet-20241022", 79 | "MCP_TEMPERATURE": "0.3", 80 | "MCP_MAX_STEPS": "30", 81 | "MCP_USE_VISION": "true", 82 | "MCP_MAX_ACTIONS_PER_STEP": "5", 83 | "MCP_TOOL_CALL_IN_CONTENT": "true" 84 | } 85 | } 86 | ``` 87 | 88 | ### Local Development 89 | 90 | ```json 91 | "browser-use": { 92 | "command": "uv", 93 | "args": [ 94 | "--directory", 95 | "/path/to/mcp-browser-use", 96 | "run", 97 | "mcp-server-browser-use" 98 | ], 99 | "env": { 100 | ... 101 | } 102 | } 103 | ``` 104 | 105 | ## Development 106 | 107 | ```bash 108 | # Install dev dependencies 109 | uv sync 110 | 111 | # Run with debugger 112 | npx @modelcontextprotocol/inspector uv --directory . run mcp-server-browser-use 113 | ``` 114 | 115 | ## Troubleshooting 116 | 117 | - **Browser Conflicts**: Close all Chrome instances before starting. 118 | - **API Errors**: Verify API keys in environment variables match your LLM provider. 119 | - **Vision Support**: Ensure `MCP_USE_VISION=true` for screenshot analysis. 120 | 121 | ## Provider Configuration 122 | 123 | The server supports multiple LLM providers through environment variables. Here are the available options for `MCP_MODEL_PROVIDER`: 124 | 125 | | Provider | Value | Required Env Variables | 126 | |----------|--------|----------------------| 127 | | Anthropic | `anthropic` | `ANTHROPIC_API_KEY`<br>`ANTHROPIC_ENDPOINT` (optional) | 128 | | OpenAI | `openai` | `OPENAI_API_KEY`<br>`OPENAI_ENDPOINT` (optional) | 129 | | Azure OpenAI | `azure_openai` | `AZURE_OPENAI_API_KEY`<br>`AZURE_OPENAI_ENDPOINT` | 130 | | DeepSeek | `deepseek` | `DEEPSEEK_API_KEY`<br>`DEEPSEEK_ENDPOINT` (optional) | 131 | | Gemini | `gemini` | `GOOGLE_API_KEY` | 132 | | Mistral | `mistral` | `MISTRAL_API_KEY`<br>`MISTRAL_ENDPOINT` (optional) | 133 | | Ollama | `ollama` | `OLLAMA_ENDPOINT` (optional, defaults to localhost:11434) | 134 | | OpenRouter | `openrouter` | `OPENROUTER_API_KEY`<br>`OPENROUTER_ENDPOINT` (optional) | 135 | 136 | ### Notes: 137 | - For endpoints marked as optional, default values will be used if not specified 138 | - Temperature can be configured using `MCP_TEMPERATURE` (default: 0.3) 139 | - Model can be specified using `MCP_MODEL_NAME` 140 | - For Ollama models, additional context settings like `num_ctx` and `num_predict` are configurable 141 | 142 | ## Credits 143 | 144 | This project extends the [browser-use/web-ui](https://github.com/browser-use/web-ui) under MIT License. Special thanks to the original authors for their browser automation framework. 145 | 146 | ## License 147 | 148 | MIT - See [LICENSE](LICENSE) for details. 

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/agent/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/browser/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/controller/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/utils/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/__init__.py:
--------------------------------------------------------------------------------

```python
"""MCP server for browser-use"""

from mcp_server_browser_use.server import app, main


__all__ = ["app", "main"]
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/browser/custom_context.py:
--------------------------------------------------------------------------------

```python
import json
import logging
import os

from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from playwright.async_api import Browser as PlaywrightBrowser
from playwright.async_api import BrowserContext as PlaywrightBrowserContext

logger = logging.getLogger(__name__)


class CustomBrowserContext(BrowserContext):
    def __init__(
        self,
        browser: "Browser",
        config: BrowserContextConfig = BrowserContextConfig()
    ):
        super(CustomBrowserContext, self).__init__(browser=browser, config=config)
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/utils/agent_state.py:
--------------------------------------------------------------------------------

```python
import asyncio

class AgentState:
    _instance = None

    def __init__(self):
        if not hasattr(self, '_stop_requested'):
            self._stop_requested = asyncio.Event()
            self.last_valid_state = None  # store the last valid browser state

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(AgentState, cls).__new__(cls)
        return cls._instance

    def request_stop(self):
        self._stop_requested.set()

    def clear_stop(self):
        self._stop_requested.clear()
        self.last_valid_state = None

    def is_stop_requested(self):
        return self._stop_requested.is_set()

    def set_last_valid_state(self, state):
        self.last_valid_state = state

    def get_last_valid_state(self):
        return self.last_valid_state
```

--------------------------------------------------------------------------------
/smithery.yaml:
--------------------------------------------------------------------------------

```yaml
startCommand:
  type: stdio
  configSchema:
    type: object
    required:
      - OPENROUTER_API_KEY
    properties:
      OPENROUTER_API_KEY:
        type: string
        description: "The API key for OpenRouter."
      MCP_MODEL_NAME:
        type: string
        description: "The model to use on OpenRouter (default: openai/o3-mini-high)."
      BROWSER_HEADLESS:
        type: string
        description: "Set to 'true' to run the browser in headless mode (default: 'false')."
  commandFunction: |-
    (config) => ({
      command: 'mcp-server-browser-use',
      args: [],
      env: {
        OPENROUTER_API_KEY: config.OPENROUTER_API_KEY || '',
        MCP_MODEL_NAME: config.MCP_MODEL_NAME || 'openai/o3-mini-high',
        BROWSER_HEADLESS: config.BROWSER_HEADLESS || 'false',
        PORT: '8000'
      }
    })
```

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------

```dockerfile
# Use the official Python 3.13 slim image as base
FROM python:3.13-slim

# Install system dependencies: Chromium, its driver, Xvfb for headless operation, and ca-certificates
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    chromium \
    chromium-driver \
    xvfb \
    ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Copy project files into the container
COPY . /app

# Install uv (fast Python package installer) using pip
RUN pip install --upgrade pip && pip install uv

# Install project dependencies
RUN pip install .

# Expose the port that the server listens on (8000, matching the PORT set in smithery.yaml)
EXPOSE 8000

# Set environment variables as needed (adjust as necessary)
ENV BROWSER_HEADLESS="true" \
    BROWSER_USE_LOGGING_LEVEL="info"

# Install the Playwright browser binaries
RUN playwright install

# Set the entrypoint to the command that Smithery expects
ENTRYPOINT ["uv", "run", "src/mcp_server_browser_use/server.py"]

# Default command if no arguments are provided
CMD ["--help"]
```

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------

```toml
[project]
name = "mcp_server_browser_use"
version = "0.1.3"
description = "MCP server for browser-use"
readme = "README.md"
requires-python = ">=3.11"
authors = [
    { name = "Igor Tarasenko" },
    { name = "Martin Jakobsson" },
]
license = { text = "MIT" }
classifiers = [
    "Development Status :: 4 - Beta",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.11",
    "Operating System :: OS Independent",
]

dependencies = [
    "langchain>=0.3.14",
    "langchain-openai>=0.2.14",
    "pydantic>=2.10.5",
    "fastapi>=0.115.6",
    "uvicorn>=0.22.0",
    "openai>=1.59.5",
    "python-dotenv>=1.0.1",
    "pyperclip>=1.9.0",
    "langchain-ollama>=0.2.2",
    "instructor>=1.7.2",
    "json-repair>=0.35.0",
    "langchain-mistralai>=0.2.6",
    "fastmcp>=0.4.1",
    "browser-use==0.1.29",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/mcp_server_browser_use"]

[project.scripts]
mcp-server-browser-use = "mcp_server_browser_use.server:main"
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/agent/custom_views.py:
--------------------------------------------------------------------------------

```python
from dataclasses import dataclass
from typing import Type

from browser_use.agent.views import AgentOutput
from browser_use.controller.registry.views import ActionModel
from pydantic import BaseModel, ConfigDict, Field, create_model


@dataclass
class CustomAgentStepInfo:
    step_number: int
    max_steps: int
    task: str
    add_infos: str
    memory: str
    task_progress: str
    future_plans: str


class CustomAgentBrain(BaseModel):
    """Current state of the agent"""

    prev_action_evaluation: str
    important_contents: str
    task_progress: str
    future_plans: str
    thought: str
    summary: str


class CustomAgentOutput(AgentOutput):
    """Output model for agent

    @dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    current_state: CustomAgentBrain
    action: list[ActionModel]

    @staticmethod
    def type_with_custom_actions(
        custom_actions: Type[ActionModel],
    ) -> Type["CustomAgentOutput"]:
        """Extend actions with custom actions"""
        return create_model(
            "CustomAgentOutput",
            __base__=CustomAgentOutput,
            action=(
                list[custom_actions],
                Field(...),
            ),  # Properly annotated field with no default
            __module__=CustomAgentOutput.__module__,
        )
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/controller/custom_controller.py:
--------------------------------------------------------------------------------

```python
import pdb

import pyperclip
from typing import Optional, Type
from pydantic import BaseModel
from browser_use.agent.views import ActionResult
from browser_use.browser.context import BrowserContext
from browser_use.controller.service import Controller, DoneAction
from main_content_extractor import MainContentExtractor
from browser_use.controller.views import (
    ClickElementAction,
    DoneAction,
    ExtractPageContentAction,
    GoToUrlAction,
    InputTextAction,
    OpenTabAction,
    ScrollAction,
    SearchGoogleAction,
    SendKeysAction,
    SwitchTabAction,
)
import logging

logger = logging.getLogger(__name__)


class CustomController(Controller):
    def __init__(self, exclude_actions: list[str] = [],
                 output_model: Optional[Type[BaseModel]] = None
                 ):
        super().__init__(exclude_actions=exclude_actions, output_model=output_model)
        self._register_custom_actions()

    def _register_custom_actions(self):
        """Register all custom browser actions"""

        @self.registry.action("Copy text to clipboard")
        def copy_to_clipboard(text: str):
            pyperclip.copy(text)
            return ActionResult(extracted_content=text)

        @self.registry.action("Paste text from clipboard", requires_browser=True)
        async def paste_from_clipboard(browser: BrowserContext):
            text = pyperclip.paste()
            # send text to browser
            page = await browser.get_current_page()
            await page.keyboard.type(text)

            return ActionResult(extracted_content=text)

        @self.registry.action(
            'Extract page content to get the pure text or markdown with links if include_links is set to true',
            param_model=ExtractPageContentAction,
            requires_browser=True,
        )
        async def extract_content(params: ExtractPageContentAction, browser: BrowserContext):
            page = await browser.get_current_page()
            # use the Jina Reader proxy to fetch a clean rendering of the page
            url = page.url
            jina_url = f"https://r.jina.ai/{url}"
            await page.goto(jina_url)
            output_format = 'markdown' if params.include_links else 'text'
            content = MainContentExtractor.extract(  # type: ignore
                html=await page.content(),
                output_format=output_format,
            )
            # go back to the original url
            await page.go_back()
            msg = f'📄 Extracted page content as {output_format}\n: {content}\n'
            logger.info(msg)
            return ActionResult(extracted_content=msg)
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/browser/custom_browser.py:
--------------------------------------------------------------------------------

```python
import asyncio
import pdb

from playwright.async_api import Browser as PlaywrightBrowser
from playwright.async_api import (
    BrowserContext as PlaywrightBrowserContext,
)
from playwright.async_api import (
    Playwright,
    async_playwright,
)
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext, BrowserContextConfig
import logging

from .custom_context import CustomBrowserContext

logger = logging.getLogger(__name__)

class CustomBrowser(Browser):

    async def new_context(
        self,
        config: BrowserContextConfig = BrowserContextConfig()
    ) -> CustomBrowserContext:
        return CustomBrowserContext(config=config, browser=self)

    async def _setup_browser_with_instance(self, playwright: Playwright) -> PlaywrightBrowser:
        """Sets up and returns a Playwright Browser instance with anti-detection measures."""
        if not self.config.chrome_instance_path:
            raise ValueError('Chrome instance path is required')
        import subprocess

        import requests

        try:
            # Check if browser is already running
            response = requests.get('http://localhost:9222/json/version', timeout=2)
            if response.status_code == 200:
                logger.info('Reusing existing Chrome instance')
                browser = await playwright.chromium.connect_over_cdp(
                    endpoint_url='http://localhost:9222',
                    timeout=20000,  # 20 second timeout for connection
                )
                return browser
        except requests.ConnectionError:
            logger.debug('No existing Chrome instance found, starting a new one')

        # Start a new Chrome instance
        subprocess.Popen(
            [
                self.config.chrome_instance_path,
                '--remote-debugging-port=9222',
            ] + self.config.extra_chromium_args,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        # Poll the debugging endpoint in case the browser has not started yet
        for _ in range(10):
            try:
                response = requests.get('http://localhost:9222/json/version', timeout=2)
                if response.status_code == 200:
                    break
            except requests.ConnectionError:
                pass
            await asyncio.sleep(1)

        # Attempt to connect again after starting a new instance
        try:
            browser = await playwright.chromium.connect_over_cdp(
                endpoint_url='http://localhost:9222',
                timeout=20000,  # 20 second timeout for connection
            )
            return browser
        except Exception as e:
            logger.error(f'Failed to start a new Chrome instance: {str(e)}')
            raise RuntimeError(
                'To start Chrome in debug mode, close all existing Chrome instances and try again; otherwise we cannot connect to the instance.'
            )
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/utils/llm.py:
--------------------------------------------------------------------------------

```python
from openai import OpenAI
import pdb
from langchain_openai import ChatOpenAI
from langchain_core.globals import get_llm_cache
from langchain_core.language_models.base import (
    BaseLanguageModel,
    LangSmithParams,
    LanguageModelInput,
)
from langchain_core.load import dumpd, dumps
from langchain_core.messages import (
    AIMessage,
    SystemMessage,
    AnyMessage,
    BaseMessage,
    BaseMessageChunk,
    HumanMessage,
    convert_to_messages,
    message_chunk_to_message,
)
from langchain_core.outputs import (
    ChatGeneration,
    ChatGenerationChunk,
    ChatResult,
    LLMResult,
    RunInfo,
)
from langchain_ollama import ChatOllama
from langchain_core.output_parsers.base import OutputParserLike
from langchain_core.runnables import Runnable, RunnableConfig
from langchain_core.tools import BaseTool

from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    Optional,
    Union,
    cast,
)

class DeepSeekR1ChatOpenAI(ChatOpenAI):

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.client = OpenAI(
            base_url=kwargs.get("base_url"), api_key=kwargs.get("api_key")
        )

    async def ainvoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        message_history = []
        for input_ in input:
            if isinstance(input_, SystemMessage):
                message_history.append({"role": "system", "content": input_.content})
            elif isinstance(input_, AIMessage):
                message_history.append({"role": "assistant", "content": input_.content})
            else:
                message_history.append({"role": "user", "content": input_.content})

        response = self.client.chat.completions.create(
            model=self.model_name, messages=message_history
        )

        reasoning_content = response.choices[0].message.reasoning_content
        content = response.choices[0].message.content
        return AIMessage(content=content, reasoning_content=reasoning_content)

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        message_history = []
        for input_ in input:
            if isinstance(input_, SystemMessage):
                message_history.append({"role": "system", "content": input_.content})
            elif isinstance(input_, AIMessage):
                message_history.append({"role": "assistant", "content": input_.content})
            else:
                message_history.append({"role": "user", "content": input_.content})

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=message_history
        )

        reasoning_content = response.choices[0].message.reasoning_content
        content = response.choices[0].message.content
        return AIMessage(content=content, reasoning_content=reasoning_content)

class DeepSeekR1ChatOllama(ChatOllama):

    async def ainvoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        org_ai_message = await super().ainvoke(input=input)
        org_content = org_ai_message.content
        # split the DeepSeek R1 output into its <think> reasoning and the final answer
        reasoning_content = org_content.split("</think>")[0].replace("<think>", "")
        content = org_content.split("</think>")[1]
        if "**JSON Response:**" in content:
            content = content.split("**JSON Response:**")[-1]
        return AIMessage(content=content, reasoning_content=reasoning_content)

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        org_ai_message = super().invoke(input=input)
        org_content = org_ai_message.content
        reasoning_content = org_content.split("</think>")[0].replace("<think>", "")
        content = org_content.split("</think>")[1]
        if "**JSON Response:**" in content:
            content = content.split("**JSON Response:**")[-1]
        return AIMessage(content=content, reasoning_content=reasoning_content)
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/agent/custom_massage_manager.py:
--------------------------------------------------------------------------------

```python
from __future__ import annotations

import logging
from typing import List, Optional, Type

from browser_use.agent.message_manager.service import MessageManager
from browser_use.agent.message_manager.views import MessageHistory
from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.views import ActionResult, AgentStepInfo, ActionModel
from browser_use.browser.views import BrowserState
from langchain_core.language_models import BaseChatModel
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    ToolMessage
)
from langchain_openai import ChatOpenAI
from ..utils.llm import DeepSeekR1ChatOpenAI
from .custom_prompts import CustomAgentMessagePrompt

logger = logging.getLogger(__name__)


class CustomMassageManager(MessageManager):
    def __init__(
        self,
        llm: BaseChatModel,
        task: str,
        action_descriptions: str,
        system_prompt_class: Type[SystemPrompt],
        agent_prompt_class: Type[AgentMessagePrompt],
        max_input_tokens: int = 128000,
        estimated_characters_per_token: int = 3,
        image_tokens: int = 800,
        include_attributes: list[str] = [],
        max_error_length: int = 400,
        max_actions_per_step: int = 10,
        message_context: Optional[str] = None
    ):
        super().__init__(
            llm=llm,
            task=task,
            action_descriptions=action_descriptions,
            system_prompt_class=system_prompt_class,
            max_input_tokens=max_input_tokens,
            estimated_characters_per_token=estimated_characters_per_token,
            image_tokens=image_tokens,
            include_attributes=include_attributes,
            max_error_length=max_error_length,
            max_actions_per_step=max_actions_per_step,
            message_context=message_context
        )
        self.agent_prompt_class = agent_prompt_class
        # Custom: Move Task info to state_message
        self.history = MessageHistory()
        self._add_message_with_tokens(self.system_prompt)

        if self.message_context:
            context_message = HumanMessage(content=self.message_context)
            self._add_message_with_tokens(context_message)

    def cut_messages(self):
        """Trim the message history in place until it fits within max_input_tokens"""
        diff = self.history.total_tokens - self.max_input_tokens
        min_message_len = 2 if self.message_context is not None else 1

        while diff > 0 and len(self.history.messages) > min_message_len:
            self.history.remove_message(min_message_len)  # always remove the oldest message
            diff = self.history.total_tokens - self.max_input_tokens

    def add_state_message(
        self,
        state: BrowserState,
        actions: Optional[List[ActionModel]] = None,
        result: Optional[List[ActionResult]] = None,
        step_info: Optional[AgentStepInfo] = None,
    ) -> None:
        """Add browser state as human message"""
        # otherwise add state message and result to next message (which will not stay in memory)
        state_message = self.agent_prompt_class(
            state,
            actions,
            result,
            include_attributes=self.include_attributes,
            max_error_length=self.max_error_length,
            step_info=step_info,
        ).get_user_message()
        self._add_message_with_tokens(state_message)

    def _count_text_tokens(self, text: str) -> int:
        if isinstance(self.llm, (ChatOpenAI, ChatAnthropic, DeepSeekR1ChatOpenAI)):
            try:
                tokens = self.llm.get_num_tokens(text)
            except Exception:
                tokens = (
                    len(text) // self.estimated_characters_per_token
                )  # Rough estimate if no tokenizer available
        else:
            tokens = (
                len(text) // self.estimated_characters_per_token
            )  # Rough estimate if no tokenizer available
        return tokens

    def _remove_state_message_by_index(self, remove_ind=-1) -> None:
        """Remove the nth-from-last state (Human) message from history"""
        i = len(self.history.messages) - 1
        remove_cnt = 0
        while i >= 0:
            if isinstance(self.history.messages[i].message, HumanMessage):
                remove_cnt += 1
            if remove_cnt == abs(remove_ind):
                self.history.remove_message(i)
                break
            i -= 1
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/server.py:
--------------------------------------------------------------------------------

```python
import asyncio
import os
import sys
import traceback
from typing import List, Optional

import logging
logging.getLogger().addHandler(logging.NullHandler())
logging.getLogger().propagate = False

from mcp_server_browser_use.agent.custom_prompts import (
    CustomAgentMessagePrompt,
    CustomSystemPrompt,
)

from browser_use import BrowserConfig
from browser_use.browser.context import BrowserContextConfig, BrowserContextWindowSize
from fastmcp.server import FastMCP
from mcp.types import TextContent

from mcp_server_browser_use.agent.custom_agent import CustomAgent
from mcp_server_browser_use.browser.custom_browser import CustomBrowser
from mcp_server_browser_use.controller.custom_controller import CustomController
from mcp_server_browser_use.utils import utils
from mcp_server_browser_use.utils.agent_state import AgentState

# Global references for single "running agent" approach
_global_agent = None
_global_browser = None
_global_browser_context = None
_global_agent_state = AgentState()

app = FastMCP("mcp_server_browser_use")


def get_env_bool(key: str, default: bool = False) -> bool:
    """Get boolean value from environment variable."""
    return os.getenv(key, str(default)).lower() in ("true", "1", "yes")


async def _safe_cleanup():
    """Safely clean up browser resources"""
    global _global_browser, _global_agent_state, _global_browser_context, _global_agent

    try:
        if _global_agent_state:
            try:
                _global_agent_state.request_stop()  # synchronous: it just sets the stop event
            except Exception:
                pass

        if _global_browser_context:
            try:
                await _global_browser_context.close()
            except Exception:
                pass

        if _global_browser:
            try:
                await _global_browser.close()
            except Exception:
                pass

    except Exception as e:
        # Log the error, but don't re-raise
        print(f"Error during cleanup: {e}", file=sys.stderr)
    finally:
        # Reset global variables
        _global_browser = None
        _global_browser_context = None
        _global_agent_state = AgentState()
        _global_agent = None


@app.tool()
async def run_browser_agent(task: str, add_infos: str = "") -> str:
    """Handle run-browser-agent tool calls."""
    global _global_agent, _global_browser, _global_browser_context, _global_agent_state

    try:
        # Clear any previous agent stop signals
        _global_agent_state.clear_stop()

        # Get browser configuration
        headless = get_env_bool("BROWSER_HEADLESS", True)
        disable_security = get_env_bool("BROWSER_DISABLE_SECURITY", False)
        window_w = int(os.getenv("BROWSER_WINDOW_WIDTH", "1280"))
        window_h = int(os.getenv("BROWSER_WINDOW_HEIGHT", "720"))

        # Get agent configuration
        model_provider = os.getenv("MCP_MODEL_PROVIDER", "openrouter")
        model_name = os.getenv("MCP_MODEL_NAME", "openai/o3-mini-high")
        temperature = float(os.getenv("MCP_TEMPERATURE", "0.7"))
        max_steps = int(os.getenv("MCP_MAX_STEPS", "100"))
        use_vision = get_env_bool("MCP_USE_VISION", True)
        max_actions_per_step = int(os.getenv("MCP_MAX_ACTIONS_PER_STEP", "5"))
        tool_calling_method = os.getenv("MCP_TOOL_CALLING_METHOD", "auto")

        # Configure browser window size
        extra_chromium_args = [f"--window-size={window_w},{window_h}"]

        # Initialize browser if needed
        if not _global_browser:
            _global_browser = CustomBrowser(
                config=BrowserConfig(
                    headless=headless,
                    disable_security=disable_security,
                    extra_chromium_args=extra_chromium_args,
                )
            )

        # Initialize browser context if needed
        if not _global_browser_context:
            _global_browser_context = await _global_browser.new_context(
                config=BrowserContextConfig(
                    trace_path=os.getenv("BROWSER_TRACE_PATH"),
                    save_recording_path=os.getenv("BROWSER_RECORDING_PATH"),
                    no_viewport=False,
                    browser_window_size=BrowserContextWindowSize(
                        width=window_w, height=window_h
                    ),
                )
            )

        # Prepare LLM
        llm = utils.get_llm_model(
            provider=model_provider, model_name=model_name, temperature=temperature
        )

        # Create controller and agent
        controller = CustomController()
        _global_agent = CustomAgent(
            task=task,
            add_infos=add_infos,
            use_vision=use_vision,
            llm=llm,
            browser=_global_browser,
            browser_context=_global_browser_context,
            controller=controller,
            system_prompt_class=CustomSystemPrompt,
            agent_prompt_class=CustomAgentMessagePrompt,
            max_actions_per_step=max_actions_per_step,
            agent_state=_global_agent_state,
            tool_calling_method=tool_calling_method,
        )

        # Run agent with improved error handling
        try:
            history = await _global_agent.run(max_steps=max_steps)
            final_result = (
                history.final_result()
                or f"No final result. Possibly incomplete. {history}"
            )
            return final_result
        except asyncio.CancelledError:
            return "Task was cancelled"
        except Exception as e:
            logging.error(f"Agent run error: {str(e)}\n{traceback.format_exc()}")
            return f"Error during task execution: {str(e)}"

    except Exception as e:
        logging.error(f"run-browser-agent error: {str(e)}\n{traceback.format_exc()}")
        return f"Error during task execution: {str(e)}"

    finally:
        asyncio.create_task(_safe_cleanup())


def main():
    app.run()


if __name__ == "__main__":
    main()
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/utils/utils.py:
--------------------------------------------------------------------------------

```python
import base64
import os
import time
from pathlib import Path
from typing import Dict, Optional
import requests

from langchain_anthropic import ChatAnthropic
from langchain_mistralai import ChatMistralAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama import ChatOllama
from langchain_openai import AzureChatOpenAI, ChatOpenAI

from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama

def get_llm_model(provider: str, **kwargs):
    """
    Get an LLM model instance for the given provider.
    :param provider: provider name
    :param kwargs: model options such as model_name, temperature, base_url, api_key
    :return: a configured chat model
    """
    if provider not in ["ollama"]:
        env_var = (
            "GOOGLE_API_KEY" if provider == "gemini" else f"{provider.upper()}_API_KEY"
        )
        api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
        if not api_key:
            raise ValueError(f"API key for {provider} is not set")
        kwargs["api_key"] = api_key

    if provider == "anthropic":
        if not kwargs.get("base_url", ""):
            base_url = "https://api.anthropic.com"
        else:
            base_url = kwargs.get("base_url")

        return ChatAnthropic(
            model_name=kwargs.get("model_name", "claude-3-5-sonnet-20240620"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "mistral":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1")
        else:
            base_url = kwargs.get("base_url")
        if not kwargs.get("api_key", ""):
            api_key = os.getenv("MISTRAL_API_KEY", "")
        else:
            api_key = kwargs.get("api_key")

        return ChatMistralAI(
            model=kwargs.get("model_name", "mistral-large-latest"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "openai":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
        else:
            base_url = kwargs.get("base_url")

        return ChatOpenAI(
model=kwargs.get("model_name", "gpt-4o"), 68 | temperature=kwargs.get("temperature", 0.0), 69 | base_url=base_url, 70 | api_key=api_key, 71 | ) 72 | elif provider == "deepseek": 73 | if not kwargs.get("base_url", ""): 74 | base_url = os.getenv("DEEPSEEK_ENDPOINT", "") 75 | else: 76 | base_url = kwargs.get("base_url") 77 | 78 | if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner": 79 | return DeepSeekR1ChatOpenAI( 80 | model=kwargs.get("model_name", "deepseek-reasoner"), 81 | temperature=kwargs.get("temperature", 0.0), 82 | base_url=base_url, 83 | api_key=api_key, 84 | ) 85 | else: 86 | return ChatOpenAI( 87 | model=kwargs.get("model_name", "deepseek-chat"), 88 | temperature=kwargs.get("temperature", 0.0), 89 | base_url=base_url, 90 | api_key=api_key, 91 | ) 92 | elif provider == "gemini": 93 | return ChatGoogleGenerativeAI( 94 | model=kwargs.get("model_name", "gemini-2.0-flash-exp"), 95 | temperature=kwargs.get("temperature", 0.0), 96 | google_api_key=api_key, 97 | ) 98 | elif provider == "ollama": 99 | if not kwargs.get("base_url", ""): 100 | base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434") 101 | else: 102 | base_url = kwargs.get("base_url") 103 | 104 | if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"): 105 | return DeepSeekR1ChatOllama( 106 | model=kwargs.get("model_name", "deepseek-r1:14b"), 107 | temperature=kwargs.get("temperature", 0.0), 108 | num_ctx=kwargs.get("num_ctx", 32000), 109 | base_url=base_url, 110 | ) 111 | else: 112 | return ChatOllama( 113 | model=kwargs.get("model_name", "qwen2.5:7b"), 114 | temperature=kwargs.get("temperature", 0.0), 115 | num_ctx=kwargs.get("num_ctx", 32000), 116 | num_predict=kwargs.get("num_predict", 1024), 117 | base_url=base_url, 118 | ) 119 | elif provider == "azure_openai": 120 | if not kwargs.get("base_url", ""): 121 | base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "") 122 | else: 123 | base_url = kwargs.get("base_url") 124 | return AzureChatOpenAI( 125 | model=kwargs.get("model_name", "gpt-4o"), 126 | temperature=kwargs.get("temperature", 0.0), 127 | api_version="2024-05-01-preview", 128 | azure_endpoint=base_url, 129 | api_key=api_key, 130 | ) 131 | elif provider == "openrouter": 132 | if not kwargs.get("base_url", ""): 133 | base_url = os.getenv("OPENROUTER_ENDPOINT", "") 134 | else: 135 | base_url = kwargs.get("base_url") 136 | 137 | model_name = kwargs.get("model_name", "openai/o3-mini-high") 138 | if "r1" in model_name or "aion" in model_name: 139 | return DeepSeekR1ChatOpenAI( 140 | model=model_name, 141 | temperature=kwargs.get("temperature", 0.0), 142 | base_url=base_url, 143 | api_key=api_key, 144 | ) 145 | else: 146 | return ChatOpenAI( 147 | model=model_name, 148 | temperature=kwargs.get("temperature", 0.0), 149 | base_url=base_url, 150 | api_key=api_key, 151 | ) 152 | else: 153 | raise ValueError(f"Unsupported provider: {provider}") 154 | 155 | 156 | def encode_image(img_path): 157 | if not img_path: 158 | return None 159 | with open(img_path, "rb") as fin: 160 | image_data = base64.b64encode(fin.read()).decode("utf-8") 161 | return image_data 162 | 163 | 164 | def get_latest_files( 165 | directory: str, file_types: list = [".webm", ".zip"] 166 | ) -> Dict[str, Optional[str]]: 167 | """Get the latest recording and trace files""" 168 | latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types} 169 | 170 | if not os.path.exists(directory): 171 | os.makedirs(directory, exist_ok=True) 172 | return latest_files 173 | 174 | for file_type in file_types: 175 | try: 176 | matches = 
            if matches:
                latest = max(matches, key=lambda p: p.stat().st_mtime)
                # Only return files that are complete (not being written)
                if time.time() - latest.stat().st_mtime > 1.0:
                    latest_files[file_type] = str(latest)
        except Exception as e:
            print(f"Error getting latest {file_type} file: {e}")

    return latest_files


async def capture_screenshot(browser_context):
    """Capture and encode a screenshot"""
    # Extract the Playwright browser instance owned by the context
    playwright_browser = (
        browser_context.browser.playwright_browser
    )

    # Check if the browser instance is valid and if an existing context can be reused
    if playwright_browser and playwright_browser.contexts:
        playwright_context = playwright_browser.contexts[0]
    else:
        return None

    # Access pages in the context
    pages = None
    if playwright_context:
        pages = playwright_context.pages

    # Use an existing page, preferring one with a loaded URL
    if pages:
        active_page = pages[0]
        for page in pages:
            if page.url != "about:blank":
                active_page = page
    else:
        return None

    # Take screenshot
    try:
        screenshot = await active_page.screenshot(type="jpeg", quality=75, scale="css")
        encoded = base64.b64encode(screenshot).decode("utf-8")
        return encoded
    except Exception as e:
        return None
```

--------------------------------------------------------------------------------
/src/mcp_server_browser_use/agent/custom_prompts.py:
--------------------------------------------------------------------------------

```python
import pdb
from typing import List, Optional

from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.views import ActionResult, ActionModel
from browser_use.browser.views import BrowserState
from langchain_core.messages import HumanMessage, SystemMessage
from datetime import datetime

from .custom_views import CustomAgentStepInfo


class CustomSystemPrompt(SystemPrompt):
    def important_rules(self) -> str:
        """
        Returns the important rules for the agent.
        """
        text = r"""
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{
    "current_state": {
        "prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions were successful as intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened, like new suggestions in an input field. Briefly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed', you should reflect on this during your thought.",
        "important_contents": "Output important contents closely related to the user's instruction on the current page. If there are none, output an empty string ''.",
        "task_progress": "Task Progress is a general summary of the current contents that have been completed. Just summarize the contents that have actually been completed based on the content at the current step and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button.
 Please return string type, not a list.",
        "future_plans": "Based on the user's request and the current state, outline the remaining steps needed to complete the task. This should be a concise list of actions yet to be performed, such as: 1. Select a date. 2. Choose a specific time slot. 3. Confirm booking. Please return string type, not a list.",
        "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of prev_action_evaluation is 'Failed', please reflect and output your reflection here.",
        "summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
    },
    "action": [
        * actions in sequences, please refer to **Common action sequences**. Each output action MUST be formatted as: \{action_name\: action_params\}*
    ]
}

2. ACTIONS: You can specify multiple actions to be executed in sequence.

   Common action sequences:
   - Form filling: [
       {"input_text": {"index": 1, "text": "username"}},
       {"input_text": {"index": 2, "text": "password"}},
       {"click_element": {"index": 3}}
     ]
   - Navigation and extraction: [
       {"go_to_url": {"url": "https://example.com"}},
       {"extract_page_content": {}}
     ]


3. ELEMENT INTERACTION:
   - Only use indexes that exist in the provided element list
   - Each element has a unique index number (e.g., "33[:]<button>")
   - Elements marked with "_[:]" are non-interactive (for context only)

4. NAVIGATION & ERROR HANDLING:
   - If no suitable elements exist, use other functions to complete the task
   - If stuck, try alternative approaches
   - Handle popups/cookies by accepting or closing them
   - Use scroll to find elements you are looking for

5. TASK COMPLETION:
   - If you think all the requirements of the user's instruction have been completed and no further operation is required, output the **Done** action to terminate the operation process.
   - Don't hallucinate actions.
   - If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
   - If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
   - Note that you must verify if you've truly fulfilled the user's request by examining the actual page content, not just by looking at the actions you output but also whether each action executed successfully. Pay particular attention when errors occur during action execution.

6. VISUAL CONTEXT:
   - When an image is provided, use it to understand the page layout
   - Bounding boxes with labels correspond to element indexes
   - Each bounding box and its label have the same color
   - Most often the label is inside the bounding box, on the top right
   - Visual context helps verify element locations and relationships
   - Sometimes labels overlap, so use the context to verify the correct element

7. Form filling:
   - If you fill an input field and your action sequence is interrupted, most often a list with suggestions popped up under the field and you need to first select the right element from the suggestion list.

8. ACTION SEQUENCING:
   - Actions are executed in the order they appear in the list
   - Each action should logically follow from the previous one
   - If the page changes after an action, the sequence is interrupted and you get the new state.
   - If content only disappears the sequence continues.
   - Only provide the action sequence until you think the page will change.
   - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
   - Only use multiple actions if it makes sense.
"""
        text += f"   - use maximum {self.max_actions_per_step} actions per sequence"
        return text

    def input_format(self) -> str:
        return """
INPUT STRUCTURE:
1. Task: The user's instructions you need to complete.
2. Hints(Optional): Some hints to help you complete the user's instructions.
3. Memory: Important contents are recorded during historical operations for use in subsequent operations.
4. Current URL: The webpage you're currently on
5. Available Tabs: List of open browser tabs
6. Interactive Elements: List in the format:
   index[:]<element_type>element_text</element_type>
   - index: Numeric identifier for interaction
   - element_type: HTML element type (button, input, etc.)
   - element_text: Visible text or element description

Example:
33[:]<button>Submit Form</button>
_[:] Non-interactive text


Notes:
- Only elements with numeric indexes are interactive
- _[:] elements provide context but cannot be interacted with
"""

    def get_system_message(self) -> SystemMessage:
        """
        Get the system prompt for the agent.

        Returns:
            str: Formatted system prompt
        """
        AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
1. Analyze the provided webpage elements and structure
2. Plan a sequence of actions to accomplish the given task
3. Your final result MUST be valid JSON in the **RESPONSE FORMAT** described, containing your action sequence and state assessment. No extra explanatory content is needed.

{self.input_format()}

{self.important_rules()}

Functions:
{self.default_action_description}

Each action in the sequence must be valid.""" 133 | return SystemMessage(content=AGENT_PROMPT) 134 | 135 | 136 | class CustomAgentMessagePrompt(AgentMessagePrompt): 137 | def __init__( 138 | self, 139 | state: BrowserState, 140 | actions: Optional[List[ActionModel]] = None, 141 | result: Optional[List[ActionResult]] = None, 142 | include_attributes: list[str] = [], 143 | max_error_length: int = 400, 144 | step_info: Optional[CustomAgentStepInfo] = None, 145 | ): 146 | super(CustomAgentMessagePrompt, self).__init__(state=state, 147 | result=result, 148 | include_attributes=include_attributes, 149 | max_error_length=max_error_length, 150 | step_info=step_info 151 | ) 152 | self.actions = actions 153 | 154 | def get_user_message(self) -> HumanMessage: 155 | if self.step_info: 156 | step_info_description = f'Current step: {self.step_info.step_number}/{self.step_info.max_steps}\n' 157 | else: 158 | step_info_description = '' 159 | 160 | time_str = datetime.now().strftime("%Y-%m-%d %H:%M") 161 | step_info_description += f"Current date and time: {time_str}" 162 | 163 | elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes) 164 | 165 | has_content_above = (self.state.pixels_above or 0) > 0 166 | has_content_below = (self.state.pixels_below or 0) > 0 167 | 168 | if elements_text != '': 169 | if has_content_above: 170 | elements_text = ( 171 | f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}' 172 | ) 173 | else: 174 | elements_text = f'[Start of page]\n{elements_text}' 175 | if has_content_below: 176 | elements_text = ( 177 | f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...' 178 | ) 179 | else: 180 | elements_text = f'{elements_text}\n[End of page]' 181 | else: 182 | elements_text = 'empty page' 183 | 184 | state_description = f""" 185 | {step_info_description} 186 | 1. Task: {self.step_info.task}. 187 | 2. Hints (Optional): 188 | {self.step_info.add_infos} 189 | 3. Memory: 190 | {self.step_info.memory} 191 | 4. Current url: {self.state.url} 192 | 5. Available tabs: 193 | {self.state.tabs} 194 | 6. 
Interactive elements: 195 | {elements_text} 196 | """ 197 | 198 | if self.actions and self.result: 199 | state_description += "\n **Previous Actions** \n" 200 | state_description += f'Previous step: {self.step_info.step_number-1}/{self.step_info.max_steps} \n' 201 | for i, result in enumerate(self.result): 202 | action = self.actions[i] 203 | state_description += f"Previous action {i + 1}/{len(self.result)}: {action.model_dump_json(exclude_unset=True)}\n" 204 | if result.include_in_memory: 205 | if result.extracted_content: 206 | state_description += f"Result of previous action {i + 1}/{len(self.result)}: {result.extracted_content}\n" 207 | if result.error: 208 | # only use the last max_error_length characters of the error 209 | error = result.error[-self.max_error_length:] 210 | state_description += ( 211 | f"Error of previous action {i + 1}/{len(self.result)}: ...{error}\n" 212 | ) 213 | 214 | if self.state.screenshot: 215 | # Format message for vision model 216 | return HumanMessage( 217 | content=[ 218 | {"type": "text", "text": state_description}, 219 | { 220 | "type": "image_url", 221 | "image_url": { 222 | "url": f"data:image/png;base64,{self.state.screenshot}" 223 | }, 224 | }, 225 | ] 226 | ) 227 | 228 | return HumanMessage(content=state_description) ``` -------------------------------------------------------------------------------- /src/mcp_server_browser_use/utils/deep_research.py: -------------------------------------------------------------------------------- ```python 1 | import pdb 2 | 3 | from dotenv import load_dotenv 4 | 5 | load_dotenv() 6 | import asyncio 7 | import os 8 | import sys 9 | import logging 10 | from pprint import pprint 11 | from uuid import uuid4 12 | from mcp_server_browser_use.utils import utils 13 | from mcp_server_browser_use.agent.custom_agent import CustomAgent 14 | import json 15 | from browser_use.agent.service import Agent 16 | from browser_use.browser.browser import BrowserConfig, Browser 17 | from langchain.schema import SystemMessage, HumanMessage 18 | from json_repair import repair_json 19 | from mcp_server_browser_use.agent.custom_prompts import ( 20 | CustomSystemPrompt, 21 | CustomAgentMessagePrompt, 22 | ) 23 | from mcp_server_browser_use.controller.custom_controller import CustomController 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | async def deep_research(task, llm, **kwargs): 29 | task_id = str(uuid4()) 30 | save_dir = kwargs.get("save_dir", f"./tmp/deep_research/{task_id}") 31 | logger.info(f"Save Deep Research at: {save_dir}") 32 | os.makedirs(save_dir, exist_ok=True) 33 | 34 | # max query num per iteration 35 | max_query_num = kwargs.get("max_query_num", 3) 36 | search_system_prompt = f""" 37 | You are a **Deep Researcher**, an AI agent specializing in in-depth information gathering and research using a web browser with **automated execution capabilities**. Your expertise lies in formulating comprehensive research plans and executing them meticulously to fulfill complex user requests. You will analyze user instructions, devise a detailed research plan, and determine the necessary search queries to gather the required information. 38 | 39 | **Your Task:** 40 | 41 | Given a user's research topic, you will: 42 | 43 | 1. **Develop a Research Plan:** Outline the key aspects and subtopics that need to be investigated to thoroughly address the user's request. This plan should be a high-level overview of the research direction. 44 | 2. 
**Generate Search Queries:** Based on your research plan, generate a list of specific search queries to be executed in a web browser. These queries should be designed to efficiently gather relevant information for each aspect of your plan. 45 | 46 | **Output Format:** 47 | 48 | Your output will be a JSON object with the following structure: 49 | 50 | ```json 51 | {{ 52 | "plan": "A concise, high-level research plan outlining the key areas to investigate.", 53 | "queries": [ 54 | "search query 1", 55 | "search query 2", 56 | //... up to a maximum of {max_query_num} search queries 57 | ] 58 | }} 59 | ``` 60 | 61 | **Important:** 62 | 63 | * Limit your output to a **maximum of {max_query_num}** search queries. 64 | * Craft search queries that help the automated agent find the needed information. Consider what keywords are most likely to lead to useful results. 65 | * If you have gathered all the information you need and no further search queries are required, output an empty queries list: `[]` 66 | * Make sure the output search queries differ from the history queries. 67 | 68 | **Inputs:** 69 | 70 | 1. **User Instruction:** The original instruction given by the user. 71 | 2. **Previous Queries:** The history of queries already executed. 72 | 3. **Previous Search Results:** Textual data gathered from prior search queries. If there are no previous search results, this string will be empty. 73 | """ 74 | search_messages = [SystemMessage(content=search_system_prompt)] 75 | 76 | record_system_prompt = """ 77 | You are an expert information recorder. Your role is to process user instructions, current search results, and previously recorded information to extract, summarize, and record new, useful information that helps fulfill the user's request. Your output will be a JSON formatted list, where each element represents a piece of extracted information and follows the structure: `{"url": "source_url", "title": "source_title", "summary_content": "concise_summary", "thinking": "reasoning"}`. 78 | 79 | **Important Considerations:** 80 | 81 | 1. **Minimize Information Loss:** While concise, prioritize retaining important details and nuances from the sources. Aim for a summary that captures the essence of the information without over-simplification. **Crucially, ensure you preserve key data and figures within the `summary_content`. This is essential for later stages, such as generating tables and reports.** 82 | 83 | 2. **Avoid Redundancy:** Do not record information that is already present in the Previous Recorded Information. Check for semantic similarity, not just exact matches. However, if the same information is expressed differently in a new source and this variation adds valuable context or clarity, it should be included. 84 | 85 | 3. **Source Information:** Extract and include the source title and URL for each piece of information summarized. This is crucial for verification and context. **The Current Search Results are provided in a specific format, where each item starts with "Title:", followed by the title, then "URL Source:", followed by the URL, and finally "Markdown Content:", followed by the content. Please extract the title and URL from this structure.** If a piece of information cannot be attributed to a specific source from the provided search results, use `"url": "unknown"` and `"title": "unknown"`. 86 | 87 | 4. **Thinking and Report Structure:** For each extracted piece of information, add a `"thinking"` key. 
This field should contain your assessment of how this information could be used in a report, which section it might belong to (e.g., introduction, background, analysis, conclusion, specific subtopics), and any other relevant thoughts about its significance or connection to other information. 88 | 89 | **Output Format:** 90 | 91 | Provide your output as a JSON formatted list. Each item in the list must adhere to the following format: 92 | 93 | ```json 94 | [ 95 | { 96 | "url": "source_url_1", 97 | "title": "source_title_1", 98 | "summary_content": "Concise summary of content. Remember to include key data and figures here.", 99 | "thinking": "This could be used in the introduction to set the context. It also relates to the section on the history of the topic." 100 | }, 101 | // ... more entries 102 | { 103 | "url": "unknown", 104 | "title": "unknown", 105 | "summary_content": "concise_summary_of_content_without_clear_source", 106 | "thinking": "This might be useful background information, but I need to verify its accuracy. Could be used in the methodology section to explain how data was collected." 107 | } 108 | ] 109 | ``` 110 | 111 | **Inputs:** 112 | 113 | 1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking. 114 | 2. **Previous Recorded Information:** Textual data gathered and recorded from previous searches and processing, represented as a single text string. 115 | 3. **Current Search Results:** Textual data gathered from the most recent search query. 116 | """ 117 | record_messages = [SystemMessage(content=record_system_prompt)] 118 | 119 | browser = Browser( 120 | config=BrowserConfig( 121 | disable_security=True, 122 | headless=kwargs.get( 123 | "headless", False 124 | ), # Set to False to see browser actions 125 | ) 126 | ) 127 | controller = CustomController() 128 | 129 | search_iteration = 0 130 | max_search_iterations = kwargs.get( 131 | "max_search_iterations", 10 132 | ) # Limit search iterations to prevent infinite loop 133 | use_vision = kwargs.get("use_vision", False) 134 | 135 | history_query = [] 136 | history_infos = [] 137 | try: 138 | while search_iteration < max_search_iterations: 139 | search_iteration += 1 140 | logger.info(f"Start search iteration {search_iteration}...") 141 | history_query_ = json.dumps(history_query, indent=4) 142 | history_infos_ = json.dumps(history_infos, indent=4) 143 | query_prompt = f"This is search {search_iteration} of {max_search_iterations} maximum searches allowed.\n User Instruction:{task} \n Previous Queries:\n {history_query_} \n Previous Search Results:\n {history_infos_}\n" 144 | search_messages.append(HumanMessage(content=query_prompt)) 145 | ai_query_msg = llm.invoke(search_messages[:1] + search_messages[1:][-1:])  # system prompt + latest human message only 146 | search_messages.append(ai_query_msg) 147 | if hasattr(ai_query_msg, "reasoning_content"): 148 | logger.info("🤯 Start Search Deep Thinking: ") 149 | logger.info(ai_query_msg.reasoning_content) 150 | logger.info("🤯 End Search Deep Thinking") 151 | ai_query_content = ai_query_msg.content.replace("```json", "").replace( 152 | "```", "" 153 | ) 154 | ai_query_content = repair_json(ai_query_content) 155 | ai_query_content = json.loads(ai_query_content) 156 | query_plan = ai_query_content["plan"] 157 | logger.info(f"Current Iteration {search_iteration} Planning:") 158 | logger.info(query_plan) 159 | query_tasks = ai_query_content["queries"] 160 | if not query_tasks: 161 | break 162 | else: 163 | 
history_query.extend(query_tasks) 164 | logger.info("Query tasks:") 165 | logger.info(query_tasks) 166 | 167 | # 2. Perform web search and auto-execute 168 | # Parallel browser-use agents 169 | add_infos = ( 170 | "1. Please click on the most relevant link to get information and go deeper, instead of just staying on the search page. \n" 171 | "2. When opening a PDF file, please remember to extract the content using extract_content instead of simply opening it for the user to view." 172 | ) 173 | agents = [ 174 | CustomAgent( 175 | task=task, 176 | llm=llm, 177 | add_infos=add_infos, 178 | browser=browser, 179 | use_vision=use_vision, 180 | system_prompt_class=CustomSystemPrompt, 181 | agent_prompt_class=CustomAgentMessagePrompt, 182 | max_actions_per_step=5, 183 | controller=controller, 184 | ) 185 | for task in query_tasks 186 | ] 187 | query_results = await asyncio.gather( 188 | *[agent.run(max_steps=kwargs.get("max_steps", 10)) for agent in agents] 189 | ) 190 | 191 | # 3. Summarize Search Result 192 | query_result_dir = os.path.join(save_dir, "query_results") 193 | os.makedirs(query_result_dir, exist_ok=True) 194 | for i in range(len(query_tasks)): 195 | query_result = query_results[i].final_result() 196 | query_save_path = os.path.join( 197 | query_result_dir, f"{search_iteration}-{i}.md" 198 | ) 199 | logger.info(f"Saving query: {query_tasks[i]} at {query_save_path}") 200 | with open(query_save_path, "w", encoding="utf-8") as fw: 201 | fw.write(f"Query: {query_tasks[i]}\n") 202 | fw.write(query_result or "")  # final_result() may return None 203 | history_infos_ = json.dumps(history_infos, indent=4) 204 | record_prompt = f"User Instruction:{task}. \nPrevious Recorded Information:\n {history_infos_} \n Current Search Results: {query_result}\n "  # history_infos_ is already a JSON string 205 | record_messages.append(HumanMessage(content=record_prompt)) 206 | ai_record_msg = llm.invoke(record_messages[:1] + record_messages[-1:]) 207 | record_messages.append(ai_record_msg) 208 | if hasattr(ai_record_msg, "reasoning_content"): 209 | logger.info("🤯 Start Record Deep Thinking: ") 210 | logger.info(ai_record_msg.reasoning_content) 211 | logger.info("🤯 End Record Deep Thinking") 212 | record_content = ai_record_msg.content 213 | record_content = repair_json(record_content) 214 | new_record_infos = json.loads(record_content) 215 | history_infos.extend(new_record_infos) 216 | 217 | logger.info("\nFinished searching, generating report...") 218 | 219 | # 4. Generate the report in Markdown 220 | writer_system_prompt = """ 221 | You are a **Deep Researcher** and a professional report writer tasked with creating polished, high-quality reports that fully meet the user's needs, based on the user's instructions and the relevant information provided. You will write the report using Markdown format, ensuring it is both informative and visually appealing. 222 | 223 | **Specific Instructions:** 224 | 225 | * **Structure for Impact:** The report must have a clear, logical, and impactful structure. Begin with a compelling introduction that immediately grabs the reader's attention. Develop well-structured body paragraphs that flow smoothly and logically, and conclude with a concise and memorable conclusion that summarizes key takeaways and leaves a lasting impression. 226 | * **Engaging and Vivid Language:** Employ precise, vivid, and descriptive language to make the report captivating and enjoyable to read. Use stylistic techniques to enhance engagement. 
Tailor your tone, vocabulary, and writing style to perfectly suit the subject matter and the intended audience to maximize impact and readability. 227 | * **Accuracy, Credibility, and Citations:** Ensure that all information presented is meticulously accurate, rigorously truthful, and robustly supported by the available data. **Cite sources exclusively using bracketed sequential numbers within the text (e.g., [1], [2], etc.). If no references are used, omit citations entirely.** These numbers must correspond to a numbered list of references at the end of the report. 228 | * **Publication-Ready Formatting:** Adhere strictly to Markdown formatting for excellent readability and a clean, highly professional visual appearance. Pay close attention to formatting details like headings, lists, emphasis, and spacing to optimize the visual presentation and reader experience. The report should be ready for immediate publication upon completion, requiring minimal to no further editing for style or format. 229 | * **Conciseness and Clarity (Unless Specified Otherwise):** When the user does not provide a specific length, prioritize concise and to-the-point writing, maximizing information density while maintaining clarity. 230 | * **Data-Driven Comparisons with Tables:** **When appropriate and beneficial for enhancing clarity and impact, present data comparisons in well-structured Markdown tables. This is especially encouraged when dealing with numerical data or when a visual comparison can significantly improve the reader's understanding.** 231 | * **Length Adherence:** When the user specifies a length constraint, meticulously stay within reasonable bounds of that specification, ensuring the content is appropriately scaled without sacrificing quality or completeness. 232 | * **Comprehensive Instruction Following:** Pay meticulous attention to all details and nuances provided in the user instructions. Strive to fulfill every aspect of the user's request with the highest degree of accuracy and attention to detail, creating a report that not only meets but exceeds expectations for quality and professionalism. 233 | * **Reference List Formatting:** The reference list at the end must be formatted as follows: 234 | `[1] Title (URL, if available)` 235 | **Each reference must be separated by a blank line to ensure proper spacing.** For example: 236 | 237 | ``` 238 | [1] Title 1 (URL1, if available) 239 | 240 | [2] Title 2 (URL2, if available) 241 | ``` 242 | **Furthermore, ensure that the reference list is free of duplicates. Each unique source should be listed only once, regardless of how many times it is cited in the text.** 243 | * **ABSOLUTE FINAL OUTPUT RESTRICTION:** **Your output must contain ONLY the finished, publication-ready Markdown report. Do not include ANY extraneous text, phrases, preambles, meta-commentary, or markdown code indicators (e.g., "```markdown```"). The report should begin directly with the title and introductory paragraph, and end directly after the conclusion and the reference list (if applicable).** **Your response will be deemed a failure if this instruction is not followed precisely.** 244 | 245 | **Inputs:** 246 | 247 | 1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking. 248 | 2. **Search Information:** Information gathered from the search queries. 
249 | """ 250 | 251 | history_infos_ = json.dumps(history_infos, indent=4) 252 | record_json_path = os.path.join(save_dir, "record_infos.json") 253 | logger.info(f"save All recorded information at {record_json_path}") 254 | with open(record_json_path, "w") as fw: 255 | json.dump(history_infos, fw, indent=4) 256 | report_prompt = ( 257 | f"User Instruction:{task} \n Search Information:\n {history_infos_}" 258 | ) 259 | report_messages = [ 260 | SystemMessage(content=writer_system_prompt), 261 | HumanMessage(content=report_prompt), 262 | ] # New context for report generation 263 | ai_report_msg = llm.invoke(report_messages) 264 | if hasattr(ai_report_msg, "reasoning_content"): 265 | logger.info("🤯 Start Report Deep Thinking: ") 266 | logger.info(ai_report_msg.reasoning_content) 267 | logger.info("🤯 End Report Deep Thinking") 268 | report_content = ai_report_msg.content 269 | 270 | report_file_path = os.path.join(save_dir, "final_report.md") 271 | with open(report_file_path, "w", encoding="utf-8") as f: 272 | f.write(report_content) 273 | logger.info(f"Save Report at: {report_file_path}") 274 | return report_content, report_file_path 275 | 276 | except Exception as e: 277 | logger.error(f"Deep research Error: {e}") 278 | return "", None 279 | finally: 280 | if browser: 281 | await browser.close() 282 | logger.info("Browser closed.") 283 | ``` -------------------------------------------------------------------------------- /src/mcp_server_browser_use/agent/custom_agent.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import logging 3 | import pdb 4 | import traceback 5 | from typing import Optional, Type, List, Dict, Any, Callable 6 | from PIL import Image, ImageDraw, ImageFont 7 | import os 8 | import base64 9 | import io 10 | import platform 11 | from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt 12 | from browser_use.agent.service import Agent 13 | from browser_use.agent.views import ( 14 | ActionResult, 15 | ActionModel, 16 | AgentHistoryList, 17 | AgentOutput, 18 | AgentHistory, 19 | ) 20 | from browser_use.browser.browser import Browser 21 | from browser_use.browser.context import BrowserContext 22 | from browser_use.browser.views import BrowserStateHistory 23 | from browser_use.controller.service import Controller 24 | from browser_use.telemetry.views import ( 25 | AgentEndTelemetryEvent, 26 | AgentRunTelemetryEvent, 27 | AgentStepTelemetryEvent, 28 | ) 29 | from browser_use.utils import time_execution_async 30 | from langchain_core.language_models.chat_models import BaseChatModel 31 | from langchain_core.messages import ( 32 | BaseMessage, 33 | ) 34 | from json_repair import repair_json 35 | from mcp_server_browser_use.utils.agent_state import AgentState 36 | 37 | from .custom_massage_manager import CustomMassageManager 38 | from .custom_views import CustomAgentOutput, CustomAgentStepInfo 39 | 40 | logger = logging.getLogger(__name__) 41 | 42 | 43 | class CustomAgent(Agent): 44 | def __init__( 45 | self, 46 | task: str, 47 | llm: BaseChatModel, 48 | add_infos: str = "", 49 | browser: Browser | None = None, 50 | browser_context: BrowserContext | None = None, 51 | controller: Controller = Controller(), 52 | use_vision: bool = True, 53 | save_conversation_path: Optional[str] = None, 54 | max_failures: int = 5, 55 | retry_delay: int = 10, 56 | system_prompt_class: Type[SystemPrompt] = SystemPrompt, 57 | agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt, 58 | max_input_tokens: int = 128000, 
59 | validate_output: bool = False, 60 | include_attributes: list[str] = [ 61 | "title", 62 | "type", 63 | "name", 64 | "role", 65 | "tabindex", 66 | "aria-label", 67 | "placeholder", 68 | "value", 69 | "alt", 70 | "aria-expanded", 71 | ], 72 | max_error_length: int = 400, 73 | max_actions_per_step: int = 10, 74 | tool_call_in_content: bool = True, 75 | agent_state: Optional[AgentState] = None, 76 | initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None, 77 | # Cloud Callbacks 78 | register_new_step_callback: ( 79 | Callable[["BrowserState", "AgentOutput", int], None] | None 80 | ) = None, 81 | register_done_callback: Callable[["AgentHistoryList"], None] | None = None, 82 | tool_calling_method: Optional[str] = "auto", 83 | ): 84 | super().__init__( 85 | task=task, 86 | llm=llm, 87 | browser=browser, 88 | browser_context=browser_context, 89 | controller=controller, 90 | use_vision=use_vision, 91 | save_conversation_path=save_conversation_path, 92 | max_failures=max_failures, 93 | retry_delay=retry_delay, 94 | system_prompt_class=system_prompt_class, 95 | max_input_tokens=max_input_tokens, 96 | validate_output=validate_output, 97 | include_attributes=include_attributes, 98 | max_error_length=max_error_length, 99 | max_actions_per_step=max_actions_per_step, 100 | tool_call_in_content=tool_call_in_content, 101 | initial_actions=initial_actions, 102 | register_new_step_callback=register_new_step_callback, 103 | register_done_callback=register_done_callback, 104 | tool_calling_method=tool_calling_method, 105 | ) 106 | if self.model_name in ["deepseek-reasoner"] or "deepseek-r1" in self.model_name: 107 | # deepseek-reasoner does not support function calling 108 | self.use_deepseek_r1 = True 109 | # deepseek-reasoner only supports a 64000-token context 110 | self.max_input_tokens = 64000 111 | else: 112 | self.use_deepseek_r1 = False 113 | 114 | # record last actions 115 | self._last_actions = None 116 | # record extracted content 117 | self.extracted_content = "" 118 | # additional task info supplied by the caller 119 | self.add_infos = add_infos 120 | # agent_state used to honor stop requests 121 | self.agent_state = agent_state 122 | self.agent_prompt_class = agent_prompt_class 123 | self.message_manager = CustomMassageManager( 124 | llm=self.llm, 125 | task=self.task, 126 | action_descriptions=self.controller.registry.get_prompt_description(), 127 | system_prompt_class=self.system_prompt_class, 128 | agent_prompt_class=agent_prompt_class, 129 | max_input_tokens=self.max_input_tokens, 130 | include_attributes=self.include_attributes, 131 | max_error_length=self.max_error_length, 132 | max_actions_per_step=self.max_actions_per_step, 133 | ) 134 | 135 | def _setup_action_models(self) -> None: 136 | """Setup dynamic action models from controller's registry""" 137 | # Get the dynamic action model from controller's registry 138 | self.ActionModel = self.controller.registry.create_action_model() 139 | # Create output model with the dynamic actions 140 | self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel) 141 | 142 | def _log_response(self, response: CustomAgentOutput) -> None: 143 | """Log the model's response""" 144 | if "Success" in response.current_state.prev_action_evaluation: 145 | emoji = "✅" 146 | elif "Failed" in response.current_state.prev_action_evaluation: 147 | emoji = "❌" 148 | else: 149 | emoji = "🤷" 150 | 151 | logger.info(f"{emoji} Eval: {response.current_state.prev_action_evaluation}") 152 | logger.info(f"🧠 New Memory: {response.current_state.important_contents}") 153 | logger.info(f"⏳ Task Progress: 
\n{response.current_state.task_progress}") 154 | logger.info(f"📋 Future Plans: \n{response.current_state.future_plans}") 155 | logger.info(f"🤔 Thought: {response.current_state.thought}") 156 | logger.info(f"🎯 Summary: {response.current_state.summary}") 157 | for i, action in enumerate(response.action): 158 | logger.info( 159 | f"🛠️ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}" 160 | ) 161 | 162 | def update_step_info( 163 | self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None 164 | ): 165 | """ 166 | update step info 167 | """ 168 | if step_info is None: 169 | return 170 | 171 | step_info.step_number += 1 172 | important_contents = model_output.current_state.important_contents 173 | if ( 174 | important_contents 175 | and "None" not in important_contents 176 | and important_contents not in step_info.memory 177 | ): 178 | step_info.memory += important_contents + "\n" 179 | 180 | task_progress = model_output.current_state.task_progress 181 | if task_progress and "None" not in task_progress: 182 | step_info.task_progress = task_progress 183 | 184 | future_plans = model_output.current_state.future_plans 185 | if future_plans and "None" not in future_plans: 186 | step_info.future_plans = future_plans 187 | 188 | @time_execution_async("--get_next_action") 189 | async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput: 190 | """Get next action from LLM based on current state""" 191 | messages_to_process = ( 192 | self.message_manager.merge_successive_human_messages(input_messages) 193 | if self.use_deepseek_r1 194 | else input_messages 195 | ) 196 | 197 | ai_message = self.llm.invoke(messages_to_process) 198 | self.message_manager._add_message_with_tokens(ai_message) 199 | 200 | if self.use_deepseek_r1: 201 | logger.info("🤯 Start Deep Thinking: ") 202 | logger.info(ai_message.reasoning_content) 203 | logger.info("🤯 End Deep Thinking") 204 | 205 | if isinstance(ai_message.content, list): 206 | ai_content = ai_message.content[0] 207 | else: 208 | ai_content = ai_message.content 209 | 210 | ai_content = ai_content.replace("```json", "").replace("```", "") 211 | ai_content = repair_json(ai_content) 212 | parsed_json = json.loads(ai_content) 213 | parsed: AgentOutput = self.AgentOutput(**parsed_json) 214 | 215 | if parsed is None: 216 | logger.debug(ai_message.content) 217 | raise ValueError("Could not parse response.") 218 | 219 | # Limit actions to maximum allowed per step 220 | parsed.action = parsed.action[: self.max_actions_per_step] 221 | self._log_response(parsed) 222 | self.n_steps += 1 223 | 224 | return parsed 225 | 226 | @time_execution_async("--step") 227 | async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None: 228 | """Execute one step of the task""" 229 | logger.info(f"\n📍 Step {self.n_steps}") 230 | state = None 231 | model_output = None 232 | result: list[ActionResult] = [] 233 | 234 | try: 235 | state = await self.browser_context.get_state(use_vision=self.use_vision) 236 | self.message_manager.add_state_message( 237 | state, self._last_actions, self._last_result, step_info 238 | ) 239 | input_messages = self.message_manager.get_messages() 240 | try: 241 | model_output = await self.get_next_action(input_messages) 242 | if self.register_new_step_callback: 243 | self.register_new_step_callback(state, model_output, self.n_steps) 244 | self.update_step_info(model_output, step_info) 245 | logger.info(f"🧠 All Memory: \n{step_info.memory}") 246 | 
self._save_conversation(input_messages, model_output) 247 | if self.model_name != "deepseek-reasoner": 248 | # remove the previous state message 249 | self.message_manager._remove_state_message_by_index(-1) 250 | except Exception as e: 251 | # model call failed, remove last state message from history 252 | self.message_manager._remove_state_message_by_index(-1) 253 | raise e 254 | 255 | actions: list[ActionModel] = model_output.action 256 | result: list[ActionResult] = await self.controller.multi_act( 257 | actions, self.browser_context 258 | ) 259 | if len(result) != len(actions): 260 | # The page likely changed mid-sequence; surface this to the LLM 261 | for ri in range(len(result), len(actions)): 262 | result.append( 263 | ActionResult( 264 | extracted_content=None, 265 | include_in_memory=True, 266 | error=f"{actions[ri].model_dump_json(exclude_unset=True)} failed to execute. \ 267 | Something new appeared after action {actions[len(result) - 1].model_dump_json(exclude_unset=True)}", 268 | is_done=False, 269 | ) 270 | ) 271 | if len(actions) == 0: 272 | # TODO: fix no action case 273 | result = [ 274 | ActionResult( 275 | is_done=True, 276 | extracted_content=step_info.memory, 277 | include_in_memory=True, 278 | ) 279 | ] 280 | for ret_ in result: 281 | if ret_.extracted_content and "Extracted page" in ret_.extracted_content: 282 | # record every extracted page 283 | self.extracted_content += ret_.extracted_content 284 | self._last_result = result 285 | self._last_actions = actions 286 | if len(result) > 0 and result[-1].is_done: 287 | if not self.extracted_content: 288 | self.extracted_content = step_info.memory 289 | result[-1].extracted_content = self.extracted_content 290 | logger.info(f"📄 Result: {result[-1].extracted_content}") 291 | 292 | self.consecutive_failures = 0 293 | 294 | except Exception as e: 295 | result = await self._handle_step_error(e) 296 | self._last_result = result 297 | 298 | finally: 299 | actions = ( 300 | [a.model_dump(exclude_unset=True) for a in model_output.action] 301 | if model_output 302 | else [] 303 | ) 304 | self.telemetry.capture( 305 | AgentStepTelemetryEvent( 306 | agent_id=self.agent_id, 307 | step=self.n_steps, 308 | actions=actions, 309 | consecutive_failures=self.consecutive_failures, 310 | step_error=( 311 | [r.error for r in result if r.error] 312 | if result 313 | else ["No result"] 314 | ), 315 | ) 316 | ) 317 | if not result: 318 | return 319 | 320 | if state: 321 | self._make_history_item(model_output, state, result) 322 | 323 | async def run(self, max_steps: int = 100) -> AgentHistoryList: 324 | """Execute the task with maximum number of steps""" 325 | try: 326 | self._log_agent_run() 327 | 328 | # Execute initial actions if provided 329 | if self.initial_actions: 330 | result = await self.controller.multi_act( 331 | self.initial_actions, 332 | self.browser_context, 333 | check_for_new_elements=False, 334 | ) 335 | self._last_result = result 336 | 337 | step_info = CustomAgentStepInfo( 338 | task=self.task, 339 | add_infos=self.add_infos, 340 | step_number=1, 341 | max_steps=max_steps, 342 | memory="", 343 | task_progress="", 344 | future_plans="", 345 | ) 346 | 347 | for step in range(max_steps): 348 | # 1) Check if stop requested 349 | if self.agent_state and self.agent_state.is_stop_requested(): 350 | logger.info("🛑 Stop requested by user") 351 | self._create_stop_history_item() 352 | break 353 | 354 | # 2) Store last valid state before step 355 | if self.browser_context and self.agent_state: 356 | state = await self.browser_context.get_state( 357 | 
use_vision=self.use_vision 358 | ) 359 | self.agent_state.set_last_valid_state(state) 360 | 361 | if self._too_many_failures(): 362 | break 363 | 364 | # 3) Do the step 365 | await self.step(step_info) 366 | 367 | if self.history.is_done(): 368 | if ( 369 | self.validate_output and step < max_steps - 1 370 | ): # if last step, we don't need to validate 371 | if not await self._validate_output(): 372 | continue 373 | 374 | logger.info("✅ Task completed successfully") 375 | break 376 | else: 377 | logger.info("❌ Failed to complete the task within the maximum number of steps") 378 | if not self.extracted_content: 379 | self.history.history[-1].result[ 380 | -1 381 | ].extracted_content = step_info.memory 382 | else: 383 | self.history.history[-1].result[ 384 | -1 385 | ].extracted_content = self.extracted_content 386 | 387 | return self.history 388 | 389 | finally: 390 | self.telemetry.capture( 391 | AgentEndTelemetryEvent( 392 | agent_id=self.agent_id, 393 | success=self.history.is_done(), 394 | steps=self.n_steps, 395 | max_steps_reached=self.n_steps >= max_steps, 396 | errors=self.history.errors(), 397 | ) 398 | ) 399 | 400 | if not self.injected_browser_context: 401 | await self.browser_context.close() 402 | 403 | if not self.injected_browser and self.browser: 404 | await self.browser.close() 405 | 406 | if self.generate_gif: 407 | output_path: str = "agent_history.gif" 408 | if isinstance(self.generate_gif, str): 409 | output_path = self.generate_gif 410 | 411 | self.create_history_gif(output_path=output_path) 412 | 413 | def _create_stop_history_item(self): 414 | """Create a history item for when the agent is stopped.""" 415 | try: 416 | # Attempt to retrieve the last valid state from agent_state 417 | state = None 418 | if self.agent_state: 419 | last_state = self.agent_state.get_last_valid_state() 420 | if last_state: 421 | # Convert to BrowserStateHistory 422 | state = BrowserStateHistory( 423 | url=getattr(last_state, "url", ""), 424 | title=getattr(last_state, "title", ""), 425 | tabs=getattr(last_state, "tabs", []), 426 | interacted_element=[None], 427 | screenshot=getattr(last_state, "screenshot", None), 428 | ) 429 | else: 430 | state = self._create_empty_state() 431 | else: 432 | state = self._create_empty_state() 433 | 434 | # Create a final item in the agent history indicating done 435 | stop_history = AgentHistory( 436 | model_output=None, 437 | state=state, 438 | result=[ActionResult(extracted_content=None, error=None, is_done=True)], 439 | ) 440 | self.history.history.append(stop_history) 441 | 442 | except Exception as e: 443 | logger.error(f"Error creating stop history item: {e}") 444 | # Create empty state as fallback 445 | state = self._create_empty_state() 446 | stop_history = AgentHistory( 447 | model_output=None, 448 | state=state, 449 | result=[ActionResult(extracted_content=None, error=None, is_done=True)], 450 | ) 451 | self.history.history.append(stop_history) 452 | 453 | def _convert_to_browser_state_history(self, browser_state): 454 | return BrowserStateHistory( 455 | url=getattr(browser_state, "url", ""), 456 | title=getattr(browser_state, "title", ""), 457 | tabs=getattr(browser_state, "tabs", []), 458 | interacted_element=[None], 459 | screenshot=getattr(browser_state, "screenshot", None), 460 | ) 461 | 462 | def _create_empty_state(self): 463 | return BrowserStateHistory( 464 | url="", title="", tabs=[], interacted_element=[None], screenshot=None 465 | ) 466 | 467 | def create_history_gif( 468 | self, 469 | output_path: str = "agent_history.gif", 470 | duration: int = 3000, 471 
| show_goals: bool = True, 472 | show_task: bool = True, 473 | show_logo: bool = False, 474 | font_size: int = 40, 475 | title_font_size: int = 56, 476 | goal_font_size: int = 44, 477 | margin: int = 40, 478 | line_spacing: float = 1.5, 479 | ) -> None: 480 | """Create a GIF from the agent's history with overlaid task and goal text.""" 481 | if not self.history.history: 482 | logger.warning("No history to create GIF from") 483 | return 484 | 485 | images = [] 486 | # if history is empty or first screenshot is None, we can't create a gif 487 | if not self.history.history or not self.history.history[0].state.screenshot: 488 | logger.warning("No history or first screenshot to create GIF from") 489 | return 490 | 491 | # Try to load nicer fonts 492 | try: 493 | # Try different font options in order of preference 494 | font_options = ["Helvetica", "Arial", "DejaVuSans", "Verdana"] 495 | font_loaded = False 496 | 497 | for font_name in font_options: 498 | try: 499 | if platform.system() == "Windows": 500 | # Need to specify the absolute font path on Windows 501 | font_name = os.path.join( 502 | os.getenv("WIN_FONT_DIR", "C:\\Windows\\Fonts"), 503 | font_name + ".ttf", 504 | ) 505 | regular_font = ImageFont.truetype(font_name, font_size) 506 | title_font = ImageFont.truetype(font_name, title_font_size) 507 | goal_font = ImageFont.truetype(font_name, goal_font_size) 508 | font_loaded = True 509 | break 510 | except OSError: 511 | continue 512 | 513 | if not font_loaded: 514 | raise OSError("No preferred fonts found") 515 | 516 | except OSError: 517 | regular_font = ImageFont.load_default() 518 | title_font = ImageFont.load_default() 519 | 520 | goal_font = regular_font 521 | 522 | # Load logo if requested 523 | logo = None 524 | if show_logo: 525 | try: 526 | logo = Image.open("./static/browser-use.png") 527 | # Resize logo to be small (150px tall, preserving aspect ratio) 528 | logo_height = 150 529 | aspect_ratio = logo.width / logo.height 530 | logo_width = int(logo_height * aspect_ratio) 531 | logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS) 532 | except Exception as e: 533 | logger.warning(f"Could not load logo: {e}") 534 | 535 | # Create task frame if requested 536 | if show_task and self.task: 537 | task_frame = self._create_task_frame( 538 | self.task, 539 | self.history.history[0].state.screenshot, 540 | title_font, 541 | regular_font, 542 | logo, 543 | line_spacing, 544 | ) 545 | images.append(task_frame) 546 | 547 | # Process each history item 548 | for i, item in enumerate(self.history.history, 1): 549 | if not item.state.screenshot: 550 | continue 551 | 552 | # Convert base64 screenshot to PIL Image 553 | img_data = base64.b64decode(item.state.screenshot) 554 | image = Image.open(io.BytesIO(img_data)) 555 | 556 | if show_goals and item.model_output: 557 | image = self._add_overlay_to_image( 558 | image=image, 559 | step_number=i, 560 | goal_text=item.model_output.current_state.thought, 561 | regular_font=regular_font, 562 | title_font=title_font, 563 | margin=margin, 564 | logo=logo, 565 | ) 566 | 567 | images.append(image) 568 | 569 | if images: 570 | # Save the GIF 571 | images[0].save( 572 | output_path, 573 | save_all=True, 574 | append_images=images[1:], 575 | duration=duration, 576 | loop=0, 577 | optimize=False, 578 | ) 579 | logger.info(f"Created GIF at {output_path}") 580 | else: 581 | logger.warning("No images found in history to create GIF") 582 | ```
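For orientation, here is a minimal usage sketch of the `deep_research` entry point shown above: it hands a LangChain chat model to `deep_research`, which plans queries, fans out `CustomAgent` instances over a shared `Browser`, and writes `final_report.md` under `save_dir`. This is a hypothetical example, not a file from this repository; the `ChatOpenAI` model name and the task string are assumptions, and any LangChain chat model exposing `.invoke()` should work, since that is all `deep_research` calls on the `llm` it receives.

```python
# Hypothetical usage sketch (not part of this repo). Assumes langchain_openai
# is installed and OPENAI_API_KEY is set; the model and task are illustrative.
import asyncio

from langchain_openai import ChatOpenAI

from mcp_server_browser_use.utils.deep_research import deep_research


async def main() -> None:
    # deep_research only calls .invoke() on the model, so any LangChain
    # BaseChatModel should work here.
    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    report_md, report_path = await deep_research(
        "Survey recent work on LLM-driven browser automation",
        llm,
        headless=True,            # run the browser without a visible window
        max_query_num=3,          # search queries generated per iteration
        max_search_iterations=4,  # upper bound on plan/search/record loops
        max_steps=10,             # per-agent browser step budget
        use_vision=False,         # skip screenshot input for text-only models
    )
    if report_path:
        print(f"Report written to {report_path}")


if __name__ == "__main__":
    asyncio.run(main())
```

Note that `deep_research` closes the `Browser` it creates in its own `finally` block, so the caller does not need to manage the browser lifecycle.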