This is page 1 of 2. Use http://codebase.md/saik0s/mcp-browser-use?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .env.example
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── assets
│ └── header.png
├── CLAUDE.md
├── LICENSE
├── pyproject.toml
├── README.md
├── src
│ └── mcp_server_browser_use
│ ├── __init__.py
│ ├── __main__.py
│ ├── _internal
│ │ ├── __init__.py
│ │ ├── agent
│ │ │ ├── __init__.py
│ │ │ ├── browser_use
│ │ │ │ └── browser_use_agent.py
│ │ │ └── deep_research
│ │ │ └── deep_research_agent.py
│ │ ├── browser
│ │ │ ├── __init__.py
│ │ │ ├── custom_browser.py
│ │ │ └── custom_context.py
│ │ ├── controller
│ │ │ ├── __init__.py
│ │ │ └── custom_controller.py
│ │ └── utils
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── llm_provider.py
│ │ ├── mcp_client.py
│ │ └── utils.py
│ ├── cli.py
│ ├── config.py
│ └── server.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
```
1 | 3.11
2 |
```
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
```yaml
1 | fail_fast: true
2 |
3 | repos:
4 | - repo: https://github.com/pre-commit/mirrors-prettier
5 | rev: v3.1.0
6 | hooks:
7 | - id: prettier
8 | types_or: [yaml, json5]
9 |
10 | # - repo: https://github.com/astral-sh/ruff-pre-commit
11 | # rev: v0.8.1
12 | # hooks:
13 | # - id: ruff-format
14 | # - id: ruff
15 | # args: [--fix, --exit-non-zero-on-fix]
16 |
17 | - repo: local
18 | hooks:
19 | - id: uv-lock-check
20 | name: Check uv.lock is up to date
21 | entry: uv lock --check
22 | language: system
23 | files: ^(pyproject\.toml|uv\.lock)$
24 | pass_filenames: false
25 |
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
1 | # File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
2 | # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,python
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,macos,python
4 |
5 | ### macOS ###
6 | # General
7 | .DS_Store
8 | .AppleDouble
9 | .LSOverride
10 |
11 | # Icon must end with two \r
12 | Icon
13 |
14 |
15 | # Thumbnails
16 | ._*
17 |
18 | # Files that might appear in the root of a volume
19 | .DocumentRevisions-V100
20 | .fseventsd
21 | .Spotlight-V100
22 | .TemporaryItems
23 | .Trashes
24 | .VolumeIcon.icns
25 | .com.apple.timemachine.donotpresent
26 |
27 | # Directories potentially created on remote AFP share
28 | .AppleDB
29 | .AppleDesktop
30 | Network Trash Folder
31 | Temporary Items
32 | .apdisk
33 |
34 | ### macOS Patch ###
35 | # iCloud generated files
36 | *.icloud
37 |
38 | ### Python ###
39 | # Byte-compiled / optimized / DLL files
40 | __pycache__/
41 | *.py[cod]
42 | *$py.class
43 |
44 | # C extensions
45 | *.so
46 |
47 | # Distribution / packaging
48 | .Python
49 | build/
50 | develop-eggs/
51 | dist/
52 | downloads/
53 | eggs/
54 | .eggs/
55 | lib/
56 | lib64/
57 | parts/
58 | sdist/
59 | var/
60 | wheels/
61 | share/python-wheels/
62 | *.egg-info/
63 | .installed.cfg
64 | *.egg
65 | MANIFEST
66 |
67 | # PyInstaller
68 | # Usually these files are written by a python script from a template
69 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
70 | *.manifest
71 | *.spec
72 |
73 | # Installer logs
74 | pip-log.txt
75 | pip-delete-this-directory.txt
76 |
77 | # Unit test / coverage reports
78 | htmlcov/
79 | .tox/
80 | .nox/
81 | .coverage
82 | .coverage.*
83 | .cache
84 | nosetests.xml
85 | coverage.xml
86 | *.cover
87 | *.py,cover
88 | .hypothesis/
89 | .pytest_cache/
90 | cover/
91 |
92 | # Translations
93 | *.mo
94 | *.pot
95 |
96 | # Django stuff:
97 | *.log
98 | local_settings.py
99 | db.sqlite3
100 | db.sqlite3-journal
101 |
102 | # Flask stuff:
103 | instance/
104 | .webassets-cache
105 |
106 | # Scrapy stuff:
107 | .scrapy
108 |
109 | # Sphinx documentation
110 | docs/_build/
111 |
112 | # PyBuilder
113 | .pybuilder/
114 | target/
115 |
116 | # Jupyter Notebook
117 | .ipynb_checkpoints
118 |
119 | # IPython
120 | profile_default/
121 | ipython_config.py
122 |
123 | # pyenv
124 | # For a library or package, you might want to ignore these files since the code is
125 | # intended to run in multiple environments; otherwise, check them in:
126 | # .python-version
127 |
128 | # pipenv
129 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
130 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
131 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
132 | # install all needed dependencies.
133 | #Pipfile.lock
134 |
135 | # poetry
136 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
137 | # This is especially recommended for binary packages to ensure reproducibility, and is more
138 | # commonly ignored for libraries.
139 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
140 | #poetry.lock
141 |
142 | # pdm
143 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
144 | #pdm.lock
145 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
146 | # in version control.
147 | # https://pdm.fming.dev/#use-with-ide
148 | .pdm.toml
149 |
150 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
151 | __pypackages__/
152 |
153 | # Celery stuff
154 | celerybeat-schedule
155 | celerybeat.pid
156 |
157 | # SageMath parsed files
158 | *.sage.py
159 |
160 | # Environments
161 | .env
162 | .venv
163 | env/
164 | venv/
165 | ENV/
166 | env.bak/
167 | venv.bak/
168 |
169 | # Spyder project settings
170 | .spyderproject
171 | .spyproject
172 |
173 | # Rope project settings
174 | .ropeproject
175 |
176 | # mkdocs documentation
177 | /site
178 |
179 | # mypy
180 | .mypy_cache/
181 | .dmypy.json
182 | dmypy.json
183 |
184 | # Pyre type checker
185 | .pyre/
186 |
187 | # pytype static type analyzer
188 | .pytype/
189 |
190 | # Cython debug symbols
191 | cython_debug/
192 |
193 | # PyCharm
194 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
195 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
196 | # and can be added to the global gitignore or merged into this file. For a more nuclear
197 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
198 | #.idea/
199 |
200 | ### Python Patch ###
201 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
202 | poetry.toml
203 |
204 | # ruff
205 | .ruff_cache/
206 |
207 | # LSP config files
208 | pyrightconfig.json
209 |
210 | ### VisualStudioCode ###
211 | .vscode/*
212 | !.vscode/settings.json
213 | !.vscode/tasks.json
214 | !.vscode/launch.json
215 | !.vscode/extensions.json
216 | !.vscode/*.code-snippets
217 |
218 | # Local History for Visual Studio Code
219 | .history/
220 |
221 | # Built Visual Studio Code Extensions
222 | *.vsix
223 |
224 | ### VisualStudioCode Patch ###
225 | # Ignore all local history of files
226 | .history
227 | .ionide
228 |
229 | # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,python
230 |
231 | # Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)
232 |
233 | agent_history.gif
234 | trace.json
235 | recording.mp4
236 | temp/
237 | tmp/
238 | .vscode/
239 |
```
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
```
1 | # This is an example .env file. Copy it to .env and fill in your values.
2 | # Lines starting with # are comments.
3 |
4 | # === Main LLM Configuration (MCP_LLM_*) ===
5 | # Select the primary LLM provider
6 | # Options: openai, azure_openai, anthropic, google, mistral, ollama, deepseek, openrouter, alibaba, moonshot, unbound
7 | MCP_LLM_PROVIDER=google
8 | # Specify the model name for the selected provider
9 | MCP_LLM_MODEL_NAME=gemini-2.5-flash-preview-04-17
10 | # LLM temperature (0.0-2.0). Controls randomness.
11 | MCP_LLM_TEMPERATURE=0.0
12 | # Optional: Generic override for the LLM provider's base URL
13 | # MCP_LLM_BASE_URL=
14 | # Optional: Generic override for the LLM provider's API key. Takes precedence over provider-specific keys.
15 | # MCP_LLM_API_KEY=
16 |
17 | # --- Provider Specific API Keys (MCP_LLM_*) ---
18 | # Required unless using Ollama locally without auth or generic MCP_LLM_API_KEY is set
19 | # MCP_LLM_OPENAI_API_KEY=YOUR_OPENAI_API_KEY
20 | # MCP_LLM_ANTHROPIC_API_KEY=YOUR_ANTHROPIC_API_KEY
21 | # MCP_LLM_GOOGLE_API_KEY=YOUR_GOOGLE_API_KEY
22 | # MCP_LLM_AZURE_OPENAI_API_KEY=YOUR_AZURE_OPENAI_API_KEY
23 | # MCP_LLM_DEEPSEEK_API_KEY=YOUR_DEEPSEEK_API_KEY
24 | # MCP_LLM_MISTRAL_API_KEY=YOUR_MISTRAL_API_KEY
25 | # MCP_LLM_OPENROUTER_API_KEY=YOUR_OPENROUTER_API_KEY
26 | # MCP_LLM_ALIBABA_API_KEY=YOUR_ALIBABA_API_KEY
27 | # MCP_LLM_MOONSHOT_API_KEY=YOUR_MOONSHOT_API_KEY
28 | # MCP_LLM_UNBOUND_API_KEY=YOUR_UNBOUND_API_KEY
29 |
30 | # --- Provider Specific Endpoints (MCP_LLM_*) ---
31 | # Optional: Override default API endpoints.
32 | # MCP_LLM_OPENAI_ENDPOINT=https://api.openai.com/v1
33 | # MCP_LLM_ANTHROPIC_ENDPOINT=https://api.anthropic.com
34 | # MCP_LLM_AZURE_OPENAI_ENDPOINT=YOUR_AZURE_ENDPOINT # Required if using Azure, e.g., https://your-resource.openai.azure.com/
35 | # MCP_LLM_AZURE_OPENAI_API_VERSION=2025-01-01-preview
36 | # MCP_LLM_DEEPSEEK_ENDPOINT=https://api.deepseek.com
37 | # MCP_LLM_MISTRAL_ENDPOINT=https://api.mistral.ai/v1
38 | # MCP_LLM_OLLAMA_ENDPOINT=http://localhost:11434
39 | # MCP_LLM_OPENROUTER_ENDPOINT=https://openrouter.ai/api/v1
40 | # MCP_LLM_ALIBABA_ENDPOINT=https://dashscope.aliyuncs.com/compatible-mode/v1
41 | # MCP_LLM_MOONSHOT_ENDPOINT=https://api.moonshot.cn/v1
42 | # MCP_LLM_UNBOUND_ENDPOINT=https://api.getunbound.ai
43 |
44 | # --- Ollama Specific (MCP_LLM_*) ---
45 | # MCP_LLM_OLLAMA_NUM_CTX=32000
46 | # MCP_LLM_OLLAMA_NUM_PREDICT=1024
47 |
48 | # === Planner LLM Configuration (Optional, MCP_LLM_PLANNER_*) ===
49 | # If you want to use a different LLM for planning tasks within agents.
50 | # Defaults to main LLM settings if not specified.
51 | # MCP_LLM_PLANNER_PROVIDER=
52 | # MCP_LLM_PLANNER_MODEL_NAME=
53 | # MCP_LLM_PLANNER_TEMPERATURE=
54 | # MCP_LLM_PLANNER_BASE_URL=
55 | # MCP_LLM_PLANNER_API_KEY= # Generic planner API key, or use provider-specific below
56 | # MCP_LLM_PLANNER_OPENAI_API_KEY=
57 | # ... (similar provider-specific keys and endpoints for planner if needed)
58 |
59 | # === Browser Configuration (MCP_BROWSER_*) ===
60 | # General browser headless mode (true/false)
61 | MCP_BROWSER_HEADLESS=false
62 | # General browser disable security features (use cautiously) (true/false)
63 | MCP_BROWSER_DISABLE_SECURITY=false
64 | # Optional: Path to Chrome/Chromium executable
65 | # MCP_BROWSER_BINARY_PATH=/usr/bin/chromium-browser
66 | # Optional: Path to Chrome user data directory (for persistent sessions)
67 | # MCP_BROWSER_USER_DATA_DIR=~/.config/google-chrome/Profile 1
68 | MCP_BROWSER_WINDOW_WIDTH=1280
69 | MCP_BROWSER_WINDOW_HEIGHT=1080
70 | # Set to true to connect to user's browser via MCP_BROWSER_CDP_URL
71 | MCP_BROWSER_USE_OWN_BROWSER=false
72 | # Optional: Connect to existing Chrome via DevTools Protocol URL. Required if MCP_BROWSER_USE_OWN_BROWSER=true.
73 | # MCP_BROWSER_CDP_URL=http://localhost:9222
74 | # MCP_BROWSER_WSS_URL= # Optional: WSS URL if CDP URL is not sufficient
75 | # Keep browser managed by server open between MCP tool calls (if MCP_BROWSER_USE_OWN_BROWSER=false)
76 | MCP_BROWSER_KEEP_OPEN=false
77 | # Optional: Directory to save Playwright trace files (useful for debugging). If not set, tracing to file is disabled.
78 | # MCP_BROWSER_TRACE_PATH=./tmp/trace
79 |
80 | # === Agent Tool Configuration (`run_browser_agent` tool, MCP_AGENT_TOOL_*) ===
81 | MCP_AGENT_TOOL_MAX_STEPS=100
82 | MCP_AGENT_TOOL_MAX_ACTIONS_PER_STEP=5
83 | # Method for tool invocation ('auto', 'json_schema', 'function_calling')
84 | MCP_AGENT_TOOL_TOOL_CALLING_METHOD=auto
85 | MCP_AGENT_TOOL_MAX_INPUT_TOKENS=128000
86 | # Enable vision capabilities (screenshot analysis)
87 | MCP_AGENT_TOOL_USE_VISION=true
88 | # Override general browser headless mode for this tool (true/false/empty for general setting)
89 | # MCP_AGENT_TOOL_HEADLESS=
90 | # Override general browser disable security for this tool (true/false/empty for general setting)
91 | # MCP_AGENT_TOOL_DISABLE_SECURITY=
92 | # Enable Playwright video recording (true/false)
93 | MCP_AGENT_TOOL_ENABLE_RECORDING=false
94 | # Optional: Path to save agent run video recordings. If not set, recording to file is disabled even if ENABLE_RECORDING=true.
95 | # MCP_AGENT_TOOL_SAVE_RECORDING_PATH=./tmp/recordings
96 | # Optional: Directory to save agent history JSON files. If not set, history saving is disabled.
97 | # MCP_AGENT_TOOL_HISTORY_PATH=./tmp/agent_history
98 |
99 | # === Deep Research Tool Configuration (`run_deep_research` tool, MCP_RESEARCH_TOOL_*) ===
100 | MCP_RESEARCH_TOOL_MAX_PARALLEL_BROWSERS=3
101 | # MANDATORY: Base directory to save research artifacts (report, results). Task ID will be appended.
102 | # Example: MCP_RESEARCH_TOOL_SAVE_DIR=/mnt/data/research_outputs
103 | # Example: MCP_RESEARCH_TOOL_SAVE_DIR=C:\\Users\\YourUser\\Documents\\ResearchData
104 | MCP_RESEARCH_TOOL_SAVE_DIR=./tmp/deep_research
105 |
106 | # === Path Configuration (MCP_PATHS_*) ===
107 | # Optional: Directory for downloaded files. If not set, persistent downloads to a specific path are disabled.
108 | # MCP_PATHS_DOWNLOADS=./tmp/downloads
109 |
110 | # === Server Configuration (MCP_SERVER_*) ===
111 | # Path for the server log file. Leave empty for stdout.
112 | # MCP_SERVER_LOG_FILE=mcp_server_browser_use.log
113 | # Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
114 | MCP_SERVER_LOGGING_LEVEL=INFO
115 | # Enable/disable anonymized telemetry (true/false)
116 | MCP_SERVER_ANONYMIZED_TELEMETRY=true
117 | # Optional: JSON string for MCP client configuration for the controller
118 | # MCP_SERVER_MCP_CONFIG='{"client_name": "mcp-browser-use-controller"}'
119 |
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
1 | <img src="./assets/header.png" alt="Browser Use Web UI" width="full"/>
2 |
3 | <br/>
4 |
5 | # browser-use MCP server & CLI
6 | [](https://docs.browser-use.com)
7 | [](LICENSE)
8 |
9 | > **Project Note**: This MCP server implementation builds upon the [browser-use/web-ui](https://github.com/browser-use/web-ui) foundation. Core browser automation logic and configuration patterns are adapted from the original project.
10 |
11 | AI-driven browser automation server implementing the Model Context Protocol (MCP) for natural language browser control and web research. Also provides CLI access to its core functionalities.
12 |
13 | <a href="https://glama.ai/mcp/servers/@Saik0s/mcp-browser-use"><img width="380" height="200" src="https://glama.ai/mcp/servers/@Saik0s/mcp-browser-use/badge" alt="Browser-Use MCP server" /></a>
14 |
15 | ## Features
16 |
17 | - 🧠 **MCP Integration** - Full protocol implementation for AI agent communication.
18 | - 🌐 **Browser Automation** - Page navigation, form filling, element interaction via natural language (`run_browser_agent` tool).
19 | - 👁️ **Visual Understanding** - Optional screenshot analysis for vision-capable LLMs.
20 | - 🔄 **State Persistence** - Option to manage a server browser session across multiple MCP calls or connect to user's browser.
21 | - 🔌 **Multi-LLM Support** - Integrates with OpenAI, Anthropic, Azure, DeepSeek, Google, Mistral, Ollama, OpenRouter, Alibaba, Moonshot, Unbound AI.
22 | - 🔍 **Deep Research Tool** - Dedicated tool for multi-step web research and report generation (`run_deep_research` tool).
23 | - ⚙️ **Environment Variable Configuration** - Fully configurable via environment variables using a structured Pydantic model.
24 | - 🔗 **CDP Connection** - Ability to connect to and control a user-launched Chrome/Chromium instance via Chrome DevTools Protocol.
25 | - ⌨️ **CLI Interface** - Access core agent functionalities (`run_browser_agent`, `run_deep_research`) directly from the command line for testing and scripting.
26 |
27 | ## Quick Start
28 |
29 | ### The Essentials
30 |
31 | 1. Install UV - the rocket-powered Python installer:
32 | `curl -LsSf https://astral.sh/uv/install.sh | sh`
33 |
34 | 2. Get Playwright browsers (required for automation):
35 | `uvx --from mcp-server-browser-use@latest python -m playwright install`
36 |
37 | ### Integration Patterns
38 |
39 | For MCP clients like Claude Desktop, add a server configuration that's as simple as:
40 |
41 | ```json
42 | // Example 1: One-Line Latest Version (Always Fresh)
43 | "mcpServers": {
44 | "browser-use": {
45 | "command": "uvx",
46 | "args": ["mcp-server-browser-use@latest"],
47 | "env": {
48 | "MCP_LLM_GOOGLE_API_KEY": "YOUR_KEY_HERE_IF_USING_GOOGLE",
49 | "MCP_LLM_PROVIDER": "google",
50 | "MCP_LLM_MODEL_NAME": "gemini-2.5-flash-preview-04-17",
51 | "MCP_BROWSER_HEADLESS": "true",
52 | }
53 | }
54 | }
55 | ```
56 |
57 | ```json
58 | // Example 2: Advanced Configuration with CDP
59 | "mcpServers": {
60 | "browser-use": {
61 | "command": "uvx",
62 | "args": ["mcp-server-browser-use@latest"],
63 | "env": {
64 | "MCP_LLM_OPENROUTER_API_KEY": "YOUR_KEY_HERE_IF_USING_OPENROUTER",
65 | "MCP_LLM_PROVIDER": "openrouter",
66 | "MCP_LLM_MODEL_NAME": "anthropic/claude-3.5-haiku",
67 | "MCP_LLM_TEMPERATURE": "0.4",
68 |
69 | "MCP_BROWSER_HEADLESS": "false",
70 | "MCP_BROWSER_WINDOW_WIDTH": "1440",
71 | "MCP_BROWSER_WINDOW_HEIGHT": "1080",
72 | "MCP_AGENT_TOOL_USE_VISION": "true",
73 |
74 | "MCP_RESEARCH_TOOL_SAVE_DIR": "/path/to/your/research",
75 | "MCP_RESEARCH_TOOL_MAX_PARALLEL_BROWSERS": "5",
76 |
77 | "MCP_PATHS_DOWNLOADS": "/path/to/your/downloads",
78 |
79 | "MCP_BROWSER_USE_OWN_BROWSER": "true",
80 | "MCP_BROWSER_CDP_URL": "http://localhost:9222",
81 |
82 | "MCP_AGENT_TOOL_HISTORY_PATH": "/path/to/your/history",
83 |
84 | "MCP_SERVER_LOGGING_LEVEL": "DEBUG",
85 | "MCP_SERVER_LOG_FILE": "/path/to/your/log/mcp_server_browser_use.log",
86 | }
87 | }
88 | }
89 | ```
90 |
91 | ```json
92 | // Example 3: Advanced Configuration with User Data and custom chrome path
93 | "mcpServers": {
94 | "browser-use": {
95 | "command": "uvx",
96 | "args": ["mcp-server-browser-use@latest"],
97 | "env": {
98 | "MCP_LLM_OPENAI_API_KEY": "YOUR_KEY_HERE_IF_USING_OPENAI",
99 | "MCP_LLM_PROVIDER": "openai",
100 | "MCP_LLM_MODEL_NAME": "gpt-4.1-mini",
101 | "MCP_LLM_TEMPERATURE": "0.2",
102 |
103 | "MCP_BROWSER_HEADLESS": "false",
104 |
105 | "MCP_BROWSER_BINARY_PATH": "/path/to/your/chrome/binary",
106 | "MCP_BROWSER_USER_DATA_DIR": "/path/to/your/user/data",
107 | "MCP_BROWSER_DISABLE_SECURITY": "true",
108 | "MCP_BROWSER_KEEP_OPEN": "true",
109 | "MCP_BROWSER_TRACE_PATH": "/path/to/your/trace",
110 |
111 | "MCP_AGENT_TOOL_HISTORY_PATH": "/path/to/your/history",
112 |
113 | "MCP_SERVER_LOGGING_LEVEL": "DEBUG",
114 | "MCP_SERVER_LOG_FILE": "/path/to/your/log/mcp_server_browser_use.log",
115 | }
116 | }
117 | }
118 | ```
119 |
120 | ```json
121 | // Example 4: Local Development Flow
122 | "mcpServers": {
123 | "browser-use": {
124 | "command": "uv",
125 | "args": [
126 | "--directory",
127 | "/your/dev/path",
128 | "run",
129 | "mcp-server-browser-use"
130 | ],
131 | "env": {
132 | "MCP_LLM_OPENROUTER_API_KEY": "YOUR_KEY_HERE_IF_USING_OPENROUTER",
133 | "MCP_LLM_PROVIDER": "openrouter",
134 | "MCP_LLM_MODEL_NAME": "openai/gpt-4o-mini",
135 | "MCP_BROWSER_HEADLESS": "true",
136 | }
137 | }
138 | }
139 | ```
140 |
141 | **Key Insight:** The best configurations emerge from starting simple (Example 1). The .env.example file contains all possible dials.
142 |
143 | ## MCP Tools
144 |
145 | This server exposes the following tools via the Model Context Protocol:
146 |
147 | ### Synchronous Tools (Wait for Completion)
148 |
149 | 1. **`run_browser_agent`**
150 | * **Description:** Executes a browser automation task based on natural language instructions and waits for it to complete. Uses settings from `MCP_AGENT_TOOL_*`, `MCP_LLM_*`, and `MCP_BROWSER_*` environment variables.
151 | * **Arguments:**
152 | * `task` (string, required): The primary task or objective.
153 | * **Returns:** (string) The final result extracted by the agent or an error message. Agent history (JSON, optional GIF) saved if `MCP_AGENT_TOOL_HISTORY_PATH` is set.
154 |
155 | 2. **`run_deep_research`**
156 | * **Description:** Performs in-depth web research on a topic, generates a report, and waits for completion. Uses settings from `MCP_RESEARCH_TOOL_*`, `MCP_LLM_*`, and `MCP_BROWSER_*` environment variables. If `MCP_RESEARCH_TOOL_SAVE_DIR` is set, outputs are saved to a subdirectory within it; otherwise, operates in memory-only mode.
157 | * **Arguments:**
158 | * `research_task` (string, required): The topic or question for the research.
159 | * `max_parallel_browsers` (integer, optional): Overrides `MCP_RESEARCH_TOOL_MAX_PARALLEL_BROWSERS` from environment.
160 | * **Returns:** (string) The generated research report in Markdown format, including the file path (if saved), or an error message.
161 |
162 | ## CLI Usage
163 |
164 | This package also provides a command-line interface `mcp-browser-cli` for direct testing and scripting.
165 |
166 | **Global Options:**
167 | * `--env-file PATH, -e PATH`: Path to a `.env` file to load configurations from.
168 | * `--log-level LEVEL, -l LEVEL`: Override the logging level (e.g., `DEBUG`, `INFO`).
169 |
170 | **Commands:**
171 |
172 | 1. **`mcp-browser-cli run-browser-agent [OPTIONS] TASK`**
173 | * **Description:** Runs a browser agent task.
174 | * **Arguments:**
175 | * `TASK` (string, required): The primary task for the agent.
176 | * **Example:**
177 | ```bash
178 | mcp-browser-cli run-browser-agent "Go to example.com and find the title." -e .env
179 | ```
180 |
181 | 2. **`mcp-browser-cli run-deep-research [OPTIONS] RESEARCH_TASK`**
182 | * **Description:** Performs deep web research.
183 | * **Arguments:**
184 | * `RESEARCH_TASK` (string, required): The topic or question for research.
185 | * **Options:**
186 | * `--max-parallel-browsers INTEGER, -p INTEGER`: Override `MCP_RESEARCH_TOOL_MAX_PARALLEL_BROWSERS`.
187 | * **Example:**
188 | ```bash
189 | mcp-browser-cli run-deep-research "What are the latest advancements in AI-driven browser automation?" --max-parallel-browsers 5 -e .env
190 | ```
191 |
192 | All other configurations (LLM keys, paths, browser settings) are picked up from environment variables (or the specified `.env` file) as detailed in the Configuration section.
193 |
194 | ## Configuration (Environment Variables)
195 |
196 | Configure the server and CLI using environment variables. You can set these in your system or place them in a `.env` file in the project root (use `--env-file` for CLI). Variables are structured with prefixes.
197 |
198 | | Variable Group (Prefix) | Example Variable | Description | Default Value |
199 | | :---------------------------------- | :--------------------------------------------- | :--------------------------------------------------------------------------------------------------------- | :-------------------------------- |
200 | | **Main LLM (MCP_LLM_)** | | Settings for the primary LLM used by agents. | |
201 | | | `MCP_LLM_PROVIDER` | LLM provider. Options: `openai`, `azure_openai`, `anthropic`, `google`, `mistral`, `ollama`, etc. | `openai` |
202 | | | `MCP_LLM_MODEL_NAME` | Specific model name for the provider. | `gpt-4.1` |
203 | | | `MCP_LLM_TEMPERATURE` | LLM temperature (0.0-2.0). | `0.0` |
204 | | | `MCP_LLM_BASE_URL` | Optional: Generic override for LLM provider's base URL. | Provider-specific |
205 | | | `MCP_LLM_API_KEY` | Optional: Generic LLM API key (takes precedence). | - |
206 | | | `MCP_LLM_OPENAI_API_KEY` | API Key for OpenAI (if provider is `openai`). | - |
207 | | | `MCP_LLM_ANTHROPIC_API_KEY` | API Key for Anthropic. | - |
208 | | | `MCP_LLM_GOOGLE_API_KEY` | API Key for Google AI (Gemini). | - |
209 | | | `MCP_LLM_AZURE_OPENAI_API_KEY` | API Key for Azure OpenAI. | - |
210 | | | `MCP_LLM_AZURE_OPENAI_ENDPOINT` | **Required if using Azure.** Your Azure resource endpoint. | - |
211 | | | `MCP_LLM_OLLAMA_ENDPOINT` | Ollama API endpoint URL. | `http://localhost:11434` |
212 | | | `MCP_LLM_OLLAMA_NUM_CTX` | Context window size for Ollama models. | `32000` |
213 | | **Planner LLM (MCP_LLM_PLANNER_)** | | Optional: Settings for a separate LLM for agent planning. Defaults to Main LLM if not set. | |
214 | | | `MCP_LLM_PLANNER_PROVIDER` | Planner LLM provider. | Main LLM Provider |
215 | | | `MCP_LLM_PLANNER_MODEL_NAME` | Planner LLM model name. | Main LLM Model |
216 | | **Browser (MCP_BROWSER_)** | | General browser settings. | |
217 | | | `MCP_BROWSER_HEADLESS` | Run browser without UI (general setting). | `false` |
218 | | | `MCP_BROWSER_DISABLE_SECURITY` | Disable browser security features (general setting, use cautiously). | `false` |
219 | | | `MCP_BROWSER_BINARY_PATH` | Path to Chrome/Chromium executable. | - |
220 | | | `MCP_BROWSER_USER_DATA_DIR` | Path to Chrome user data directory. | - |
221 | | | `MCP_BROWSER_WINDOW_WIDTH` | Browser window width (pixels). | `1280` |
222 | | | `MCP_BROWSER_WINDOW_HEIGHT` | Browser window height (pixels). | `1080` |
223 | | | `MCP_BROWSER_USE_OWN_BROWSER` | Connect to user's browser via CDP URL. | `false` |
224 | | | `MCP_BROWSER_CDP_URL` | CDP URL (e.g., `http://localhost:9222`). Required if `MCP_BROWSER_USE_OWN_BROWSER=true`. | - |
225 | | | `MCP_BROWSER_KEEP_OPEN` | Keep server-managed browser open between MCP calls (if `MCP_BROWSER_USE_OWN_BROWSER=false`). | `false` |
226 | | | `MCP_BROWSER_TRACE_PATH` | Optional: Directory to save Playwright trace files. If not set, tracing to file is disabled. | ` ` (empty, tracing disabled) |
227 | | **Agent Tool (MCP_AGENT_TOOL_)** | | Settings for the `run_browser_agent` tool. | |
228 | | | `MCP_AGENT_TOOL_MAX_STEPS` | Max steps per agent run. | `100` |
229 | | | `MCP_AGENT_TOOL_MAX_ACTIONS_PER_STEP` | Max actions per agent step. | `5` |
230 | | | `MCP_AGENT_TOOL_TOOL_CALLING_METHOD` | Method for tool invocation ('auto', 'json_schema', 'function_calling'). | `auto` |
231 | | | `MCP_AGENT_TOOL_MAX_INPUT_TOKENS` | Max input tokens for LLM context. | `128000` |
232 | | | `MCP_AGENT_TOOL_USE_VISION` | Enable vision capabilities (screenshot analysis). | `true` |
233 | | | `MCP_AGENT_TOOL_HEADLESS` | Override `MCP_BROWSER_HEADLESS` for this tool (true/false/empty). | ` ` (uses general) |
234 | | | `MCP_AGENT_TOOL_DISABLE_SECURITY` | Override `MCP_BROWSER_DISABLE_SECURITY` for this tool (true/false/empty). | ` ` (uses general) |
235 | | | `MCP_AGENT_TOOL_ENABLE_RECORDING` | Enable Playwright video recording. | `false` |
236 | | | `MCP_AGENT_TOOL_SAVE_RECORDING_PATH` | Optional: Path to save recordings. If not set, recording to file is disabled even if `ENABLE_RECORDING=true`. | ` ` (empty, recording disabled) |
237 | | | `MCP_AGENT_TOOL_HISTORY_PATH` | Optional: Directory to save agent history JSON files. If not set, history saving is disabled. | ` ` (empty, history saving disabled) |
238 | | **Research Tool (MCP_RESEARCH_TOOL_)** | | Settings for the `run_deep_research` tool. | |
239 | | | `MCP_RESEARCH_TOOL_MAX_PARALLEL_BROWSERS` | Max parallel browser instances for deep research. | `3` |
240 | | | `MCP_RESEARCH_TOOL_SAVE_DIR` | Optional: Base directory to save research artifacts. Task ID will be appended. If not set, operates in memory-only mode. | `None` |
241 | | **Paths (MCP_PATHS_)** | | General path settings. | |
242 | | | `MCP_PATHS_DOWNLOADS` | Optional: Directory for downloaded files. If not set, persistent downloads to a specific path are disabled. | ` ` (empty, downloads disabled) |
243 | | **Server (MCP_SERVER_)** | | Server-specific settings. | |
244 | | | `MCP_SERVER_LOG_FILE` | Path for the server log file. Empty for stdout. | ` ` (empty, logs to stdout) |
245 | | | `MCP_SERVER_LOGGING_LEVEL` | Logging level (`DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`). | `ERROR` |
246 | | | `MCP_SERVER_ANONYMIZED_TELEMETRY` | Enable/disable anonymized telemetry (`true`/`false`). | `true` |
247 | | | `MCP_SERVER_MCP_CONFIG` | Optional: JSON string for MCP client config used by the internal controller. | `null` |
248 |
249 | **Supported LLM Providers (`MCP_LLM_PROVIDER`):**
250 | `openai`, `azure_openai`, `anthropic`, `google`, `mistral`, `ollama`, `deepseek`, `openrouter`, `alibaba`, `moonshot`, `unbound`
251 |
252 | *(Refer to `.env.example` for a comprehensive list of all supported environment variables and their specific provider keys/endpoints.)*
253 |
254 | ## Connecting to Your Own Browser (CDP)
255 |
256 | Instead of having the server launch and manage its own browser instance, you can connect it to a Chrome/Chromium browser that you launch and manage yourself.
257 |
258 | **Steps:**
259 |
260 | 1. **Launch Chrome/Chromium with Remote Debugging Enabled:**
261 | (Commands for macOS, Linux, Windows as previously listed, e.g., `google-chrome --remote-debugging-port=9222`)
262 |
263 | 2. **Configure Environment Variables:**
264 | Set the following environment variables:
265 | ```dotenv
266 | MCP_BROWSER_USE_OWN_BROWSER=true
267 | MCP_BROWSER_CDP_URL=http://localhost:9222 # Use the same port
268 | # Optional: MCP_BROWSER_USER_DATA_DIR=/path/to/your/profile
269 | ```
270 |
271 | 3. **Run the MCP Server or CLI:**
272 | Start the server (`uv run mcp-server-browser-use`) or CLI (`mcp-browser-cli ...`) as usual.
273 |
274 | **Important Considerations:**
275 | * The browser launched with `--remote-debugging-port` must remain open.
276 | * Settings like `MCP_BROWSER_HEADLESS` and `MCP_BROWSER_KEEP_OPEN` are ignored when `MCP_BROWSER_USE_OWN_BROWSER=true`.
277 |
278 | ## Development
279 |
280 | ```bash
281 | # Install dev dependencies and sync project deps
282 | uv sync --dev
283 |
284 | # Install playwright browsers
285 | uv run playwright install
286 |
287 | # Run MCP server with debugger (Example connecting to own browser via CDP)
288 | # 1. Launch Chrome: google-chrome --remote-debugging-port=9222 --user-data-dir="optional/path/to/user/profile"
289 | # 2. Run inspector command with environment variables:
290 | npx @modelcontextprotocol/inspector@latest \
291 | -e MCP_LLM_GOOGLE_API_KEY=$GOOGLE_API_KEY \
292 | -e MCP_LLM_PROVIDER=google \
293 | -e MCP_LLM_MODEL_NAME=gemini-2.5-flash-preview-04-17 \
294 | -e MCP_BROWSER_USE_OWN_BROWSER=true \
295 | -e MCP_BROWSER_CDP_URL=http://localhost:9222 \
296 | -e MCP_RESEARCH_TOOL_SAVE_DIR=./tmp/dev_research_output \
297 | uv --directory . run mcp-server-browser-use
298 |
299 | # Note: Change timeout in inspector's config panel if needed (default is 10 seconds)
300 |
301 | # Run CLI example
302 | # Create a .env file with your settings (including MCP_RESEARCH_TOOL_SAVE_DIR) or use environment variables
303 | uv run mcp-browser-cli -e .env run-browser-agent "What is the title of example.com?"
304 | uv run mcp-browser-cli -e .env run-deep-research "What is the best material for a pan for everyday use on amateur kitchen and dishwasher?"
305 | ```
306 |
307 | ## Troubleshooting
308 |
309 | - **Configuration Error on Startup**: If the application fails to start with an error about a missing setting, ensure all **mandatory** environment variables (like `MCP_RESEARCH_TOOL_SAVE_DIR`) are set correctly in your environment or `.env` file.
310 | - **Browser Conflicts**: If *not* using CDP (`MCP_BROWSER_USE_OWN_BROWSER=false`), ensure no conflicting Chrome instances are running with the same user data directory if `MCP_BROWSER_USER_DATA_DIR` is specified.
311 | - **CDP Connection Issues**: If using `MCP_BROWSER_USE_OWN_BROWSER=true`:
312 | * Verify Chrome was launched with `--remote-debugging-port`.
313 | * Ensure the port in `MCP_BROWSER_CDP_URL` matches.
314 | * Check firewalls and ensure the browser is running.
315 | - **API Errors**: Double-check API keys (`MCP_LLM_<PROVIDER>_API_KEY` or `MCP_LLM_API_KEY`) and endpoints (e.g., `MCP_LLM_AZURE_OPENAI_ENDPOINT` for Azure).
316 | - **Vision Issues**: Ensure `MCP_AGENT_TOOL_USE_VISION=true` and your LLM supports vision.
317 | - **Dependency Problems**: Run `uv sync` and `uv run playwright install`.
318 | - **File/Path Issues**:
319 | * If optional features like history saving, tracing, or downloads are not working, ensure the corresponding path variables (`MCP_AGENT_TOOL_HISTORY_PATH`, `MCP_BROWSER_TRACE_PATH`, `MCP_PATHS_DOWNLOADS`) are set and the application has write permissions to those locations.
320 | * For deep research, ensure `MCP_RESEARCH_TOOL_SAVE_DIR` is set to a valid, writable directory.
321 | - **Logging**: Check the log file (`MCP_SERVER_LOG_FILE`, if set) or console output. Increase `MCP_SERVER_LOGGING_LEVEL` to `DEBUG` for more details. For CLI, use `--log-level DEBUG`.
322 |
323 | ## License
324 |
325 | MIT - See [LICENSE](LICENSE) for details.
326 |
```
--------------------------------------------------------------------------------
/CLAUDE.md:
--------------------------------------------------------------------------------
```markdown
1 | # Development Guidelines
2 |
3 | This document contains critical information about working with this codebase. Follow these guidelines precisely.
4 |
5 | ## Core Development Rules
6 |
7 | 1. Package Management
8 | - ONLY use uv, NEVER pip
9 | - Installation: `uv add package`
10 | - Running tools: `uv run tool`
11 | - Upgrading: `uv add --dev package --upgrade-package package`
12 | - FORBIDDEN: `uv pip install`, `@latest` syntax
13 |
14 | 2. Code Quality
15 | - Type hints required for all code
16 | - Public APIs must have docstrings
17 | - Functions must be focused and small
18 | - Follow existing patterns exactly
19 | - Line length: 150 chars maximum
20 |
21 | 3. Testing Requirements
22 | - Framework: `uv run pytest`
23 | - Async testing: use anyio, not asyncio
24 | - Coverage: test edge cases and errors
25 | - New features require tests
26 | - Bug fixes require regression tests
27 |
28 | ## Python Tools
29 |
30 | ## Code Formatting
31 |
32 | 1. Ruff
33 | - Format: `uv run ruff format .`
34 | - Check: `uv run ruff check .`
35 | - Fix: `uv run ruff check . --fix`
36 | - Critical issues:
37 | - Line length (150 chars)
38 | - Import sorting (I001)
39 | - Unused imports
40 | - Line wrapping:
41 | - Strings: use parentheses
42 | - Function calls: multi-line with proper indent
43 | - Imports: split into multiple lines
44 |
45 | 2. Type Checking
46 | - Tool: `uv run pyright`
47 | - Requirements:
48 | - Explicit None checks for Optional
49 | - Type narrowing for strings
50 | - Version warnings can be ignored if checks pass
51 |
52 | 3. Pre-commit
53 | - Config: `.pre-commit-config.yaml`
54 | - Runs: on git commit
55 | - Tools: Prettier (YAML/JSON), Ruff (Python)
56 | - Ruff updates:
57 | - Check PyPI versions
58 | - Update config rev
59 | - Commit config first
60 |
61 | ## Error Resolution
62 |
63 | 1. CI Failures
64 | - Fix order:
65 | 1. Formatting
66 | 2. Type errors
67 | 3. Linting
68 | - Type errors:
69 | - Get full line context
70 | - Check Optional types
71 | - Add type narrowing
72 | - Verify function signatures
73 |
74 | 2. Common Issues
75 |    - Line length (150): break strings with parentheses
76 | - Multi-line function calls
77 | - Split imports
78 | - Types:
79 | - Add None checks
80 | - Narrow string types
81 | - Match existing patterns
82 |
83 | 3. Best Practices
84 | - Check git status before commits
85 | - Run formatters before type checks
86 | - Keep changes minimal
87 | - Follow existing patterns
88 | - Document public APIs
89 | - Test thoroughly
90 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/__init__.py:
--------------------------------------------------------------------------------
```python
1 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/agent/__init__.py:
--------------------------------------------------------------------------------
```python
1 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/browser/__init__.py:
--------------------------------------------------------------------------------
```python
1 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/controller/__init__.py:
--------------------------------------------------------------------------------
```python
1 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/utils/__init__.py:
--------------------------------------------------------------------------------
```python
1 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/__init__.py:
--------------------------------------------------------------------------------
```python
1 | from mcp_server_browser_use.server import main
2 |
3 | if __name__ == "__main__":
4 | main()
5 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/__main__.py:
--------------------------------------------------------------------------------
```python
1 | from mcp_server_browser_use.server import main
2 |
3 | if __name__ == "__main__":
4 | main()
5 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/utils/utils.py:
--------------------------------------------------------------------------------
```python
1 | import base64
2 | import os
3 | import time
4 | from pathlib import Path
5 | from typing import Dict, Optional
6 | import requests
7 | import json
8 | import uuid
9 |
10 |
def encode_image(img_path):
    """Return the base64 (UTF-8) encoding of the file at *img_path*.

    A falsy path (``None`` or ``""``) yields ``None`` instead of raising.
    """
    if not img_path:
        return None
    with open(img_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
17 |
18 |
def get_latest_files(directory: str, file_types: Optional[list] = None) -> Dict[str, Optional[str]]:
    """Get the latest recording and trace files.

    Scans *directory* recursively and returns, for each extension in
    *file_types* (default ``['.webm', '.zip']``), the path of the most
    recently modified matching file, or ``None`` if there is none.

    Files modified less than one second ago are skipped, on the assumption
    that they are still being written. If *directory* does not exist it is
    created and a dict of ``None`` values is returned.
    """
    # None sentinel instead of a mutable default list shared across calls.
    if file_types is None:
        file_types = ['.webm', '.zip']
    latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types}

    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
        return latest_files

    for file_type in file_types:
        try:
            matches = list(Path(directory).rglob(f"*{file_type}"))
            if matches:
                latest = max(matches, key=lambda p: p.stat().st_mtime)
                # Only return files that are complete (not being written)
                if time.time() - latest.stat().st_mtime > 1.0:
                    latest_files[file_type] = str(latest)
        except Exception as e:
            print(f"Error getting latest {file_type} file: {e}")

    return latest_files
39 |
```
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
```toml
1 | [project]
2 | name = "mcp_server_browser_use"
3 | version = "0.1.8"
4 | description = "MCP server for browser-use"
5 | readme = "README.md"
6 | requires-python = ">=3.11"
7 | authors = [{ name = "Igor Tarasenko" }]
8 | license = { text = "MIT" }
9 | classifiers = [
10 | "Development Status :: 4 - Beta",
11 | "Programming Language :: Python :: 3",
12 | "Programming Language :: Python :: 3.11",
13 | "Operating System :: OS Independent",
14 | ]
15 |
16 | dependencies = [
17 | "pydantic-settings>=2.0.0",
18 | "mcp>=1.6.0",
19 | "typer>=0.12.0",
20 | "browser-use==0.1.41",
21 | "pyperclip==1.9.0",
22 | "json-repair",
23 | "langchain-mistralai==0.2.4",
24 | "MainContentExtractor==0.0.4",
25 | "langchain-ibm==0.3.10",
26 | "langchain_mcp_adapters==0.0.9",
27 | "langgraph==0.3.34",
28 | "langchain-community",
29 | ]
30 |
31 | [build-system]
32 | requires = ["hatchling"]
33 | build-backend = "hatchling.build"
34 |
35 | [tool.hatch.build.targets.wheel]
36 | packages = ["src/mcp_server_browser_use"]
37 |
38 | [project.scripts]
39 | mcp-server-browser-use = "mcp_server_browser_use.server:main"
40 | mcp-browser-cli = "mcp_server_browser_use.cli:app"
41 |
42 | [tool.pyright]
43 | include = ["src/mcp_server_browser_use"]
44 | venvPath = "."
45 | venv = ".venv"
46 |
47 | [tool.ruff.lint]
48 | select = ["E", "F", "I"]
49 | ignore = []
50 |
51 | [tool.ruff]
52 | line-length = 150
53 | target-version = "py311"
54 |
55 | [tool.uv]
56 | dev-dependencies = ["pyright>=1.1.378", "pytest>=8.3.3", "ruff>=0.6.9"]
57 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/utils/config.py:
--------------------------------------------------------------------------------
```python
# Human-readable display names keyed by internal provider identifier.
PROVIDER_DISPLAY_NAMES = {
    "openai": "OpenAI",
    "azure_openai": "Azure OpenAI",
    "anthropic": "Anthropic",
    "deepseek": "DeepSeek",
    "google": "Google",
    "alibaba": "Alibaba",
    "moonshot": "MoonShot",
    "unbound": "Unbound AI",
    "ibm": "IBM"
}

# Predefined model names for common providers
# Keys are provider identifiers; some providers (e.g. "ollama", "mistral",
# "siliconflow") appear only here and not in PROVIDER_DISPLAY_NAMES.
model_names = {
    "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
    "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"],
    "deepseek": ["deepseek-chat", "deepseek-reasoner"],
    "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest",
               "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05",
               "gemini-2.5-pro-preview-03-25", "gemini-2.5-flash-preview-04-17"],
    "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b",
               "deepseek-r1:14b", "deepseek-r1:32b"],
    "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
    "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"],
    "alibaba": ["qwen-plus", "qwen-max", "qwen-vl-max", "qwen-vl-plus", "qwen-turbo", "qwen-long"],
    "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"],
    "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"],
    "siliconflow": [
        "deepseek-ai/DeepSeek-R1",
        "deepseek-ai/DeepSeek-V3",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-V2.5",
        "deepseek-ai/deepseek-vl2",
        "Qwen/Qwen2.5-72B-Instruct-128K",
        "Qwen/Qwen2.5-72B-Instruct",
        "Qwen/Qwen2.5-32B-Instruct",
        "Qwen/Qwen2.5-14B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "Qwen/Qwen2.5-Coder-32B-Instruct",
        "Qwen/Qwen2.5-Coder-7B-Instruct",
        "Qwen/Qwen2-7B-Instruct",
        "Qwen/Qwen2-1.5B-Instruct",
        "Qwen/QwQ-32B-Preview",
        "Qwen/Qwen2-VL-72B-Instruct",
        "Qwen/Qwen2.5-VL-32B-Instruct",
        "Qwen/Qwen2.5-VL-72B-Instruct",
        "TeleAI/TeleChat2",
        "THUDM/glm-4-9b-chat",
        "Vendor-A/Qwen/Qwen2.5-72B-Instruct",
        "internlm/internlm2_5-7b-chat",
        "internlm/internlm2_5-20b-chat",
        "Pro/Qwen/Qwen2.5-7B-Instruct",
        "Pro/Qwen/Qwen2-7B-Instruct",
        "Pro/Qwen/Qwen2-1.5B-Instruct",
        "Pro/THUDM/chatglm3-6b",
        "Pro/THUDM/glm-4-9b-chat",
    ],
    "ibm": ["ibm/granite-vision-3.1-2b-preview", "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
            "meta-llama/llama-3-2-90b-vision-instruct"]
}
64 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/browser/custom_context.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import logging
3 | import os
4 |
5 | from browser_use.browser.browser import Browser, IN_DOCKER
6 | from browser_use.browser.context import BrowserContext, BrowserContextConfig
7 | from playwright.async_api import Browser as PlaywrightBrowser
8 | from playwright.async_api import BrowserContext as PlaywrightBrowserContext
9 | from typing import Optional
10 | from browser_use.browser.context import BrowserContextState
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
class CustomBrowserContextConfig(BrowserContextConfig):
    """Context config extended with a flag to force a brand-new Playwright
    context even when an existing one (CDP / external Chrome) could be reused."""

    force_new_context: bool = False  # force to create new context
17 |
18 |
class CustomBrowserContext(BrowserContext):
    """Browser context that reuses an already-open context when attached to an
    external browser, loads cookies from disk, optionally starts tracing, and
    injects anti-bot-detection init scripts into every page."""

    def __init__(
        self,
        browser: 'Browser',
        config: BrowserContextConfig | None = None,
        state: Optional[BrowserContextState] = None,
    ):
        # No extra state of its own; delegates fully to the base class.
        super(CustomBrowserContext, self).__init__(browser=browser, config=config, state=state)

    async def _create_context(self, browser: PlaywrightBrowser):
        """Creates a new browser context with anti-detection measures and loads cookies if available."""
        # NOTE(review): ``force_new_context`` is declared on CustomBrowserContextConfig
        # only — assumes ``self.config`` is always that subclass; confirm callers.
        if not self.config.force_new_context and self.browser.config.cdp_url and len(browser.contexts) > 0:
            # Attached over CDP: reuse the external browser's first context.
            context = browser.contexts[0]
        elif not self.config.force_new_context and self.browser.config.browser_binary_path and len(
                browser.contexts) > 0:
            # Connect to existing Chrome instance instead of creating new one
            context = browser.contexts[0]
        else:
            # Original code for creating new context
            context = await browser.new_context(
                no_viewport=True,
                user_agent=self.config.user_agent,
                java_script_enabled=True,
                # Both security relaxations are driven by the single disable_security flag.
                bypass_csp=self.config.disable_security,
                ignore_https_errors=self.config.disable_security,
                record_video_dir=self.config.save_recording_path,
                record_video_size=self.config.browser_window_size.model_dump(),
                record_har_path=self.config.save_har_path,
                locale=self.config.locale,
                http_credentials=self.config.http_credentials,
                is_mobile=self.config.is_mobile,
                has_touch=self.config.has_touch,
                geolocation=self.config.geolocation,
                permissions=self.config.permissions,
                timezone_id=self.config.timezone_id,
            )

        if self.config.trace_path:
            # Start Playwright tracing; presumably stopped/saved elsewhere (base class) — confirm.
            await context.tracing.start(screenshots=True, snapshots=True, sources=True)

        # Load cookies if they exist
        if self.config.cookies_file and os.path.exists(self.config.cookies_file):
            with open(self.config.cookies_file, 'r') as f:
                try:
                    cookies = json.load(f)

                    # Playwright only accepts these sameSite values; coerce anything else.
                    valid_same_site_values = ['Strict', 'Lax', 'None']
                    for cookie in cookies:
                        if 'sameSite' in cookie:
                            if cookie['sameSite'] not in valid_same_site_values:
                                logger.warning(
                                    f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}"
                                )
                                cookie['sameSite'] = 'None'
                    logger.info(f'🍪 Loaded {len(cookies)} cookies from {self.config.cookies_file}')
                    await context.add_cookies(cookies)

                except json.JSONDecodeError as e:
                    logger.error(f'Failed to parse cookies file: {str(e)}')

        # Expose anti-detection scripts
        await context.add_init_script(
            """
            // Webdriver property
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });

            // Languages
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US']
            });

            // Plugins
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });

            // Chrome runtime
            window.chrome = { runtime: {} };

            // Permissions
            const originalQuery = window.navigator.permissions.query;
            window.navigator.permissions.query = (parameters) => (
                parameters.name === 'notifications' ?
                    Promise.resolve({ state: Notification.permission }) :
                    originalQuery(parameters)
            );
            (function () {
                const originalAttachShadow = Element.prototype.attachShadow;
                Element.prototype.attachShadow = function attachShadow(options) {
                    return originalAttachShadow.call(this, { ...options, mode: "open" });
                };
            })();
            """
        )

        return context
117 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/browser/custom_browser.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import gc
3 | import pdb
4 |
5 | from playwright.async_api import Browser as PlaywrightBrowser
6 | from playwright.async_api import (
7 | BrowserContext as PlaywrightBrowserContext,
8 | )
9 | from playwright.async_api import (
10 | Playwright,
11 | async_playwright,
12 | )
13 | from browser_use.browser.browser import Browser, IN_DOCKER
14 | from browser_use.browser.context import BrowserContext, BrowserContextConfig
15 | from playwright.async_api import BrowserContext as PlaywrightBrowserContext
16 | import logging
17 |
18 | from browser_use.browser.chrome import (
19 | CHROME_ARGS,
20 | CHROME_DETERMINISTIC_RENDERING_ARGS,
21 | CHROME_DISABLE_SECURITY_ARGS,
22 | CHROME_DOCKER_ARGS,
23 | CHROME_HEADLESS_ARGS,
24 | )
25 | from browser_use.browser.context import BrowserContext, BrowserContextConfig
26 | from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments
27 | from browser_use.utils import time_execution_async
28 | import socket
29 |
30 | from .custom_context import CustomBrowserContext, CustomBrowserContextConfig
31 |
32 | logger = logging.getLogger(__name__)
33 |
34 |
class CustomBrowser(Browser):
    """``Browser`` subclass that merges browser- and context-level config when
    creating contexts, launches the builtin browser with anti-detection
    arguments, and provides a close path that skips httpx client teardown."""

    async def new_context(self, config: CustomBrowserContextConfig | None = None) -> CustomBrowserContext:
        """Create a browser context; context-level settings override browser-level ones."""
        browser_config = self.config.model_dump() if self.config else {}
        context_config = config.model_dump() if config else {}
        merged_config = {**browser_config, **context_config}
        return CustomBrowserContext(config=CustomBrowserContextConfig(**merged_config), browser=self)

    async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser:
        """Sets up and returns a Playwright Browser instance with anti-detection measures."""
        assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers'

        if self.config.headless:
            # Fixed size in headless mode: there is no physical screen to measure.
            screen_size = {'width': 1920, 'height': 1080}
            offset_x, offset_y = 0, 0
        else:
            screen_size = get_screen_resolution()
            offset_x, offset_y = get_window_adjustments()

        chrome_args = {
            *CHROME_ARGS,
            *(CHROME_DOCKER_ARGS if IN_DOCKER else []),
            *(CHROME_HEADLESS_ARGS if self.config.headless else []),
            *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []),
            *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []),
            f'--window-position={offset_x},{offset_y}',
            *self.config.extra_browser_args,
        }
        # Add a window size only when the caller did not supply one explicitly.
        if not any('--window-size' in arg for arg in self.config.extra_browser_args):
            chrome_args.add(f'--window-size={screen_size["width"]},{screen_size["height"]}')

        # check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex(('localhost', 9222)) == 0:
                # BUGFIX: discard() instead of remove() — remove() raises KeyError
                # if the argument is not present in the set (e.g. already stripped
                # or never added via CHROME_ARGS/extra args).
                chrome_args.discard('--remote-debugging-port=9222')

        browser_class = getattr(playwright, self.config.browser_class)
        args = {
            'chromium': list(chrome_args),
            'firefox': [
                *{
                    '-no-remote',
                    *self.config.extra_browser_args,
                }
            ],
            'webkit': [
                *{
                    '--no-startup-window',
                    *self.config.extra_browser_args,
                }
            ],
        }

        browser = await browser_class.launch(
            headless=self.config.headless,
            args=args[self.config.browser_class],
            proxy=self.config.proxy.model_dump() if self.config.proxy else None,
            # Leave signal handling to the embedding application.
            handle_sigterm=False,
            handle_sigint=False,
        )
        return browser

    async def _close_without_httpxclients(self):
        """Close playwright resources and kill any spawned Chrome subprocess tree,
        without touching httpx clients. No-op when ``keep_alive`` is set."""
        if self.config.keep_alive:
            return

        try:
            if self.playwright_browser:
                await self.playwright_browser.close()
                del self.playwright_browser
            if self.playwright:
                await self.playwright.stop()
                del self.playwright
            if chrome_proc := getattr(self, '_chrome_subprocess', None):
                try:
                    # always kill all children processes, otherwise chrome leaves a bunch of zombie processes
                    for proc in chrome_proc.children(recursive=True):
                        proc.kill()
                    chrome_proc.kill()
                except Exception as e:
                    logger.debug(f'Failed to terminate chrome subprocess: {e}')

        except Exception as e:
            logger.debug(f'Failed to close browser properly: {e}')

        finally:
            # Drop references and force a GC pass so playwright objects are freed.
            self.playwright_browser = None
            self.playwright = None
            self._chrome_subprocess = None
            gc.collect()
131 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/agent/browser_use/browser_use_agent.py:
--------------------------------------------------------------------------------
```python
1 | from __future__ import annotations
2 |
3 | import asyncio
4 | import gc
5 | import inspect
6 | import json
7 | import logging
8 | import os
9 | import re
10 | import time
11 | from pathlib import Path
12 | from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar, Union
13 |
14 | from dotenv import load_dotenv
15 | from langchain_core.language_models.chat_models import BaseChatModel
16 | from langchain_core.messages import (
17 | BaseMessage,
18 | HumanMessage,
19 | SystemMessage,
20 | )
21 |
22 | # from lmnr.sdk.decorators import observe
23 | from pydantic import BaseModel, ValidationError
24 |
25 | from browser_use.agent.gif import create_history_gif
26 | from browser_use.agent.memory.service import Memory, MemorySettings
27 | from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
28 | from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, save_conversation
29 | from browser_use.agent.prompts import AgentMessagePrompt, PlannerPrompt, SystemPrompt
30 | from browser_use.agent.views import (
31 | REQUIRED_LLM_API_ENV_VARS,
32 | ActionResult,
33 | AgentError,
34 | AgentHistory,
35 | AgentHistoryList,
36 | AgentOutput,
37 | AgentSettings,
38 | AgentState,
39 | AgentStepInfo,
40 | StepMetadata,
41 | ToolCallingMethod,
42 | )
43 | from browser_use.browser.browser import Browser
44 | from browser_use.browser.context import BrowserContext
45 | from browser_use.browser.views import BrowserState, BrowserStateHistory
46 | from browser_use.controller.registry.views import ActionModel
47 | from browser_use.controller.service import Controller
48 | from browser_use.dom.history_tree_processor.service import (
49 | DOMHistoryElement,
50 | HistoryTreeProcessor,
51 | )
52 | from browser_use.exceptions import LLMException
53 | from browser_use.telemetry.service import ProductTelemetry
54 | from browser_use.telemetry.views import (
55 | AgentEndTelemetryEvent,
56 | AgentRunTelemetryEvent,
57 | AgentStepTelemetryEvent,
58 | )
59 | from browser_use.utils import check_env_variables, time_execution_async, time_execution_sync
60 | from browser_use.agent.service import Agent, AgentHookFunc
61 |
62 | load_dotenv()
63 | logger = logging.getLogger(__name__)
64 |
65 | SKIP_LLM_API_KEY_VERIFICATION = os.environ.get('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[0] in 'ty1'
66 |
67 |
class BrowserUseAgent(Agent):
    """``Agent`` subclass overriding :meth:`run` with Ctrl+C signal handling,
    pause/stop flag polling, end-of-run telemetry and optional GIF generation."""

    @time_execution_async('--run (agent)')
    async def run(
        self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None,
        on_step_end: AgentHookFunc | None = None
    ) -> AgentHistoryList:
        """Execute the task with maximum number of steps.

        Args:
            max_steps: Hard cap on the number of agent steps before giving up.
            on_step_start: Optional async hook awaited before each step.
            on_step_end: Optional async hook awaited after each step.

        Returns:
            The accumulated ``AgentHistoryList`` (also returned on KeyboardInterrupt).
        """

        loop = asyncio.get_event_loop()

        # Set up the Ctrl+C signal handler with callbacks specific to this agent
        from browser_use.utils import SignalHandler

        signal_handler = SignalHandler(
            loop=loop,
            pause_callback=self.pause,
            resume_callback=self.resume,
            custom_exit_callback=None,  # No special cleanup needed on forced exit
            exit_on_second_int=True,
        )
        signal_handler.register()

        # Wait for verification task to complete if it exists
        if hasattr(self, '_verification_task') and self._verification_task and not self._verification_task.done():
            try:
                await self._verification_task
            except Exception:
                # Error already logged in the task
                pass

        try:
            self._log_agent_run()

            # Execute initial actions if provided
            if self.initial_actions:
                result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
                self.state.last_result = result

            for step in range(max_steps):
                # Check if waiting for user input after Ctrl+C
                # NOTE(review): this ``break`` exits only the pause-wait loop, not
                # the step loop; the stop flag is re-checked below — confirm intended.
                while self.state.paused:
                    await asyncio.sleep(0.5)
                    if self.state.stopped:
                        break

                # Check if we should stop due to too many failures
                if self.state.consecutive_failures >= self.settings.max_failures:
                    logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
                    break

                # Check control flags before each step
                if self.state.stopped:
                    logger.info('Agent stopped')
                    break

                while self.state.paused:
                    await asyncio.sleep(0.2)  # Small delay to prevent CPU spinning
                    if self.state.stopped:  # Allow stopping while paused
                        break

                if on_step_start is not None:
                    await on_step_start(self)

                step_info = AgentStepInfo(step_number=step, max_steps=max_steps)
                await self.step(step_info)

                if on_step_end is not None:
                    await on_step_end(self)

                if self.state.history.is_done():
                    # Optionally validate the final output before declaring success.
                    if self.settings.validate_output and step < max_steps - 1:
                        if not await self._validate_output():
                            continue

                    await self.log_completion()
                    break
            else:
                # for-else: the loop exhausted max_steps without a ``break``.
                logger.info('❌ Failed to complete task in maximum steps')

            return self.state.history

        except KeyboardInterrupt:
            # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
            logger.info('Got KeyboardInterrupt during execution, returning current history')
            return self.state.history

        finally:
            # Unregister signal handlers before cleanup
            signal_handler.unregister()

            # Emit end-of-run telemetry regardless of how the run terminated.
            self.telemetry.capture(
                AgentEndTelemetryEvent(
                    agent_id=self.state.agent_id,
                    is_done=self.state.history.is_done(),
                    success=self.state.history.is_successful(),
                    steps=self.state.n_steps,
                    max_steps_reached=self.state.n_steps >= max_steps,
                    errors=self.state.history.errors(),
                    total_input_tokens=self.state.history.total_input_tokens(),
                    total_duration_seconds=self.state.history.total_duration_seconds(),
                )
            )

            await self.close()

            if self.settings.generate_gif:
                # ``generate_gif`` may be True (default path) or an explicit path string.
                output_path: str = 'agent_history.gif'
                if isinstance(self.settings.generate_gif, str):
                    output_path = self.settings.generate_gif

                create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/controller/custom_controller.py:
--------------------------------------------------------------------------------
```python
1 | import pdb
2 |
3 | import pyperclip
4 | from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar
5 | from pydantic import BaseModel
6 | from browser_use.agent.views import ActionResult
7 | from browser_use.browser.context import BrowserContext
8 | from browser_use.controller.service import Controller, DoneAction
9 | from browser_use.controller.registry.service import Registry, RegisteredAction
10 | from main_content_extractor import MainContentExtractor
11 | from browser_use.controller.views import (
12 | ClickElementAction,
13 | DoneAction,
14 | ExtractPageContentAction,
15 | GoToUrlAction,
16 | InputTextAction,
17 | OpenTabAction,
18 | ScrollAction,
19 | SearchGoogleAction,
20 | SendKeysAction,
21 | SwitchTabAction,
22 | )
23 | import logging
24 | import inspect
25 | import asyncio
26 | import os
27 | from langchain_core.language_models.chat_models import BaseChatModel
28 | from browser_use.agent.views import ActionModel, ActionResult
29 |
30 | from ..utils.mcp_client import create_tool_param_model, setup_mcp_client_and_tools
31 |
32 | from browser_use.utils import time_execution_sync
33 |
34 | logger = logging.getLogger(__name__)
35 |
36 | Context = TypeVar('Context')
37 |
38 |
39 | class CustomController(Controller):
40 | def __init__(self, exclude_actions: list[str] = [],
41 | output_model: Optional[Type[BaseModel]] = None,
42 | ask_assistant_callback: Optional[Union[Callable[[str, BrowserContext], Dict[str, Any]], Callable[
43 | [str, BrowserContext], Awaitable[Dict[str, Any]]]]] = None,
44 | ):
45 | super().__init__(exclude_actions=exclude_actions, output_model=output_model)
46 | self._register_custom_actions()
47 | self.ask_assistant_callback = ask_assistant_callback
48 | self.mcp_client = None
49 | self.mcp_server_config = None
50 |
    def _register_custom_actions(self):
        """Register all custom browser actions (human assistance and file upload)."""

        @self.registry.action(
            "When executing tasks, prioritize autonomous completion. However, if you encounter a definitive blocker "
            "that prevents you from proceeding independently – such as needing credentials you don't possess, "
            "requiring subjective human judgment, needing a physical action performed, encountering complex CAPTCHAs, "
            "or facing limitations in your capabilities – you must request human assistance."
        )
        async def ask_for_assistant(query: str, browser: BrowserContext):
            # Route the question through the configured callback, sync or async.
            if self.ask_assistant_callback:
                if inspect.iscoroutinefunction(self.ask_assistant_callback):
                    user_response = await self.ask_assistant_callback(query, browser)
                else:
                    user_response = self.ask_assistant_callback(query, browser)
                # NOTE(review): assumes the callback returns a dict with a 'response' key — confirm.
                msg = f"AI ask: {query}. User response: {user_response['response']}"
                logger.info(msg)
                return ActionResult(extracted_content=msg, include_in_memory=True)
            else:
                # No human in the loop configured; tell the agent to find another way.
                return ActionResult(extracted_content="Human cannot help you. Please try another way.",
                                    include_in_memory=True)

        @self.registry.action(
            'Upload file to interactive element with file path ',
        )
        async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
            # Only allow paths explicitly whitelisted by the caller.
            if path not in available_file_paths:
                return ActionResult(error=f'File path {path} is not available')

            if not os.path.exists(path):
                return ActionResult(error=f'File {path} does not exist')

            dom_el = await browser.get_dom_element_by_index(index)

            file_upload_dom_el = dom_el.get_file_upload_element()

            if file_upload_dom_el is None:
                msg = f'No file upload element found at index {index}'
                logger.info(msg)
                return ActionResult(error=msg)

            file_upload_el = await browser.get_locate_element(file_upload_dom_el)

            if file_upload_el is None:
                msg = f'No file upload element found at index {index}'
                logger.info(msg)
                return ActionResult(error=msg)

            try:
                await file_upload_el.set_input_files(path)
                msg = f'Successfully uploaded file to index {index}'
                logger.info(msg)
                return ActionResult(extracted_content=msg, include_in_memory=True)
            except Exception as e:
                msg = f'Failed to upload file to index {index}: {str(e)}'
                logger.info(msg)
                return ActionResult(error=msg)
108 |
109 | @time_execution_sync('--act')
110 | async def act(
111 | self,
112 | action: ActionModel,
113 | browser_context: Optional[BrowserContext] = None,
114 | #
115 | page_extraction_llm: Optional[BaseChatModel] = None,
116 | sensitive_data: Optional[Dict[str, str]] = None,
117 | available_file_paths: Optional[list[str]] = None,
118 | #
119 | context: Context | None = None,
120 | ) -> ActionResult:
121 | """Execute an action"""
122 |
123 | try:
124 | for action_name, params in action.model_dump(exclude_unset=True).items():
125 | if params is not None:
126 | if action_name.startswith("mcp"):
127 | # this is a mcp tool
128 | logger.debug(f"Invoke MCP tool: {action_name}")
129 | mcp_tool = self.registry.registry.actions.get(action_name).function
130 | result = await mcp_tool.ainvoke(params)
131 | else:
132 | result = await self.registry.execute_action(
133 | action_name,
134 | params,
135 | browser=browser_context,
136 | page_extraction_llm=page_extraction_llm,
137 | sensitive_data=sensitive_data,
138 | available_file_paths=available_file_paths,
139 | context=context,
140 | )
141 |
142 | if isinstance(result, str):
143 | return ActionResult(extracted_content=result)
144 | elif isinstance(result, ActionResult):
145 | return result
146 | elif result is None:
147 | return ActionResult()
148 | else:
149 | raise ValueError(f'Invalid action result type: {type(result)} of {result}')
150 | return ActionResult()
151 | except Exception as e:
152 | raise e
153 |
154 | async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None):
155 | self.mcp_server_config = mcp_server_config
156 | if self.mcp_server_config:
157 | self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config)
158 | self.register_mcp_tools()
159 |
160 | def register_mcp_tools(self):
161 | """
162 | Register the MCP tools used by this controller.
163 | """
164 | if self.mcp_client:
165 | for server_name in self.mcp_client.server_name_to_tools:
166 | for tool in self.mcp_client.server_name_to_tools[server_name]:
167 | tool_name = f"mcp.{server_name}.{tool.name}"
168 | self.registry.registry.actions[tool_name] = RegisteredAction(
169 | name=tool_name,
170 | description=tool.description,
171 | function=tool,
172 | param_model=create_tool_param_model(tool),
173 | )
174 | logger.info(f"Add mcp tool: {tool_name}")
175 |
176 | async def close_mcp_client(self):
177 | if self.mcp_client:
178 | await self.mcp_client.__aexit__(None, None, None)
179 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/utils/mcp_client.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import asyncio
3 | import base64
4 | import pdb
5 | from typing import List, Tuple, Optional
6 | from langchain_core.tools import BaseTool
7 | from langchain_mcp_adapters.client import MultiServerMCPClient
8 | import base64
9 | import json
10 | import logging
11 | from typing import Optional, Dict, Any, Type
12 | from langchain_core.tools import BaseTool
13 | from pydantic.v1 import BaseModel, Field
14 | from langchain_core.runnables import RunnableConfig
15 | from pydantic import BaseModel, Field, create_model
16 | from typing import Type, Dict, Any, Optional, get_type_hints, List, Union, Annotated, Set
17 | from pydantic import BaseModel, ConfigDict, create_model, Field
18 | from langchain.tools import BaseTool
19 | import inspect
20 | from datetime import datetime, date, time
21 | import uuid
22 | from enum import Enum
23 | import inspect
24 | from browser_use.controller.registry.views import ActionModel
25 | from typing import Type, Dict, Any, Optional, get_type_hints
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 |
async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optional[MultiServerMCPClient]:
    """Create and start a ``MultiServerMCPClient`` from a server configuration.

    Accepts either the bare server mapping or a wrapper dict with an
    ``mcpServers`` key (the standard MCP config-file layout).

    Returns:
        The started ``MultiServerMCPClient`` instance, or ``None`` when no
        configuration was given or the client failed to start.
    """
    logger.info("Initializing MultiServerMCPClient...")

    if not mcp_server_config:
        logger.error("No MCP server configuration provided.")
        return None

    try:
        # Unwrap the standard config-file layout if present.
        server_map = mcp_server_config.get("mcpServers", mcp_server_config)
        client = MultiServerMCPClient(server_map)
        await client.__aenter__()
        return client
    except Exception as e:
        logger.error(f"Failed to setup MCP client or fetch tools: {e}", exc_info=True)
        return None
57 |
58 |
def create_tool_param_model(tool: BaseTool) -> Type[BaseModel]:
    """Create a Pydantic parameter model (an ``ActionModel`` subclass) for a LangChain tool.

    Prefers the tool's ``args_schema`` when present; otherwise falls back to
    introspecting the signature of the tool's ``_run`` method.

    NOTE(review): the schema branch assumes ``tool.args_schema`` is a
    dict-like JSON schema (as produced by langchain-mcp-adapters), not a
    ``BaseModel`` class — confirm for tools from other sources.
    """

    # Get tool schema information
    json_schema = tool.args_schema
    tool_name = tool.name

    # If the tool already has a schema defined, convert it to a new param_model
    if json_schema is not None:

        # Create new parameter model
        params = {}

        # Process properties if they exist
        if 'properties' in json_schema:
            # Find required fields
            required_fields: Set[str] = set(json_schema.get('required', []))

            for prop_name, prop_details in json_schema['properties'].items():
                # Resolve the JSON-schema type to a Python/Pydantic annotation.
                field_type = resolve_type(prop_details, f"{tool_name}_{prop_name}")

                # Check if parameter is required
                is_required = prop_name in required_fields

                # Get default value and description (Ellipsis marks "required" for Pydantic)
                default_value = prop_details.get('default', ... if is_required else None)
                description = prop_details.get('description', '')

                # Add field constraints
                field_kwargs = {'default': default_value}
                if description:
                    field_kwargs['description'] = description

                # Add additional constraints if present (mapped to Pydantic Field kwargs)
                if 'minimum' in prop_details:
                    field_kwargs['ge'] = prop_details['minimum']
                if 'maximum' in prop_details:
                    field_kwargs['le'] = prop_details['maximum']
                if 'minLength' in prop_details:
                    field_kwargs['min_length'] = prop_details['minLength']
                if 'maxLength' in prop_details:
                    field_kwargs['max_length'] = prop_details['maxLength']
                if 'pattern' in prop_details:
                    field_kwargs['pattern'] = prop_details['pattern']

                # Add to parameters dictionary
                params[prop_name] = (field_type, Field(**field_kwargs))

        return create_model(
            f'{tool_name}_parameters',
            __base__=ActionModel,
            **params,  # type: ignore
        )

    # If no schema is defined, extract parameters from the _run method
    run_method = tool._run
    sig = inspect.signature(run_method)

    # Get type hints for better type information
    try:
        type_hints = get_type_hints(run_method)
    except Exception:
        # get_type_hints can fail on unresolved forward references; fall back to raw annotations.
        type_hints = {}

    params = {}
    for name, param in sig.parameters.items():
        # Skip 'self' parameter and any other parameters you want to exclude
        if name == 'self':
            continue

        # Get annotation from type hints if available, otherwise from signature
        annotation = type_hints.get(name, param.annotation)
        if annotation == inspect.Parameter.empty:
            annotation = Any

        # Use default value if available, otherwise make it required
        if param.default != param.empty:
            params[name] = (annotation, param.default)
        else:
            params[name] = (annotation, ...)

    return create_model(
        f'{tool_name}_parameters',
        __base__=ActionModel,
        **params,  # type: ignore
    )
145 |
146 |
def resolve_type(prop_details: Dict[str, Any], prefix: str = "") -> Any:
    """Recursively resolve a JSON-schema property to a Python/Pydantic type.

    Args:
        prop_details: JSON-schema fragment describing one property.
        prefix: Prefix used for names of dynamically created enums/models so
            generated type names stay unique across tools and nesting levels.

    Returns:
        A Python annotation: a builtin, ``Optional``/``Union``/``List``
        variant, a dynamically created ``Enum`` or Pydantic model, or ``Any``
        when the schema cannot be resolved.
    """
    # Handle reference types
    if '$ref' in prop_details:
        # In a real application, reference resolution would be needed
        return Any

    # Basic type mapping
    type_mapping = {
        'string': str,
        'integer': int,
        'number': float,
        'boolean': bool,
        'array': List,
        'object': Dict,
        'null': type(None),
    }

    # Handle formatted strings
    if prop_details.get('type') == 'string' and 'format' in prop_details:
        format_mapping = {
            'date-time': datetime,
            'date': date,
            'time': time,
            'email': str,
            'uri': str,
            'url': str,
            'uuid': uuid.UUID,
            'binary': bytes,
        }
        return format_mapping.get(prop_details['format'], str)

    # Handle enum types
    if 'enum' in prop_details:
        enum_values = prop_details['enum']
        # Create dynamic enum class with safe member names
        enum_dict = {}
        for i, v in enumerate(enum_values):
            # Ensure enum names are valid Python identifiers
            if isinstance(v, str):
                key = v.upper().replace(' ', '_').replace('-', '_')
                if not key.isidentifier():
                    key = f"VALUE_{i}"
            else:
                key = f"VALUE_{i}"
            # Bug fix: distinct values can normalize to the same key (e.g.
            # "a b" and "a-b" both become "A_B"), which previously dropped
            # all but the last such value. Disambiguate with the index; the
            # loop strictly lengthens the key, so it always terminates.
            while key in enum_dict:
                key = f"{key}_{i}"
            enum_dict[key] = v

        # Only create enum if we have values
        if enum_dict:
            return Enum(f"{prefix}_Enum", enum_dict)
        return str  # Fallback

    # Handle array types
    if prop_details.get('type') == 'array' and 'items' in prop_details:
        item_type = resolve_type(prop_details['items'], f"{prefix}_item")
        return List[item_type]  # type: ignore

    # Handle object types with properties
    if prop_details.get('type') == 'object' and 'properties' in prop_details:
        nested_params = {}
        for nested_name, nested_details in prop_details['properties'].items():
            nested_type = resolve_type(nested_details, f"{prefix}_{nested_name}")
            # Get required field info (Ellipsis marks "required" for Pydantic)
            required_fields = prop_details.get('required', [])
            is_required = nested_name in required_fields
            default_value = nested_details.get('default', ... if is_required else None)
            description = nested_details.get('description', '')

            field_kwargs = {'default': default_value}
            if description:
                field_kwargs['description'] = description

            nested_params[nested_name] = (nested_type, Field(**field_kwargs))

        # Create nested model
        nested_model = create_model(f"{prefix}_Model", **nested_params)
        return nested_model

    # Handle union types (oneOf, anyOf)
    if 'oneOf' in prop_details or 'anyOf' in prop_details:
        union_schema = prop_details.get('oneOf') or prop_details.get('anyOf')
        union_types = []
        for i, t in enumerate(union_schema):
            union_types.append(resolve_type(t, f"{prefix}_{i}"))

        if union_types:
            return Union.__getitem__(tuple(union_types))  # type: ignore
        return Any

    # Handle allOf (intersection types)
    if 'allOf' in prop_details:
        nested_params = {}
        for i, schema_part in enumerate(prop_details['allOf']):
            if 'properties' in schema_part:
                for nested_name, nested_details in schema_part['properties'].items():
                    nested_type = resolve_type(nested_details, f"{prefix}_allOf_{i}_{nested_name}")
                    # Check if required
                    required_fields = schema_part.get('required', [])
                    is_required = nested_name in required_fields
                    nested_params[nested_name] = (nested_type, ... if is_required else None)

        # Create composite model
        if nested_params:
            composite_model = create_model(f"{prefix}_CompositeModel", **nested_params)
            return composite_model
        return Dict

    # Default to basic types
    schema_type = prop_details.get('type', 'string')
    if isinstance(schema_type, list):
        # Handle multiple types (e.g., ["string", "null"])
        non_null_types = [t for t in schema_type if t != 'null']
        if non_null_types:
            primary_type = type_mapping.get(non_null_types[0], Any)
            if 'null' in schema_type:
                return Optional[primary_type]  # type: ignore
            return primary_type
        return Any

    return type_mapping.get(schema_type, Any)
268 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/_internal/utils/llm_provider.py:
--------------------------------------------------------------------------------
```python
1 | from openai import OpenAI
2 | import pdb
3 | from langchain_openai import ChatOpenAI
4 | from langchain_core.globals import get_llm_cache
5 | from langchain_core.language_models.base import (
6 | BaseLanguageModel,
7 | LangSmithParams,
8 | LanguageModelInput,
9 | )
10 | import os
11 | from langchain_core.load import dumpd, dumps
12 | from langchain_core.messages import (
13 | AIMessage,
14 | SystemMessage,
15 | AnyMessage,
16 | BaseMessage,
17 | BaseMessageChunk,
18 | HumanMessage,
19 | convert_to_messages,
20 | message_chunk_to_message,
21 | )
22 | from langchain_core.outputs import (
23 | ChatGeneration,
24 | ChatGenerationChunk,
25 | ChatResult,
26 | LLMResult,
27 | RunInfo,
28 | )
29 | from langchain_ollama import ChatOllama
30 | from langchain_core.output_parsers.base import OutputParserLike
31 | from langchain_core.runnables import Runnable, RunnableConfig
32 | from langchain_core.tools import BaseTool
33 |
34 | from typing import (
35 | TYPE_CHECKING,
36 | Any,
37 | Callable,
38 | Literal,
39 | Optional,
40 | Union,
41 | cast, List,
42 | )
43 | from langchain_anthropic import ChatAnthropic
44 | from langchain_mistralai import ChatMistralAI
45 | from langchain_google_genai import ChatGoogleGenerativeAI
46 | from langchain_ollama import ChatOllama
47 | from langchain_openai import AzureChatOpenAI, ChatOpenAI
48 | from langchain_ibm import ChatWatsonx
49 | from langchain_aws import ChatBedrock
50 | from pydantic import SecretStr
51 |
52 | from ..utils import config
53 |
54 |
class DeepSeekR1ChatOpenAI(ChatOpenAI):
    """ChatOpenAI variant for DeepSeek R1 that surfaces ``reasoning_content``.

    Uses a raw ``openai.OpenAI`` client so the provider-specific
    ``reasoning_content`` field on the completion message is preserved,
    which LangChain's standard response parsing would drop.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Dedicated raw client; mirrors the base_url/api_key passed to the
        # LangChain constructor.
        self.client = OpenAI(
            base_url=kwargs.get("base_url"),
            api_key=kwargs.get("api_key")
        )

    def _chat(self, input: LanguageModelInput) -> AIMessage:
        """Shared completion call used by both ``invoke`` and ``ainvoke``.

        Converts the LangChain messages to OpenAI chat format, calls the raw
        client, and returns an ``AIMessage`` carrying ``reasoning_content``.
        """
        message_history = []
        for input_ in input:
            if isinstance(input_, SystemMessage):
                message_history.append({"role": "system", "content": input_.content})
            elif isinstance(input_, AIMessage):
                message_history.append({"role": "assistant", "content": input_.content})
            else:
                message_history.append({"role": "user", "content": input_.content})

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=message_history
        )

        reasoning_content = response.choices[0].message.reasoning_content
        content = response.choices[0].message.content
        return AIMessage(content=content, reasoning_content=reasoning_content)

    async def ainvoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        # NOTE(review): this performs a blocking HTTP call on the event loop;
        # consider openai.AsyncOpenAI if this ever becomes a bottleneck.
        return self._chat(input)

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        return self._chat(input)
115 |
116 |
def _split_deepseek_reasoning(raw: str) -> "tuple[str, str]":
    """Split a DeepSeek R1 response into ``(reasoning, answer)``.

    R1 emits ``<think>...</think>`` followed by the answer; some prompts also
    yield a ``**JSON Response:**`` marker before the JSON payload. When no
    closing ``</think>`` tag is present, the whole text is treated as the
    answer (the previous inline code raised IndexError in that case).
    """
    parts = raw.split("</think>")
    if len(parts) > 1:
        reasoning = parts[0].replace("<think>", "")
        content = parts[1]
    else:
        reasoning, content = "", raw
    if "**JSON Response:**" in content:
        content = content.split("**JSON Response:**")[-1]
    return reasoning, content


class DeepSeekR1ChatOllama(ChatOllama):
    """ChatOllama variant for DeepSeek R1 models that extracts the
    ``<think>`` reasoning block into ``reasoning_content``."""

    async def ainvoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        org_ai_message = await super().ainvoke(input=input)
        reasoning_content, content = _split_deepseek_reasoning(org_ai_message.content)
        return AIMessage(content=content, reasoning_content=reasoning_content)

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        org_ai_message = super().invoke(input=input)
        reasoning_content, content = _split_deepseek_reasoning(org_ai_message.content)
        return AIMessage(content=content, reasoning_content=reasoning_content)
150 |
151 |
def get_llm_model(provider: str, **kwargs):
    """
    Instantiate a LangChain chat model for the given provider.

    :param provider: Provider id, e.g. "openai", "anthropic", "google", "ollama".
    :param kwargs: Optional overrides: model_name, temperature, base_url,
        api_key, num_ctx / num_predict (Ollama), api_version (Azure).
    :return: A configured LangChain chat model instance.
    :raises ValueError: If a required API key cannot be found, or the
        provider id is not supported.
    """
    # Providers exempt from the API-key requirement (local / IAM-authenticated).
    # NOTE(review): "bedrock" is exempted here but has no branch below, so it
    # falls through to the final "Unsupported provider" ValueError — confirm intent.
    if provider not in ["ollama", "bedrock"]:
        env_var = f"{provider.upper()}_API_KEY"
        api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
        if not api_key:
            provider_display = config.PROVIDER_DISPLAY_NAMES.get(provider, provider.upper())
            error_msg = f"💥 {provider_display} API key not found! 🔑 Please set the `{env_var}` environment variable or provide it in the UI."
            raise ValueError(error_msg)
        kwargs["api_key"] = api_key

    if provider == "anthropic":
        if not kwargs.get("base_url", ""):
            base_url = "https://api.anthropic.com"
        else:
            base_url = kwargs.get("base_url")

        return ChatAnthropic(
            model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == 'mistral':
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1")
        else:
            base_url = kwargs.get("base_url")
        if not kwargs.get("api_key", ""):
            api_key = os.getenv("MISTRAL_API_KEY", "")
        else:
            api_key = kwargs.get("api_key")

        return ChatMistralAI(
            model=kwargs.get("model_name", "mistral-large-latest"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "openai":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
        else:
            base_url = kwargs.get("base_url")

        return ChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "deepseek":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
        else:
            base_url = kwargs.get("base_url")

        # The reasoner model needs the custom wrapper that preserves reasoning_content.
        if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
            return DeepSeekR1ChatOpenAI(
                model=kwargs.get("model_name", "deepseek-reasoner"),
                temperature=kwargs.get("temperature", 0.0),
                base_url=base_url,
                api_key=api_key,
            )
        else:
            return ChatOpenAI(
                model=kwargs.get("model_name", "deepseek-chat"),
                temperature=kwargs.get("temperature", 0.0),
                base_url=base_url,
                api_key=api_key,
            )
    elif provider == "google":
        return ChatGoogleGenerativeAI(
            model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
            temperature=kwargs.get("temperature", 0.0),
            api_key=api_key,
        )
    elif provider == "ollama":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
        else:
            base_url = kwargs.get("base_url")

        # DeepSeek R1 models served by Ollama need the <think>-tag parsing wrapper.
        if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
            return DeepSeekR1ChatOllama(
                model=kwargs.get("model_name", "deepseek-r1:14b"),
                temperature=kwargs.get("temperature", 0.0),
                num_ctx=kwargs.get("num_ctx", 32000),
                base_url=base_url,
            )
        else:
            return ChatOllama(
                model=kwargs.get("model_name", "qwen2.5:7b"),
                temperature=kwargs.get("temperature", 0.0),
                num_ctx=kwargs.get("num_ctx", 32000),
                num_predict=kwargs.get("num_predict", 1024),
                base_url=base_url,
            )
    elif provider == "azure_openai":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        else:
            base_url = kwargs.get("base_url")
        api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
        return AzureChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o"),
            temperature=kwargs.get("temperature", 0.0),
            api_version=api_version,
            azure_endpoint=base_url,
            api_key=api_key,
        )
    elif provider == "alibaba":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1")
        else:
            base_url = kwargs.get("base_url")

        return ChatOpenAI(
            model=kwargs.get("model_name", "qwen-plus"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "ibm":
        parameters = {
            "temperature": kwargs.get("temperature", 0.0),
            "max_tokens": kwargs.get("num_ctx", 32000)
        }
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com")
        else:
            base_url = kwargs.get("base_url")

        return ChatWatsonx(
            model_id=kwargs.get("model_name", "ibm/granite-vision-3.1-2b-preview"),
            url=base_url,
            project_id=os.getenv("IBM_PROJECT_ID"),
            apikey=os.getenv("IBM_API_KEY"),
            params=parameters
        )
    elif provider == "moonshot":
        return ChatOpenAI(
            model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=os.getenv("MOONSHOT_ENDPOINT"),
            api_key=os.getenv("MOONSHOT_API_KEY"),
        )
    elif provider == "unbound":
        return ChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o-mini"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
            api_key=api_key,
        )
    elif provider == "siliconflow":
        # NOTE(review): env var casing "SiliconFLOW_*" differs from the
        # "SILICONFLOW_API_KEY" checked generically above — on case-sensitive
        # systems both variables must be set consistently. Confirm intended names.
        if not kwargs.get("api_key", ""):
            api_key = os.getenv("SiliconFLOW_API_KEY", "")
        else:
            api_key = kwargs.get("api_key")
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("SiliconFLOW_ENDPOINT", "")
        else:
            base_url = kwargs.get("base_url")
        return ChatOpenAI(
            api_key=api_key,
            base_url=base_url,
            model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
            temperature=kwargs.get("temperature", 0.0),
        )
    else:
        raise ValueError(f"Unsupported provider: {provider}")
328 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/config.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Any, Dict, List, Optional, Union
2 |
3 | from pydantic import Field, SecretStr, field_validator, ValidationInfo
4 | from pydantic_settings import BaseSettings, SettingsConfigDict
5 |
6 |
class LLMSettings(BaseSettings):
    """LLM configuration read from ``MCP_LLM_``-prefixed environment variables.

    Holds the main model selection plus optional provider-specific API keys,
    endpoints, and a secondary "planner" model configuration.

    NOTE(review): with pydantic-settings v2 the ``env=`` argument to ``Field``
    is not honored; variables resolve as ``env_prefix + field name``
    (e.g. ``MCP_LLM_PROVIDER``) — confirm the intended variable names.
    """
    model_config = SettingsConfigDict(env_prefix="MCP_LLM_")

    provider: str = Field(default="google", env="PROVIDER")
    model_name: str = Field(default="gemini-2.5-flash-preview-04-17", env="MODEL_NAME")
    temperature: float = Field(default=0.0, env="TEMPERATURE")
    base_url: Optional[str] = Field(default=None, env="BASE_URL")
    api_key: Optional[SecretStr] = Field(default=None, env="API_KEY")  # Generic API key

    # Provider-specific API keys
    openai_api_key: Optional[SecretStr] = Field(default=None, env="OPENAI_API_KEY")
    anthropic_api_key: Optional[SecretStr] = Field(default=None, env="ANTHROPIC_API_KEY")
    google_api_key: Optional[SecretStr] = Field(default=None, env="GOOGLE_API_KEY")
    azure_openai_api_key: Optional[SecretStr] = Field(default=None, env="AZURE_OPENAI_API_KEY")
    deepseek_api_key: Optional[SecretStr] = Field(default=None, env="DEEPSEEK_API_KEY")
    mistral_api_key: Optional[SecretStr] = Field(default=None, env="MISTRAL_API_KEY")
    openrouter_api_key: Optional[SecretStr] = Field(default=None, env="OPENROUTER_API_KEY")
    alibaba_api_key: Optional[SecretStr] = Field(default=None, env="ALIBABA_API_KEY")
    moonshot_api_key: Optional[SecretStr] = Field(default=None, env="MOONSHOT_API_KEY")
    unbound_api_key: Optional[SecretStr] = Field(default=None, env="UNBOUND_API_KEY")

    # Provider-specific endpoints
    openai_endpoint: Optional[str] = Field(default=None, env="OPENAI_ENDPOINT")
    anthropic_endpoint: Optional[str] = Field(default=None, env="ANTHROPIC_ENDPOINT")
    azure_openai_endpoint: Optional[str] = Field(default=None, env="AZURE_OPENAI_ENDPOINT")
    azure_openai_api_version: str = Field(default="2025-01-01-preview", env="AZURE_OPENAI_API_VERSION")
    deepseek_endpoint: Optional[str] = Field(default=None, env="DEEPSEEK_ENDPOINT")
    mistral_endpoint: Optional[str] = Field(default=None, env="MISTRAL_ENDPOINT")
    ollama_endpoint: str = Field(default="http://localhost:11434", env="OLLAMA_ENDPOINT")
    openrouter_endpoint: str = Field(default="https://openrouter.ai/api/v1", env="OPENROUTER_ENDPOINT")
    alibaba_endpoint: Optional[str] = Field(default=None, env="ALIBABA_ENDPOINT")
    moonshot_endpoint: Optional[str] = Field(default=None, env="MOONSHOT_ENDPOINT")
    unbound_endpoint: Optional[str] = Field(default=None, env="UNBOUND_ENDPOINT")

    # Ollama generation parameters
    ollama_num_ctx: Optional[int] = Field(default=32000, env="OLLAMA_NUM_CTX")
    ollama_num_predict: Optional[int] = Field(default=1024, env="OLLAMA_NUM_PREDICT")

    # Planner LLM settings (optional, defaults to main LLM if not set)
    planner_provider: Optional[str] = Field(default=None, env="PLANNER_PROVIDER")
    planner_model_name: Optional[str] = Field(default=None, env="PLANNER_MODEL_NAME")
    planner_temperature: Optional[float] = Field(default=None, env="PLANNER_TEMPERATURE")
    planner_base_url: Optional[str] = Field(default=None, env="PLANNER_BASE_URL")
    planner_api_key: Optional[SecretStr] = Field(default=None, env="PLANNER_API_KEY")
50 |
51 |
class BrowserSettings(BaseSettings):
    """Browser configuration read from ``MCP_BROWSER_``-prefixed environment variables."""
    model_config = SettingsConfigDict(env_prefix="MCP_BROWSER_")

    headless: bool = Field(default=False, env="HEADLESS")  # General headless
    disable_security: bool = Field(default=False, env="DISABLE_SECURITY")  # General disable security
    binary_path: Optional[str] = Field(default=None, env="BINARY_PATH")
    user_data_dir: Optional[str] = Field(default=None, env="USER_DATA_DIR")
    window_width: int = Field(default=1280, env="WINDOW_WIDTH")
    window_height: int = Field(default=1080, env="WINDOW_HEIGHT")
    use_own_browser: bool = Field(default=False, env="USE_OWN_BROWSER")
    # CDP/WSS endpoints for attaching to an externally managed browser.
    cdp_url: Optional[str] = Field(default=None, env="CDP_URL")
    wss_url: Optional[str] = Field(default=None, env="WSS_URL")  # For CDP connection if needed
    keep_open: bool = Field(default=False, env="KEEP_OPEN")  # Server-managed browser persistence
    trace_path: Optional[str] = Field(default=None, env="TRACE_PATH")
66 |
67 |
class AgentToolSettings(BaseSettings):
    """Browser-agent tool configuration (``MCP_AGENT_TOOL_`` environment variables)."""
    model_config = SettingsConfigDict(env_prefix="MCP_AGENT_TOOL_")

    max_steps: int = Field(default=100, env="MAX_STEPS")
    max_actions_per_step: int = Field(default=5, env="MAX_ACTIONS_PER_STEP")
    tool_calling_method: Optional[str] = Field(default="auto", env="TOOL_CALLING_METHOD")
    max_input_tokens: Optional[int] = Field(default=128000, env="MAX_INPUT_TOKENS")
    use_vision: bool = Field(default=True, env="USE_VISION")

    # Browser settings specific to this tool, can override general MCP_BROWSER_ settings
    headless: Optional[bool] = Field(default=None, env="HEADLESS")
    disable_security: Optional[bool] = Field(default=None, env="DISABLE_SECURITY")

    # Recording / history persistence for agent runs
    enable_recording: bool = Field(default=False, env="ENABLE_RECORDING")
    save_recording_path: Optional[str] = Field(default=None, env="SAVE_RECORDING_PATH")  # e.g. ./tmp/recordings
    history_path: Optional[str] = Field(default=None, env="HISTORY_PATH")  # e.g. ./tmp/agent_history
84 |
85 |
class DeepResearchToolSettings(BaseSettings):
    """Deep-research tool configuration (``MCP_RESEARCH_TOOL_`` environment variables)."""
    model_config = SettingsConfigDict(env_prefix="MCP_RESEARCH_TOOL_")

    max_parallel_browsers: int = Field(default=3, env="MAX_PARALLEL_BROWSERS")
    save_dir: Optional[str] = Field(default=None, env="SAVE_DIR")  # Base dir, task_id will be appended. Optional now.
91 |
92 |
class PathSettings(BaseSettings):
    """Filesystem path configuration (``MCP_PATHS_`` environment variables)."""
    model_config = SettingsConfigDict(env_prefix="MCP_PATHS_")
    downloads: Optional[str] = Field(default=None, env="DOWNLOADS")  # e.g. ./tmp/downloads
96 |
97 |
class ServerSettings(BaseSettings):
    """MCP server configuration (``MCP_SERVER_`` environment variables)."""
    model_config = SettingsConfigDict(env_prefix="MCP_SERVER_")
    log_file: Optional[str] = Field(default=None, env="LOG_FILE")
    logging_level: str = Field(default="ERROR", env="LOGGING_LEVEL")
    anonymized_telemetry: bool = Field(default=True, env="ANONYMIZED_TELEMETRY")
    # When supplied via environment, presumably a JSON string parsed into a dict — TODO confirm.
    mcp_config: Optional[Dict[str, Any]] = Field(default=None, env="MCP_CONFIG")  # For controller's MCP client
104 |
105 |
class AppSettings(BaseSettings):
    """Root application settings aggregating all nested settings groups.

    Each nested group reads its own ``MCP_*``-prefixed environment variables
    via its ``env_prefix``.
    """
    model_config = SettingsConfigDict(env_prefix="MCP_", extra='ignore') # Root prefix

    llm: LLMSettings = Field(default_factory=LLMSettings)
    browser: BrowserSettings = Field(default_factory=BrowserSettings)
    agent_tool: AgentToolSettings = Field(default_factory=AgentToolSettings)
    research_tool: DeepResearchToolSettings = Field(default_factory=DeepResearchToolSettings)
    paths: PathSettings = Field(default_factory=PathSettings)
    server: ServerSettings = Field(default_factory=ServerSettings)

    @field_validator('server', 'llm', 'browser', 'agent_tool', 'research_tool', 'paths', mode='before')
    @classmethod
    def ensure_nested_defaults(cls, v: Any) -> Any:
        # This ensures that even if MCP_SERVER__LOG_FILE is set but MCP_SERVER is not,
        # the ServerSettings object is still created.
        # Pydantic-settings usually handles this, but being explicit can help.
        if v is None:
            return {}
        return v
125 |
126 | def get_api_key_for_provider(self, provider_name: Optional[str], is_planner: bool = False) -> Optional[str]:
127 | """Retrieves the API key for a given provider, checking generic, then specific."""
128 | llm_settings_to_use = self.llm
129 | provider_to_use = provider_name if provider_name else (self.llm.planner_provider if is_planner else self.llm.provider)
130 |
131 | if is_planner:
132 | if self.llm.planner_api_key:
133 | return self.llm.planner_api_key.get_secret_value()
134 | # Fallback to main LLM settings if planner-specific key is not set, but provider is
135 | if self.llm.planner_provider and not self.llm.planner_api_key:
136 | llm_settings_to_use = self.llm # Check main llm settings for this provider
137 | # if no planner provider, it will use main llm provider and its key
138 |
139 | if not provider_to_use: # Should not happen if called correctly
140 | return None
141 |
142 | # Check generic API key first for the relevant LLM settings (main or planner if planner_api_key was set)
143 | if not is_planner and llm_settings_to_use.api_key: # only main LLM has generic api_key
144 | return llm_settings_to_use.api_key.get_secret_value()
145 |
146 | provider_specific_key_name = f"{provider_to_use.lower()}_api_key"
147 | if hasattr(llm_settings_to_use, provider_specific_key_name):
148 | key_val = getattr(llm_settings_to_use, provider_specific_key_name)
149 | if key_val and isinstance(key_val, SecretStr):
150 | return key_val.get_secret_value()
151 | return None
152 |
153 | def get_endpoint_for_provider(self, provider_name: Optional[str], is_planner: bool = False) -> Optional[str]:
154 | """Retrieves the endpoint for a given provider."""
155 | llm_settings_to_use = self.llm
156 | provider_to_use = provider_name if provider_name else (self.llm.planner_provider if is_planner else self.llm.provider)
157 |
158 | if is_planner:
159 | if self.llm.planner_base_url:
160 | return self.llm.planner_base_url
161 | if self.llm.planner_provider and not self.llm.planner_base_url:
162 | llm_settings_to_use = self.llm # Check main llm settings for this provider
163 |
164 | if not provider_to_use:
165 | return None
166 |
167 | if not is_planner and llm_settings_to_use.base_url: # only main LLM has generic base_url
168 | return llm_settings_to_use.base_url
169 |
170 | provider_specific_endpoint_name = f"{provider_to_use.lower()}_endpoint"
171 | if hasattr(llm_settings_to_use, provider_specific_endpoint_name):
172 | return getattr(llm_settings_to_use, provider_specific_endpoint_name)
173 | return None
174 |
175 | def get_llm_config(self, is_planner: bool = False) -> Dict[str, Any]:
176 | """Returns a dictionary of LLM settings suitable for llm_provider.get_llm_model."""
177 | provider = self.llm.planner_provider if is_planner and self.llm.planner_provider else self.llm.provider
178 | model_name = self.llm.planner_model_name if is_planner and self.llm.planner_model_name else self.llm.model_name
179 | temperature = self.llm.planner_temperature if is_planner and self.llm.planner_temperature is not None else self.llm.temperature
180 |
181 | api_key = self.get_api_key_for_provider(provider, is_planner=is_planner)
182 | base_url = self.get_endpoint_for_provider(provider, is_planner=is_planner)
183 |
184 | config = {
185 | "provider": provider,
186 | "model_name": model_name,
187 | "temperature": temperature,
188 | "api_key": api_key,
189 | "base_url": base_url,
190 | "use_vision": self.agent_tool.use_vision if not is_planner else False, # Planners typically don't need vision
191 | "tool_calling_method": self.agent_tool.tool_calling_method if not is_planner else "auto",
192 | "max_input_tokens": self.agent_tool.max_input_tokens if not is_planner else None,
193 | }
194 |
195 | if provider == "azure_openai":
196 | config["azure_openai_api_version"] = self.llm.azure_openai_api_version
197 | elif provider == "ollama":
198 | config["ollama_num_ctx"] = self.llm.ollama_num_ctx
199 | config["ollama_num_predict"] = self.llm.ollama_num_predict
200 | elif provider == "openrouter":
201 | config["provider"] = "openai"
202 |
203 | return config
204 |
205 | # Global settings instance, to be imported by other modules
206 | settings = AppSettings()  # built once at import time; raises if mandatory env vars are missing
207 | 
208 | # Example usage (for testing this file directly):
209 | if __name__ == "__main__":
210 |     try:
211 |         print("Loaded AppSettings:")
212 |         print(settings.model_dump_json(indent=2))
213 |         print(f"\nLLM API Key for main provider ({settings.llm.provider}): {settings.get_api_key_for_provider(settings.llm.provider)}")  # NOTE(review): prints the raw secret value — debug script only
214 |         if settings.llm.planner_provider:
215 |             print(f"LLM API Key for planner provider ({settings.llm.planner_provider}): {settings.get_api_key_for_provider(settings.llm.planner_provider, is_planner=True)}")
216 | 
217 |         print("\nMain LLM Config for get_llm_model:")
218 |         print(settings.get_llm_config())
219 |         if settings.llm.planner_provider:
220 |             print("\nPlanner LLM Config for get_llm_model:")
221 |             print(settings.get_llm_config(is_planner=True))
222 |     except Exception as e:
223 |         print(f"Error during settings load or test: {e}")
224 |         import os  # local import: only needed on this failure path
225 |         print("MCP_RESEARCH_TOOL_SAVE_DIR:", os.getenv("MCP_RESEARCH_TOOL_SAVE_DIR"))
226 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/cli.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import json
3 | import logging
4 | import os
5 | import sys
6 | import traceback
7 | import uuid
8 | from pathlib import Path
9 | from typing import Any, Dict, Optional
10 |
11 | import typer
12 | from dotenv import load_dotenv
13 |
14 | from .config import AppSettings, settings as global_settings # Import AppSettings and the global instance
15 | # Import from _internal
16 | from ._internal.agent.browser_use.browser_use_agent import BrowserUseAgent, AgentHistoryList
17 | from ._internal.agent.deep_research.deep_research_agent import DeepResearchAgent
18 | from ._internal.browser.custom_browser import CustomBrowser
19 | from ._internal.browser.custom_context import (
20 | CustomBrowserContext,
21 | CustomBrowserContextConfig,
22 | )
23 | from ._internal.controller.custom_controller import CustomController
24 | from ._internal.utils import llm_provider as internal_llm_provider
25 | from browser_use.browser.browser import BrowserConfig
26 | from browser_use.agent.views import AgentOutput
27 | from browser_use.browser.views import BrowserState
28 |
29 | app = typer.Typer(name="mcp-browser-cli", help="CLI for mcp-browser-use tools.")  # Typer application entry point
30 | logger = logging.getLogger("mcp_browser_cli")  # module-level CLI logger
31 | 
32 | class CLIState:  # Mutable holder for settings shared across Typer commands
33 |     settings: Optional[AppSettings] = None  # populated in main_callback()
34 | 
35 | cli_state = CLIState()  # single module-level instance used by all commands
36 |
37 | def setup_logging(level_str: str, log_file: Optional[str]):  # (Re)configure root logging for the CLI
38 |     numeric_level = getattr(logging, level_str.upper(), logging.INFO)  # unknown level names fall back to INFO
39 |     for handler in logging.root.handlers[:]:  # iterate over a copy while removing handlers
40 |         logging.root.removeHandler(handler)
41 |     logging.basicConfig(
42 |         level=numeric_level,
43 |         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
44 |         filename=log_file if log_file else None,  # log to file only when a path is given
45 |         filemode="a" if log_file else None,
46 |         force=True  # override any earlier basicConfig
47 |     )
48 |
49 | @app.callback()
50 | def main_callback(
51 |     ctx: typer.Context,
52 |     env_file: Optional[Path] = typer.Option(
53 |         None, "--env-file", "-e", help="Path to .env file to load.", exists=True, dir_okay=False, resolve_path=True
54 |     ),
55 |     log_level: Optional[str] = typer.Option(
56 |         None, "--log-level", "-l", help="Override logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)."
57 |     )
58 | ):
59 |     """
60 |     MCP Browser Use CLI. Settings are loaded from environment variables.
61 |     You can use an .env file for convenience.
62 |     """
63 |     if env_file:
64 |         load_dotenv(env_file, override=True)  # override=True: .env values win over pre-existing environment
65 |         logger.info(f"Loaded environment variables from: {env_file}")
66 | 
67 |     # Reload settings after .env might have been loaded and to apply overrides
68 |     try:
69 |         cli_state.settings = AppSettings()  # fresh instance so .env values are picked up
70 |     except Exception as e:
71 |         # This can happen if mandatory fields (like MCP_RESEARCH_TOOL_SAVE_DIR) are not set
72 |         sys.stderr.write(f"Error loading application settings: {e}\n")
73 |         sys.stderr.write("Please ensure all mandatory environment variables are set (e.g., MCP_RESEARCH_TOOL_SAVE_DIR).\n")
74 |         raise typer.Exit(code=1)
75 | 
76 |     # Setup logging based on final settings (env file, then env vars, then CLI override)
77 |     final_log_level = log_level if log_level else cli_state.settings.server.logging_level
78 |     final_log_file = cli_state.settings.server.log_file
79 |     setup_logging(final_log_level, final_log_file)
80 | 
81 |     logger.info(f"CLI initialized. Effective log level: {final_log_level.upper()}")
82 |     if not cli_state.settings: # Should not happen if AppSettings() worked  # defensive: AppSettings() above would have raised
83 |         logger.error("Failed to load application settings.")
84 |         raise typer.Exit(code=1)
85 |
86 |
87 | async def cli_ask_human_callback(query: str, browser_context: Any) -> Dict[str, Any]:
88 |     """Callback for agent to ask human for input via CLI."""
89 |     # browser_context is part of the signature from browser-use, might not be needed here
90 |     print(typer.style(f"\n🤖 AGENT ASKS: {query}", fg=typer.colors.YELLOW))
91 |     response_text = typer.prompt(typer.style("Your response", fg=typer.colors.CYAN))  # blocks the event loop until the user answers
92 |     return {"response": response_text}
93 |
94 | def cli_on_step_callback(browser_state: BrowserState, agent_output: AgentOutput, step_num: int):
95 |     """CLI callback for BrowserUseAgent steps."""
96 |     print(typer.style(f"\n--- Step {step_num} ---", fg=typer.colors.BLUE, bold=True))
97 |     # Print current state if available
98 |     if hasattr(agent_output, "current_state") and agent_output.current_state:
99 |         print(typer.style("🧠 Agent State:", fg=typer.colors.MAGENTA))
100 |         print(agent_output.current_state)
101 |     # Print actions
102 |     if hasattr(agent_output, "action") and agent_output.action:
103 |         print(typer.style("🎬 Actions:", fg=typer.colors.GREEN))
104 |         for action in agent_output.action:
105 |             # Try to get action_type and action_input if present, else print the action itself
106 |             action_type = getattr(action, "action_type", None)
107 |             action_input = getattr(action, "action_input", None)
108 |             if action_type is not None or action_input is not None:
109 |                 print(f"  - {action_type or 'Unknown action'}: {action_input or ''}")
110 |             else:
111 |                 print(f"  - {action}")  # fallback: rely on the action model's own repr
112 |     # Optionally print observation if present in browser_state
113 |     if hasattr(browser_state, "observation") and browser_state.observation:
114 |         obs = browser_state.observation
115 |         print(typer.style("👀 Observation:", fg=typer.colors.CYAN))
116 |         print(str(obs)[:200] + "..." if obs and len(str(obs)) > 200 else obs)  # truncate long observations to 200 chars
117 |
118 |
119 | async def _run_browser_agent_logic_cli(task_str: str, current_settings: AppSettings) -> str:
120 | logger.info(f"CLI: Starting run_browser_agent task: {task_str[:100]}...")
121 | agent_task_id = str(uuid.uuid4())
122 | final_result = "Error: Agent execution failed."
123 |
124 | browser_instance: Optional[CustomBrowser] = None
125 | context_instance: Optional[CustomBrowserContext] = None
126 | controller_instance: Optional[CustomController] = None
127 |
128 | try:
129 | # LLM Setup
130 | main_llm_config = current_settings.get_llm_config()
131 | main_llm = internal_llm_provider.get_llm_model(**main_llm_config)
132 | planner_llm = None
133 | if current_settings.llm.planner_provider and current_settings.llm.planner_model_name:
134 | planner_llm_config = current_settings.get_llm_config(is_planner=True)
135 | planner_llm = internal_llm_provider.get_llm_model(**planner_llm_config)
136 |
137 | # Controller Setup
138 | controller_instance = CustomController(ask_assistant_callback=cli_ask_human_callback)
139 | if current_settings.server.mcp_config:
140 | mcp_dict_config = current_settings.server.mcp_config
141 | if isinstance(current_settings.server.mcp_config, str):
142 | mcp_dict_config = json.loads(current_settings.server.mcp_config)
143 | await controller_instance.setup_mcp_client(mcp_dict_config)
144 |
145 | # Browser and Context Setup
146 | agent_headless_override = current_settings.agent_tool.headless
147 | browser_headless = agent_headless_override if agent_headless_override is not None else current_settings.browser.headless
148 | agent_disable_security_override = current_settings.agent_tool.disable_security
149 | browser_disable_security = agent_disable_security_override if agent_disable_security_override is not None else current_settings.browser.disable_security
150 |
151 | if current_settings.browser.use_own_browser and current_settings.browser.cdp_url:
152 | browser_cfg = BrowserConfig(cdp_url=current_settings.browser.cdp_url, wss_url=current_settings.browser.wss_url, user_data_dir=current_settings.browser.user_data_dir)
153 | else:
154 | browser_cfg = BrowserConfig(
155 | headless=browser_headless,
156 | disable_security=browser_disable_security,
157 | browser_binary_path=current_settings.browser.binary_path,
158 | user_data_dir=current_settings.browser.user_data_dir,
159 | window_width=current_settings.browser.window_width,
160 | window_height=current_settings.browser.window_height,
161 | )
162 | browser_instance = CustomBrowser(config=browser_cfg)
163 |
164 | context_cfg = CustomBrowserContextConfig(
165 | trace_path=current_settings.browser.trace_path,
166 | save_downloads_path=current_settings.paths.downloads,
167 | save_recording_path=current_settings.agent_tool.save_recording_path if current_settings.agent_tool.enable_recording else None,
168 | force_new_context=True # CLI always gets a new context
169 | )
170 | context_instance = await browser_instance.new_context(config=context_cfg)
171 |
172 | agent_history_json_file = None
173 | task_history_base_path = current_settings.agent_tool.history_path
174 |
175 | if task_history_base_path:
176 | task_specific_history_dir = Path(task_history_base_path) / agent_task_id
177 | task_specific_history_dir.mkdir(parents=True, exist_ok=True)
178 | agent_history_json_file = str(task_specific_history_dir / f"{agent_task_id}.json")
179 | logger.info(f"Agent history will be saved to: {agent_history_json_file}")
180 |
181 | # Agent Instantiation
182 | agent_instance = BrowserUseAgent(
183 | task=task_str, llm=main_llm,
184 | browser=browser_instance, browser_context=context_instance, controller=controller_instance,
185 | planner_llm=planner_llm,
186 | max_actions_per_step=current_settings.agent_tool.max_actions_per_step,
187 | use_vision=current_settings.agent_tool.use_vision,
188 | register_new_step_callback=cli_on_step_callback,
189 | )
190 |
191 | # Run Agent
192 | history: AgentHistoryList = await agent_instance.run(max_steps=current_settings.agent_tool.max_steps)
193 | agent_instance.save_history(agent_history_json_file)
194 | final_result = history.final_result() or "Agent finished without a final result."
195 | logger.info(f"CLI Agent task {agent_task_id} completed.")
196 |
197 | except Exception as e:
198 | logger.error(f"CLI Error in run_browser_agent: {e}\n{traceback.format_exc()}")
199 | final_result = f"Error: {e}"
200 | finally:
201 | if context_instance: await context_instance.close()
202 | if browser_instance and not current_settings.browser.use_own_browser : await browser_instance.close() # Only close if we launched it
203 | if controller_instance: await controller_instance.close_mcp_client()
204 |
205 | return final_result
206 |
207 |
208 | async def _run_deep_research_logic_cli(research_task_str: str, max_parallel_browsers_override: Optional[int], current_settings: AppSettings) -> str:
209 |     logger.info(f"CLI: Starting run_deep_research task: {research_task_str[:100]}...")  # runs deep research; returns report text or "Error: ..."
210 |     task_id = str(uuid.uuid4())  # also used as the save sub-directory name
211 |     report_content = "Error: Deep research failed."
212 | 
213 |     try:
214 |         main_llm_config = current_settings.get_llm_config()  # deep research reuses the main (non-planner) LLM config
215 |         research_llm = internal_llm_provider.get_llm_model(**main_llm_config)
216 | 
217 |         dr_browser_cfg = {
218 |             "headless": current_settings.browser.headless,
219 |             "disable_security": current_settings.browser.disable_security,
220 |             "browser_binary_path": current_settings.browser.binary_path,
221 |             "user_data_dir": current_settings.browser.user_data_dir,
222 |             "window_width": current_settings.browser.window_width,
223 |             "window_height": current_settings.browser.window_height,
224 |             "trace_path": current_settings.browser.trace_path,
225 |             "save_downloads_path": current_settings.paths.downloads,
226 |         }
227 |         if current_settings.browser.use_own_browser and current_settings.browser.cdp_url:
228 |             dr_browser_cfg["cdp_url"] = current_settings.browser.cdp_url  # sub-agents attach to the user-launched browser
229 |             dr_browser_cfg["wss_url"] = current_settings.browser.wss_url
230 | 
231 |         mcp_server_config_for_agent = None
232 |         if current_settings.server.mcp_config:
233 |             mcp_server_config_for_agent = current_settings.server.mcp_config
234 |             if isinstance(current_settings.server.mcp_config, str):  # config may arrive as a JSON string from env
235 |                 mcp_server_config_for_agent = json.loads(current_settings.server.mcp_config)
236 | 
237 |         agent_instance = DeepResearchAgent(
238 |             llm=research_llm, browser_config=dr_browser_cfg,
239 |             mcp_server_config=mcp_server_config_for_agent,
240 |         )
241 | 
242 |         current_max_parallel_browsers = max_parallel_browsers_override if max_parallel_browsers_override is not None else current_settings.research_tool.max_parallel_browsers
243 | 
244 |         save_dir_for_task = os.path.join(current_settings.research_tool.save_dir, task_id)
245 |         os.makedirs(save_dir_for_task, exist_ok=True)
246 | 
247 |         logger.info(f"CLI Deep research save directory: {save_dir_for_task}")
248 |         logger.info(f"CLI Using max_parallel_browsers: {current_max_parallel_browsers}")
249 | 
250 |         result_dict = await agent_instance.run(
251 |             topic=research_task_str, task_id=task_id,
252 |             save_dir=save_dir_for_task, max_parallel_browsers=current_max_parallel_browsers
253 |         )
254 | 
255 |         report_file_path = result_dict.get("report_file_path")
256 |         if report_file_path and os.path.exists(report_file_path):
257 |             with open(report_file_path, "r", encoding="utf-8") as f:
258 |                 markdown_content = f.read()
259 |             report_content = f"Deep research report generated successfully at {report_file_path}\n\n{markdown_content}"
260 |             logger.info(f"CLI Deep research task {task_id} completed. Report at {report_file_path}")
261 |         else:
262 |             report_content = f"Deep research completed, but report file not found. Result: {result_dict}"
263 |             logger.warning(f"CLI Deep research task {task_id} result: {result_dict}, report file path missing or invalid.")
264 | 
265 |     except Exception as e:
266 |         logger.error(f"CLI Error in run_deep_research: {e}\n{traceback.format_exc()}")
267 |         report_content = f"Error: {e}"
268 | 
269 |     return report_content
270 |
271 |
272 | @app.command()
273 | def run_browser_agent(
274 |     task: str = typer.Argument(..., help="The primary task or objective for the browser agent."),
275 | ):
276 |     """Runs a browser agent task and prints the result."""
277 |     if not cli_state.settings:  # settings are loaded by main_callback; missing means callback failed or was skipped
278 |         typer.secho("Error: Application settings not loaded. Use --env-file or set environment variables.", fg=typer.colors.RED)
279 |         raise typer.Exit(code=1)
280 | 
281 |     typer.secho(f"Executing browser agent task: {task}", fg=typer.colors.GREEN)
282 |     try:
283 |         result = asyncio.run(_run_browser_agent_logic_cli(task, cli_state.settings))  # sync command drives the async implementation
284 |         typer.secho("\n--- Agent Final Result ---", fg=typer.colors.BLUE, bold=True)
285 |         print(result)
286 |     except Exception as e:
287 |         typer.secho(f"CLI command failed: {e}", fg=typer.colors.RED)
288 |         logger.error(f"CLI run_browser_agent command failed: {e}\n{traceback.format_exc()}")
289 |         raise typer.Exit(code=1)
290 |
291 | @app.command()
292 | def run_deep_research(
293 |     research_task: str = typer.Argument(..., help="The topic or question for deep research."),
294 |     max_parallel_browsers: Optional[int] = typer.Option(None, "--max-parallel-browsers", "-p", help="Override max parallel browsers from settings.")
295 | ):
296 |     """Performs deep web research and prints the report."""
297 |     if not cli_state.settings:  # settings are loaded by main_callback; missing means callback failed or was skipped
298 |         typer.secho("Error: Application settings not loaded. Use --env-file or set environment variables.", fg=typer.colors.RED)
299 |         raise typer.Exit(code=1)
300 | 
301 |     typer.secho(f"Executing deep research task: {research_task}", fg=typer.colors.GREEN)
302 |     try:
303 |         result = asyncio.run(_run_deep_research_logic_cli(research_task, max_parallel_browsers, cli_state.settings))  # sync command drives the async implementation
304 |         typer.secho("\n--- Deep Research Final Report ---", fg=typer.colors.BLUE, bold=True)
305 |         print(result)
306 |     except Exception as e:
307 |         typer.secho(f"CLI command failed: {e}", fg=typer.colors.RED)
308 |         logger.error(f"CLI run_deep_research command failed: {e}\n{traceback.format_exc()}")
309 |         raise typer.Exit(code=1)
310 |
311 | if __name__ == "__main__":
312 |     # This allows running `python src/mcp_server_browser_use/cli.py ...`
313 |     # Set a default log level if run directly for dev purposes, can be overridden by CLI args
314 |     if not os.getenv("MCP_SERVER_LOGGING_LEVEL"): # Check if already set
315 |         os.environ["MCP_SERVER_LOGGING_LEVEL"] = "DEBUG"
316 |     if not os.getenv("MCP_RESEARCH_TOOL_SAVE_DIR"): # Ensure mandatory var is set for local dev
317 |         print("Warning: MCP_RESEARCH_TOOL_SAVE_DIR not set. Defaulting to './tmp/deep_research_cli_default' for this run.", file=sys.stderr)
318 |         os.environ["MCP_RESEARCH_TOOL_SAVE_DIR"] = "./tmp/deep_research_cli_default"
319 | 
320 |     app()  # hand control to Typer
321 |
```
--------------------------------------------------------------------------------
/src/mcp_server_browser_use/server.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import json
3 | import logging
4 | import os
5 | import traceback
6 | import uuid
7 | from typing import Any, Dict, Optional
8 | from pathlib import Path
9 |
10 |
11 | from .config import settings # Import global AppSettings instance
12 |
13 | # Configure logging using settings
14 | log_level_str = settings.server.logging_level.upper()
15 | numeric_level = getattr(logging, log_level_str, logging.INFO)  # unknown level names fall back to INFO
16 | 
17 | # Remove any existing handlers from the root logger to avoid duplicate messages
18 | # if basicConfig was called elsewhere or by a library.
19 | for handler in logging.root.handlers[:]:  # iterate over a copy while mutating the handler list
20 |     logging.root.removeHandler(handler)
21 | 
22 | logging.basicConfig(
23 |     level=numeric_level,
24 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
25 |     filename=settings.server.log_file if settings.server.log_file else None,
26 |     filemode="a" if settings.server.log_file else None, # only use filemode if filename is set
27 |     force=True # Override any previous basicConfig
28 | )
29 | 
30 | logger = logging.getLogger("mcp_server_browser_use")  # module-level server logger
31 | # Prevent log propagation if other loggers are configured higher up
32 | # logging.getLogger().propagate = False # This might be too aggressive, let's rely on basicConfig force
33 |
34 | from browser_use.browser.browser import BrowserConfig
35 | from mcp.server.fastmcp import Context, FastMCP
36 |
37 | # Import from _internal
38 | from ._internal.agent.browser_use.browser_use_agent import BrowserUseAgent
39 | from ._internal.agent.deep_research.deep_research_agent import DeepResearchAgent
40 | from ._internal.browser.custom_browser import CustomBrowser
41 | from ._internal.browser.custom_context import (
42 | CustomBrowserContext,
43 | CustomBrowserContextConfig,
44 | )
45 | from ._internal.controller.custom_controller import CustomController
46 | from ._internal.utils import llm_provider as internal_llm_provider # aliased
47 |
48 | from browser_use.agent.views import (
49 | AgentHistoryList,
50 | )
51 |
52 | # Shared resources for MCP_BROWSER_KEEP_OPEN
53 | shared_browser_instance: Optional[CustomBrowser] = None  # reused across tool calls when keep_open is set
54 | shared_context_instance: Optional[CustomBrowserContext] = None  # reused alongside the shared browser
55 | shared_controller_instance: Optional[CustomController] = None # Controller might also be shared
56 | resource_lock = asyncio.Lock()  # serializes creation/reuse of the shared instances above
57 |
58 |
59 | async def get_controller(ask_human_callback: Optional[Any] = None) -> CustomController:
60 |     """Gets or creates a shared controller instance if keep_open is true, or a new one."""
61 |     global shared_controller_instance
62 |     if settings.browser.keep_open and shared_controller_instance:
63 |         # Potentially update callback if it can change per call, though usually fixed for server
64 |         return shared_controller_instance  # NOTE(review): reused instance keeps its original callback
65 | 
66 |     controller = CustomController(ask_assistant_callback=ask_human_callback)
67 |     if settings.server.mcp_config:
68 |         try:
69 |             mcp_dict_config = settings.server.mcp_config
70 |             if isinstance(settings.server.mcp_config, str): # if passed as JSON string
71 |                 mcp_dict_config = json.loads(settings.server.mcp_config)
72 |             await controller.setup_mcp_client(mcp_dict_config)
73 |         except Exception as e:
74 |             logger.error(f"Failed to setup MCP client for controller: {e}")  # best-effort: controller is still returned without an MCP client
75 | 
76 |     if settings.browser.keep_open:
77 |         shared_controller_instance = controller  # cache for subsequent tool calls
78 |     return controller
79 |
80 |
81 | async def get_browser_and_context() -> tuple[CustomBrowser, CustomBrowserContext]:
82 |     """
83 |     Manages creation/reuse of CustomBrowser and CustomBrowserContext
84 |     based on settings.browser.keep_open and settings.browser.use_own_browser.
85 |     """
86 |     global shared_browser_instance, shared_context_instance
87 | 
88 |     current_browser: Optional[CustomBrowser] = None
89 |     current_context: Optional[CustomBrowserContext] = None
90 | 
91 |     agent_headless_override = settings.agent_tool.headless
92 |     browser_headless = agent_headless_override if agent_headless_override is not None else settings.browser.headless  # agent_tool override wins over general browser setting
93 | 
94 |     agent_disable_security_override = settings.agent_tool.disable_security
95 |     browser_disable_security = agent_disable_security_override if agent_disable_security_override is not None else settings.browser.disable_security
96 | 
97 |     if settings.browser.use_own_browser and settings.browser.cdp_url:  # NOTE(review): CDP connections are created per call, never cached even with keep_open
98 |         logger.info(f"Connecting to own browser via CDP: {settings.browser.cdp_url}")
99 |         browser_cfg = BrowserConfig(
100 |             cdp_url=settings.browser.cdp_url,
101 |             wss_url=settings.browser.wss_url,
102 |             user_data_dir=settings.browser.user_data_dir, # Useful for CDP
103 |             # Headless, binary_path etc. are controlled by the user-launched browser
104 |         )
105 |         current_browser = CustomBrowser(config=browser_cfg)
106 |         # For CDP, context config is minimal, trace/recording might not apply or be harder to manage
107 |         context_cfg = CustomBrowserContextConfig(
108 |             trace_path=settings.browser.trace_path,
109 |             save_downloads_path=settings.paths.downloads,
110 |             save_recording_path=settings.agent_tool.save_recording_path if settings.agent_tool.enable_recording else None,
111 |         )
112 |         current_context = await current_browser.new_context(config=context_cfg)
113 | 
114 |     elif settings.browser.keep_open:
115 |         if shared_browser_instance and shared_context_instance:
116 |             logger.info("Reusing shared browser and context.")
117 |             # Ensure browser is still connected
118 |             if not shared_browser_instance.is_connected():
119 |                 logger.warning("Shared browser was disconnected. Recreating.")
120 |                 if shared_context_instance: await shared_context_instance.close() # Close old context too
121 |                 await shared_browser_instance.close() # Close browser after context
122 |                 shared_browser_instance = None  # force recreation below
123 |                 shared_context_instance = None
124 |             else:
125 |                 current_browser = shared_browser_instance
126 |                 # For shared browser, we might want a new context or reuse.
127 |                 # For simplicity, let's reuse the context if keep_open is true.
128 |                 # If new context per call is needed, this logic would change.
129 |                 current_context = shared_context_instance
130 | 
131 |         if not current_browser or not current_context : # If shared instances were not valid or not yet created
132 |             logger.info("Creating new shared browser and context.")
133 |             browser_cfg = BrowserConfig(
134 |                 headless=browser_headless,
135 |                 disable_security=browser_disable_security,
136 |                 browser_binary_path=settings.browser.binary_path,
137 |                 user_data_dir=settings.browser.user_data_dir,
138 |                 window_width=settings.browser.window_width,
139 |                 window_height=settings.browser.window_height,
140 |             )
141 |             shared_browser_instance = CustomBrowser(config=browser_cfg)
142 |             context_cfg = CustomBrowserContextConfig(
143 |                 trace_path=settings.browser.trace_path,
144 |                 save_downloads_path=settings.paths.downloads,
145 |                 save_recording_path=settings.agent_tool.save_recording_path if settings.agent_tool.enable_recording else None,
146 |                 force_new_context=False # Important for shared context
147 |             )
148 |             shared_context_instance = await shared_browser_instance.new_context(config=context_cfg)
149 |             current_browser = shared_browser_instance
150 |             current_context = shared_context_instance
151 |     else: # Create new resources per call (not using own browser, not keeping open)
152 |         logger.info("Creating new browser and context for this call.")
153 |         browser_cfg = BrowserConfig(
154 |             headless=browser_headless,
155 |             disable_security=browser_disable_security,
156 |             browser_binary_path=settings.browser.binary_path,
157 |             user_data_dir=settings.browser.user_data_dir,
158 |             window_width=settings.browser.window_width,
159 |             window_height=settings.browser.window_height,
160 |         )
161 |         current_browser = CustomBrowser(config=browser_cfg)
162 |         context_cfg = CustomBrowserContextConfig(
163 |             trace_path=settings.browser.trace_path,
164 |             save_downloads_path=settings.paths.downloads,
165 |             save_recording_path=settings.agent_tool.save_recording_path if settings.agent_tool.enable_recording else None,
166 |             force_new_context=True
167 |         )
168 |         current_context = await current_browser.new_context(config=context_cfg)
169 | 
170 |     if not current_browser or not current_context:
171 |         raise RuntimeError("Failed to initialize browser or context")
172 | 
173 |     return current_browser, current_context
174 |
175 |
176 | def serve() -> FastMCP:
177 | server = FastMCP("mcp_server_browser_use")
178 |
179 | @server.tool()
180 | async def run_browser_agent(ctx: Context, task: str) -> str:
181 | logger.info(f"Received run_browser_agent task: {task[:100]}...")
182 | agent_task_id = str(uuid.uuid4())
183 | final_result = "Error: Agent execution failed."
184 |
185 | browser_instance: Optional[CustomBrowser] = None
186 | context_instance: Optional[CustomBrowserContext] = None
187 | controller_instance: Optional[CustomController] = None
188 |
189 | try:
190 | async with resource_lock: # Protect shared resource access/creation
191 | browser_instance, context_instance = await get_browser_and_context()
192 | # For server, ask_human_callback is likely not interactive, can be None or a placeholder
193 | controller_instance = await get_controller(ask_human_callback=None)
194 |
195 | if not browser_instance or not context_instance or not controller_instance:
196 | raise RuntimeError("Failed to acquire browser resources or controller.")
197 |
198 | main_llm_config = settings.get_llm_config()
199 | main_llm = internal_llm_provider.get_llm_model(**main_llm_config)
200 |
201 | planner_llm = None
202 | if settings.llm.planner_provider and settings.llm.planner_model_name:
203 | planner_llm_config = settings.get_llm_config(is_planner=True)
204 | planner_llm = internal_llm_provider.get_llm_model(**planner_llm_config)
205 |
206 | agent_history_json_file = None
207 | task_history_base_path = settings.agent_tool.history_path
208 |
209 | if task_history_base_path:
210 | task_specific_history_dir = Path(task_history_base_path) / agent_task_id
211 | task_specific_history_dir.mkdir(parents=True, exist_ok=True)
212 | agent_history_json_file = str(task_specific_history_dir / f"{agent_task_id}.json")
213 | logger.info(f"Agent history will be saved to: {agent_history_json_file}")
214 |
215 | agent_instance = BrowserUseAgent(
216 | task=task,
217 | llm=main_llm,
218 | browser=browser_instance,
219 | browser_context=context_instance,
220 | controller=controller_instance,
221 | planner_llm=planner_llm,
222 | max_actions_per_step=settings.agent_tool.max_actions_per_step,
223 | use_vision=settings.agent_tool.use_vision,
224 | )
225 |
226 | history: AgentHistoryList = await agent_instance.run(max_steps=settings.agent_tool.max_steps)
227 |
228 | if agent_history_json_file:
229 | agent_instance.save_history(agent_history_json_file)
230 |
231 | final_result = history.final_result() or "Agent finished without a final result."
232 | logger.info(f"Agent task completed. Result: {final_result[:100]}...")
233 |
234 | except Exception as e:
235 | logger.error(f"Error in run_browser_agent: {e}\n{traceback.format_exc()}")
236 | final_result = f"Error: {e}"
237 | finally:
238 | if not settings.browser.keep_open and not settings.browser.use_own_browser:
239 | logger.info("Closing browser resources for this call.")
240 | if context_instance:
241 | await context_instance.close()
242 | if browser_instance:
243 | await browser_instance.close()
244 | if controller_instance: # Close controller only if not shared
245 | await controller_instance.close_mcp_client()
246 | elif settings.browser.use_own_browser: # Own browser, only close controller if not shared
247 | if controller_instance and not (settings.browser.keep_open and controller_instance == shared_controller_instance):
248 | await controller_instance.close_mcp_client()
249 | return final_result
250 |
251 | @server.tool()
252 | async def run_deep_research(
253 | ctx: Context,
254 | research_task: str,
255 | max_parallel_browsers_override: Optional[int] = None,
256 | ) -> str:
257 | logger.info(f"Received run_deep_research task: {research_task[:100]}...")
258 | task_id = str(uuid.uuid4()) # This task_id is used for the sub-directory name
259 | report_content = "Error: Deep research failed."
260 |
261 | try:
262 | main_llm_config = settings.get_llm_config() # Deep research uses main LLM config
263 | research_llm = internal_llm_provider.get_llm_model(**main_llm_config)
264 |
265 | # Prepare browser_config dict for DeepResearchAgent's sub-agents
266 | dr_browser_cfg = {
267 | "headless": settings.browser.headless, # Use general browser headless for sub-tasks
268 | "disable_security": settings.browser.disable_security,
269 | "browser_binary_path": settings.browser.binary_path,
270 | "user_data_dir": settings.browser.user_data_dir,
271 | "window_width": settings.browser.window_width,
272 | "window_height": settings.browser.window_height,
273 | "trace_path": settings.browser.trace_path, # For sub-agent traces
274 | "save_downloads_path": settings.paths.downloads, # For sub-agent downloads
275 | }
276 | if settings.browser.use_own_browser and settings.browser.cdp_url:
277 | # If main browser is CDP, sub-agents should also use it
278 | dr_browser_cfg["cdp_url"] = settings.browser.cdp_url
279 | dr_browser_cfg["wss_url"] = settings.browser.wss_url
280 |
281 | mcp_server_config_for_agent = None
282 | if settings.server.mcp_config:
283 | mcp_server_config_for_agent = settings.server.mcp_config
284 | if isinstance(settings.server.mcp_config, str):
285 | mcp_server_config_for_agent = json.loads(settings.server.mcp_config)
286 |
287 | agent_instance = DeepResearchAgent(
288 | llm=research_llm,
289 | browser_config=dr_browser_cfg,
290 | mcp_server_config=mcp_server_config_for_agent,
291 | )
292 |
293 | current_max_parallel_browsers = max_parallel_browsers_override if max_parallel_browsers_override is not None else settings.research_tool.max_parallel_browsers
294 |
295 | # Check if save_dir is provided, otherwise use in-memory approach
296 | save_dir_for_this_task = None
297 | if settings.research_tool.save_dir:
298 | # If save_dir is provided, construct the full save directory path for this specific task
299 | save_dir_for_this_task = str(Path(settings.research_tool.save_dir) / task_id)
300 | logger.info(f"Deep research save directory for this task: {save_dir_for_this_task}")
301 | else:
302 | logger.info("No save_dir configured. Deep research will operate in memory-only mode.")
303 |
304 | logger.info(f"Using max_parallel_browsers: {current_max_parallel_browsers}")
305 |
306 | result_dict = await agent_instance.run(
307 | topic=research_task,
308 | save_dir=save_dir_for_this_task, # Can be None now
309 | task_id=task_id, # Pass the generated task_id
310 | max_parallel_browsers=current_max_parallel_browsers
311 | )
312 |
313 | # Handle the result based on if files were saved or not
314 | if save_dir_for_this_task and result_dict.get("report_file_path") and Path(result_dict["report_file_path"]).exists():
315 | with open(result_dict["report_file_path"], "r", encoding="utf-8") as f:
316 | markdown_content = f.read()
317 | report_content = f"Deep research report generated successfully at {result_dict['report_file_path']}\n\n{markdown_content}"
318 | logger.info(f"Deep research task {task_id} completed. Report at {result_dict['report_file_path']}")
319 | elif result_dict.get("status") == "completed" and result_dict.get("final_report"):
320 | report_content = f"Deep research completed. Report content:\n\n{result_dict['final_report']}"
321 | if result_dict.get("report_file_path"):
322 | report_content += f"\n(Expected report file at: {result_dict['report_file_path']})"
323 | logger.info(f"Deep research task {task_id} completed. Report content retrieved directly.")
324 | else:
325 | report_content = f"Deep research task {task_id} result: {result_dict}. Report file not found or content not available."
326 | logger.warning(report_content)
327 |
328 |
329 | except Exception as e:
330 | logger.error(f"Error in run_deep_research: {e}\n{traceback.format_exc()}")
331 | report_content = f"Error: {e}"
332 |
333 | return report_content
334 |
335 | return server
336 |
# Build the module-level FastMCP server once at import time; named
# 'server_instance' (not 'server') to avoid clashing with 'settings.server'.
server_instance = serve()
338 |
def main():
    """Entry point: log the effective configuration and start the MCP server.

    Returns early (without starting the server) if reading the configured
    settings raises.
    """
    logger.info("Starting MCP server for browser-use...")

    try:
        # Reading settings may raise on bad configuration; report and bail.
        configured_save_dir = settings.research_tool.save_dir
        if configured_save_dir:
            logger.info(f"Research tool save directory configured: {configured_save_dir}")
        else:
            logger.info("Research tool save directory not configured. Deep research will operate in memory-only mode.")
    except Exception as err:
        logger.error(f"Configuration error: {err}")
        return  # do not start the server with broken configuration

    logger.info(f"Loaded settings with LLM provider: {settings.llm.provider}, Model: {settings.llm.model_name}")
    logger.info(f"Browser keep_open: {settings.browser.keep_open}, Use own browser: {settings.browser.use_own_browser}")
    if settings.browser.use_own_browser:
        logger.info(f"Connecting to own browser via CDP: {settings.browser.cdp_url}")

    # Blocks until the MCP server shuts down.
    server_instance.run()
356 |
# Script entry guard: importing this module must not start the server.
if __name__ == "__main__":
    main()
359 |
```