This is page 6 of 25. Use http://codebase.md/beehiveinnovations/gemini-mcp-server?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .claude
│ ├── commands
│ │ └── fix-github-issue.md
│ └── settings.json
├── .coveragerc
├── .dockerignore
├── .env.example
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── ISSUE_TEMPLATE
│ │ ├── bug_report.yml
│ │ ├── config.yml
│ │ ├── documentation.yml
│ │ ├── feature_request.yml
│ │ └── tool_addition.yml
│ ├── pull_request_template.md
│ └── workflows
│ ├── docker-pr.yml
│ ├── docker-release.yml
│ ├── semantic-pr.yml
│ ├── semantic-release.yml
│ └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── AGENTS.md
├── CHANGELOG.md
├── claude_config_example.json
├── CLAUDE.md
├── clink
│ ├── __init__.py
│ ├── agents
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── claude.py
│ │ ├── codex.py
│ │ └── gemini.py
│ ├── constants.py
│ ├── models.py
│ ├── parsers
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── claude.py
│ │ ├── codex.py
│ │ └── gemini.py
│ └── registry.py
├── code_quality_checks.ps1
├── code_quality_checks.sh
├── communication_simulator_test.py
├── conf
│ ├── __init__.py
│ ├── azure_models.json
│ ├── cli_clients
│ │ ├── claude.json
│ │ ├── codex.json
│ │ └── gemini.json
│ ├── custom_models.json
│ ├── dial_models.json
│ ├── gemini_models.json
│ ├── openai_models.json
│ ├── openrouter_models.json
│ └── xai_models.json
├── config.py
├── docker
│ ├── README.md
│ └── scripts
│ ├── build.ps1
│ ├── build.sh
│ ├── deploy.ps1
│ ├── deploy.sh
│ └── healthcheck.py
├── docker-compose.yml
├── Dockerfile
├── docs
│ ├── adding_providers.md
│ ├── adding_tools.md
│ ├── advanced-usage.md
│ ├── ai_banter.md
│ ├── ai-collaboration.md
│ ├── azure_openai.md
│ ├── configuration.md
│ ├── context-revival.md
│ ├── contributions.md
│ ├── custom_models.md
│ ├── docker-deployment.md
│ ├── gemini-setup.md
│ ├── getting-started.md
│ ├── index.md
│ ├── locale-configuration.md
│ ├── logging.md
│ ├── model_ranking.md
│ ├── testing.md
│ ├── tools
│ │ ├── analyze.md
│ │ ├── apilookup.md
│ │ ├── challenge.md
│ │ ├── chat.md
│ │ ├── clink.md
│ │ ├── codereview.md
│ │ ├── consensus.md
│ │ ├── debug.md
│ │ ├── docgen.md
│ │ ├── listmodels.md
│ │ ├── planner.md
│ │ ├── precommit.md
│ │ ├── refactor.md
│ │ ├── secaudit.md
│ │ ├── testgen.md
│ │ ├── thinkdeep.md
│ │ ├── tracer.md
│ │ └── version.md
│ ├── troubleshooting.md
│ ├── vcr-testing.md
│ └── wsl-setup.md
├── examples
│ ├── claude_config_macos.json
│ └── claude_config_wsl.json
├── LICENSE
├── providers
│ ├── __init__.py
│ ├── azure_openai.py
│ ├── base.py
│ ├── custom.py
│ ├── dial.py
│ ├── gemini.py
│ ├── openai_compatible.py
│ ├── openai.py
│ ├── openrouter.py
│ ├── registries
│ │ ├── __init__.py
│ │ ├── azure.py
│ │ ├── base.py
│ │ ├── custom.py
│ │ ├── dial.py
│ │ ├── gemini.py
│ │ ├── openai.py
│ │ ├── openrouter.py
│ │ └── xai.py
│ ├── registry_provider_mixin.py
│ ├── registry.py
│ ├── shared
│ │ ├── __init__.py
│ │ ├── model_capabilities.py
│ │ ├── model_response.py
│ │ ├── provider_type.py
│ │ └── temperature.py
│ └── xai.py
├── pyproject.toml
├── pytest.ini
├── README.md
├── requirements-dev.txt
├── requirements.txt
├── run_integration_tests.ps1
├── run_integration_tests.sh
├── run-server.ps1
├── run-server.sh
├── scripts
│ └── sync_version.py
├── server.py
├── simulator_tests
│ ├── __init__.py
│ ├── base_test.py
│ ├── conversation_base_test.py
│ ├── log_utils.py
│ ├── test_analyze_validation.py
│ ├── test_basic_conversation.py
│ ├── test_chat_simple_validation.py
│ ├── test_codereview_validation.py
│ ├── test_consensus_conversation.py
│ ├── test_consensus_three_models.py
│ ├── test_consensus_workflow_accurate.py
│ ├── test_content_validation.py
│ ├── test_conversation_chain_validation.py
│ ├── test_cross_tool_comprehensive.py
│ ├── test_cross_tool_continuation.py
│ ├── test_debug_certain_confidence.py
│ ├── test_debug_validation.py
│ ├── test_line_number_validation.py
│ ├── test_logs_validation.py
│ ├── test_model_thinking_config.py
│ ├── test_o3_model_selection.py
│ ├── test_o3_pro_expensive.py
│ ├── test_ollama_custom_url.py
│ ├── test_openrouter_fallback.py
│ ├── test_openrouter_models.py
│ ├── test_per_tool_deduplication.py
│ ├── test_planner_continuation_history.py
│ ├── test_planner_validation_old.py
│ ├── test_planner_validation.py
│ ├── test_precommitworkflow_validation.py
│ ├── test_prompt_size_limit_bug.py
│ ├── test_refactor_validation.py
│ ├── test_secaudit_validation.py
│ ├── test_testgen_validation.py
│ ├── test_thinkdeep_validation.py
│ ├── test_token_allocation_validation.py
│ ├── test_vision_capability.py
│ └── test_xai_models.py
├── systemprompts
│ ├── __init__.py
│ ├── analyze_prompt.py
│ ├── chat_prompt.py
│ ├── clink
│ │ ├── codex_codereviewer.txt
│ │ ├── default_codereviewer.txt
│ │ ├── default_planner.txt
│ │ └── default.txt
│ ├── codereview_prompt.py
│ ├── consensus_prompt.py
│ ├── debug_prompt.py
│ ├── docgen_prompt.py
│ ├── generate_code_prompt.py
│ ├── planner_prompt.py
│ ├── precommit_prompt.py
│ ├── refactor_prompt.py
│ ├── secaudit_prompt.py
│ ├── testgen_prompt.py
│ ├── thinkdeep_prompt.py
│ └── tracer_prompt.py
├── tests
│ ├── __init__.py
│ ├── CASSETTE_MAINTENANCE.md
│ ├── conftest.py
│ ├── gemini_cassettes
│ │ ├── chat_codegen
│ │ │ └── gemini25_pro_calculator
│ │ │ └── mldev.json
│ │ ├── chat_cross
│ │ │ └── step1_gemini25_flash_number
│ │ │ └── mldev.json
│ │ └── consensus
│ │ └── step2_gemini25_flash_against
│ │ └── mldev.json
│ ├── http_transport_recorder.py
│ ├── mock_helpers.py
│ ├── openai_cassettes
│ │ ├── chat_cross_step2_gpt5_reminder.json
│ │ ├── chat_gpt5_continuation.json
│ │ ├── chat_gpt5_moon_distance.json
│ │ ├── consensus_step1_gpt5_for.json
│ │ └── o3_pro_basic_math.json
│ ├── pii_sanitizer.py
│ ├── sanitize_cassettes.py
│ ├── test_alias_target_restrictions.py
│ ├── test_auto_mode_comprehensive.py
│ ├── test_auto_mode_custom_provider_only.py
│ ├── test_auto_mode_model_listing.py
│ ├── test_auto_mode_provider_selection.py
│ ├── test_auto_mode.py
│ ├── test_auto_model_planner_fix.py
│ ├── test_azure_openai_provider.py
│ ├── test_buggy_behavior_prevention.py
│ ├── test_cassette_semantic_matching.py
│ ├── test_challenge.py
│ ├── test_chat_codegen_integration.py
│ ├── test_chat_cross_model_continuation.py
│ ├── test_chat_openai_integration.py
│ ├── test_chat_simple.py
│ ├── test_clink_claude_agent.py
│ ├── test_clink_claude_parser.py
│ ├── test_clink_codex_agent.py
│ ├── test_clink_gemini_agent.py
│ ├── test_clink_gemini_parser.py
│ ├── test_clink_integration.py
│ ├── test_clink_parsers.py
│ ├── test_clink_tool.py
│ ├── test_collaboration.py
│ ├── test_config.py
│ ├── test_consensus_integration.py
│ ├── test_consensus_schema.py
│ ├── test_consensus.py
│ ├── test_conversation_continuation_integration.py
│ ├── test_conversation_field_mapping.py
│ ├── test_conversation_file_features.py
│ ├── test_conversation_memory.py
│ ├── test_conversation_missing_files.py
│ ├── test_custom_openai_temperature_fix.py
│ ├── test_custom_provider.py
│ ├── test_debug.py
│ ├── test_deploy_scripts.py
│ ├── test_dial_provider.py
│ ├── test_directory_expansion_tracking.py
│ ├── test_disabled_tools.py
│ ├── test_docker_claude_desktop_integration.py
│ ├── test_docker_config_complete.py
│ ├── test_docker_healthcheck.py
│ ├── test_docker_implementation.py
│ ├── test_docker_mcp_validation.py
│ ├── test_docker_security.py
│ ├── test_docker_volume_persistence.py
│ ├── test_file_protection.py
│ ├── test_gemini_token_usage.py
│ ├── test_image_support_integration.py
│ ├── test_image_validation.py
│ ├── test_integration_utf8.py
│ ├── test_intelligent_fallback.py
│ ├── test_issue_245_simple.py
│ ├── test_large_prompt_handling.py
│ ├── test_line_numbers_integration.py
│ ├── test_listmodels_restrictions.py
│ ├── test_listmodels.py
│ ├── test_mcp_error_handling.py
│ ├── test_model_enumeration.py
│ ├── test_model_metadata_continuation.py
│ ├── test_model_resolution_bug.py
│ ├── test_model_restrictions.py
│ ├── test_o3_pro_output_text_fix.py
│ ├── test_o3_temperature_fix_simple.py
│ ├── test_openai_compatible_token_usage.py
│ ├── test_openai_provider.py
│ ├── test_openrouter_provider.py
│ ├── test_openrouter_registry.py
│ ├── test_parse_model_option.py
│ ├── test_per_tool_model_defaults.py
│ ├── test_pii_sanitizer.py
│ ├── test_pip_detection_fix.py
│ ├── test_planner.py
│ ├── test_precommit_workflow.py
│ ├── test_prompt_regression.py
│ ├── test_prompt_size_limit_bug_fix.py
│ ├── test_provider_retry_logic.py
│ ├── test_provider_routing_bugs.py
│ ├── test_provider_utf8.py
│ ├── test_providers.py
│ ├── test_rate_limit_patterns.py
│ ├── test_refactor.py
│ ├── test_secaudit.py
│ ├── test_server.py
│ ├── test_supported_models_aliases.py
│ ├── test_thinking_modes.py
│ ├── test_tools.py
│ ├── test_tracer.py
│ ├── test_utf8_localization.py
│ ├── test_utils.py
│ ├── test_uvx_resource_packaging.py
│ ├── test_uvx_support.py
│ ├── test_workflow_file_embedding.py
│ ├── test_workflow_metadata.py
│ ├── test_workflow_prompt_size_validation_simple.py
│ ├── test_workflow_utf8.py
│ ├── test_xai_provider.py
│ ├── transport_helpers.py
│ └── triangle.png
├── tools
│ ├── __init__.py
│ ├── analyze.py
│ ├── apilookup.py
│ ├── challenge.py
│ ├── chat.py
│ ├── clink.py
│ ├── codereview.py
│ ├── consensus.py
│ ├── debug.py
│ ├── docgen.py
│ ├── listmodels.py
│ ├── models.py
│ ├── planner.py
│ ├── precommit.py
│ ├── refactor.py
│ ├── secaudit.py
│ ├── shared
│ │ ├── __init__.py
│ │ ├── base_models.py
│ │ ├── base_tool.py
│ │ ├── exceptions.py
│ │ └── schema_builders.py
│ ├── simple
│ │ ├── __init__.py
│ │ └── base.py
│ ├── testgen.py
│ ├── thinkdeep.py
│ ├── tracer.py
│ ├── version.py
│ └── workflow
│ ├── __init__.py
│ ├── base.py
│ ├── schema_builders.py
│ └── workflow_mixin.py
├── utils
│ ├── __init__.py
│ ├── client_info.py
│ ├── conversation_memory.py
│ ├── env.py
│ ├── file_types.py
│ ├── file_utils.py
│ ├── image_utils.py
│ ├── model_context.py
│ ├── model_restrictions.py
│ ├── security_config.py
│ ├── storage_backend.py
│ └── token_utils.py
└── zen-mcp-server
```
# Files
--------------------------------------------------------------------------------
/code_quality_checks.ps1:
--------------------------------------------------------------------------------
```
1 | <#
2 | .SYNOPSIS
3 | Code quality checks script for Zen MCP server on Windows.
4 |
5 | .DESCRIPTION
6 | This PowerShell script performs code quality checks for the Zen MCP server project:
7 | - Runs static analysis and linting tools on the codebase
8 | - Ensures code style compliance and detects potential issues
9 | - Can be integrated into CI/CD pipelines or used locally before commits
10 |
 11 | .PARAMETER SkipLinting
 12 | Skips the linting and formatting checks (ruff, black, isort).
 13 |
 14 | .PARAMETER SkipTests
 15 | Skips the unit test suite.
 16 |
 17 | .PARAMETER VerboseOutput
 18 | Enables detailed output during code quality checks.
 19 |
 20 | .EXAMPLE
 21 | .\code_quality_checks.ps1
 22 | Runs all code quality checks; use -SkipLinting, -SkipTests, or -VerboseOutput to adjust.
 23 |
24 | .NOTES
25 | Project Author : BeehiveInnovations
26 | Script Author : GiGiDKR (https://github.com/GiGiDKR)
27 | Date : 07-05-2025
28 | Version : See project documentation
29 | References : https://github.com/BeehiveInnovations/zen-mcp-server
30 | #>
31 | #Requires -Version 5.1
32 | [CmdletBinding()]
33 | param(
34 | [switch]$SkipTests,
35 | [switch]$SkipLinting,
36 | [switch]$VerboseOutput
37 | )
38 |
39 | # Set error action preference
40 | $ErrorActionPreference = "Stop"
41 |
42 | # Colors for output
43 | function Write-ColorText {
44 | param(
45 | [Parameter(Mandatory)]
46 | [string]$Text,
47 | [string]$Color = "White"
48 | )
49 | Write-Host $Text -ForegroundColor $Color
50 | }
51 |
52 | function Write-Emoji {
53 | param(
54 | [Parameter(Mandatory)]
55 | [string]$Emoji,
56 | [Parameter(Mandatory)]
57 | [string]$Text,
58 | [string]$Color = "White"
59 | )
60 | Write-Host "$Emoji " -NoNewline
61 | Write-ColorText $Text -Color $Color
62 | }
63 |
64 | Write-Emoji "🔍" "Running Code Quality Checks for Zen MCP Server" -Color Cyan
65 | Write-ColorText "=================================================" -Color Cyan
66 |
67 | # Determine Python command
68 | $pythonCmd = $null
69 | $pipCmd = $null
70 |
71 | if (Test-Path ".zen_venv") {
72 | if ($IsWindows -or $env:OS -eq "Windows_NT") {
73 | if (Test-Path ".zen_venv\Scripts\python.exe") {
74 | $pythonCmd = ".zen_venv\Scripts\python.exe"
75 | $pipCmd = ".zen_venv\Scripts\pip.exe"
76 | }
77 | } else {
78 | if (Test-Path ".zen_venv/bin/python") {
79 | $pythonCmd = ".zen_venv/bin/python"
80 | $pipCmd = ".zen_venv/bin/pip"
81 | }
82 | }
83 |
84 | if ($pythonCmd) {
85 | Write-Emoji "✅" "Using venv" -Color Green
86 | }
87 | } elseif ($env:VIRTUAL_ENV) {
88 | $pythonCmd = "python"
89 | $pipCmd = "pip"
90 | Write-Emoji "✅" "Using activated virtual environment: $env:VIRTUAL_ENV" -Color Green
91 | } else {
92 | Write-Emoji "❌" "No virtual environment found!" -Color Red
93 | Write-ColorText "Please run: .\run-server.ps1 first to set up the environment" -Color Yellow
94 | exit 1
95 | }
96 |
97 | Write-Host ""
98 |
99 | # Check and install dev dependencies if needed
100 | Write-Emoji "🔍" "Checking development dependencies..." -Color Cyan
101 | $devDepsNeeded = $false
102 |
103 | # List of dev tools to check
104 | $devTools = @("ruff", "black", "isort", "pytest")
105 |
106 | foreach ($tool in $devTools) {
107 | $toolFound = $false
108 |
109 | # Check in venv
110 | if ($IsWindows -or $env:OS -eq "Windows_NT") {
111 | if (Test-Path ".zen_venv\Scripts\$tool.exe") {
112 | $toolFound = $true
113 | }
114 | } else {
115 | if (Test-Path ".zen_venv/bin/$tool") {
116 | $toolFound = $true
117 | }
118 | }
119 |
120 | # Check in PATH
121 | if (!$toolFound) {
122 | try {
123 | $null = Get-Command $tool -ErrorAction Stop
124 | $toolFound = $true
125 | } catch {
126 | # Tool not found
127 | }
128 | }
129 |
130 | if (!$toolFound) {
131 | $devDepsNeeded = $true
132 | break
133 | }
134 | }
135 |
136 | if ($devDepsNeeded) {
137 | Write-Emoji "📦" "Installing development dependencies..." -Color Yellow
138 | try {
139 | & $pipCmd install -q -r requirements-dev.txt
140 | if ($LASTEXITCODE -ne 0) {
141 | throw "Failed to install dev dependencies"
142 | }
143 | Write-Emoji "✅" "Development dependencies installed" -Color Green
144 | } catch {
145 | Write-Emoji "❌" "Failed to install development dependencies" -Color Red
146 | Write-ColorText "Error: $_" -Color Red
147 | exit 1
148 | }
149 | } else {
150 | Write-Emoji "✅" "Development dependencies already installed" -Color Green
151 | }
152 |
153 | # Set tool paths
154 | if ($IsWindows -or $env:OS -eq "Windows_NT") {
155 | $ruffCmd = if (Test-Path ".zen_venv\Scripts\ruff.exe") { ".zen_venv\Scripts\ruff.exe" } else { "ruff" }
156 | $blackCmd = if (Test-Path ".zen_venv\Scripts\black.exe") { ".zen_venv\Scripts\black.exe" } else { "black" }
157 | $isortCmd = if (Test-Path ".zen_venv\Scripts\isort.exe") { ".zen_venv\Scripts\isort.exe" } else { "isort" }
158 | $pytestCmd = if (Test-Path ".zen_venv\Scripts\pytest.exe") { ".zen_venv\Scripts\pytest.exe" } else { "pytest" }
159 | } else {
160 | $ruffCmd = if (Test-Path ".zen_venv/bin/ruff") { ".zen_venv/bin/ruff" } else { "ruff" }
161 | $blackCmd = if (Test-Path ".zen_venv/bin/black") { ".zen_venv/bin/black" } else { "black" }
162 | $isortCmd = if (Test-Path ".zen_venv/bin/isort") { ".zen_venv/bin/isort" } else { "isort" }
163 | $pytestCmd = if (Test-Path ".zen_venv/bin/pytest") { ".zen_venv/bin/pytest" } else { "pytest" }
164 | }
165 |
166 | Write-Host ""
167 |
168 | # Step 1: Linting and Formatting
169 | if (!$SkipLinting) {
170 | Write-Emoji "📋" "Step 1: Running Linting and Formatting Checks" -Color Cyan
171 | Write-ColorText "--------------------------------------------------" -Color Cyan
172 |
173 | try {
174 | Write-Emoji "🔧" "Running ruff linting with auto-fix..." -Color Yellow
175 | & $ruffCmd check --fix --exclude test_simulation_files --exclude .zen_venv
176 | if ($LASTEXITCODE -ne 0) {
177 | throw "Ruff linting failed"
178 | }
179 |
180 | Write-Emoji "🎨" "Running black code formatting..." -Color Yellow
181 | & $blackCmd . --exclude="test_simulation_files/" --exclude=".zen_venv/"
182 | if ($LASTEXITCODE -ne 0) {
183 | throw "Black formatting failed"
184 | }
185 |
186 | Write-Emoji "📦" "Running import sorting with isort..." -Color Yellow
187 | & $isortCmd . --skip-glob=".zen_venv/*" --skip-glob="test_simulation_files/*"
188 | if ($LASTEXITCODE -ne 0) {
189 | throw "Import sorting failed"
190 | }
191 |
192 | Write-Emoji "✅" "Verifying all linting passes..." -Color Yellow
193 | & $ruffCmd check --exclude test_simulation_files --exclude .zen_venv
194 | if ($LASTEXITCODE -ne 0) {
195 | throw "Final linting verification failed"
196 | }
197 |
198 | Write-Emoji "✅" "Step 1 Complete: All linting and formatting checks passed!" -Color Green
199 | } catch {
200 | Write-Emoji "❌" "Step 1 Failed: Linting and formatting checks failed" -Color Red
201 | Write-ColorText "Error: $_" -Color Red
202 | exit 1
203 | }
204 | } else {
205 | Write-Emoji "⏭️" "Skipping linting and formatting checks" -Color Yellow
206 | }
207 |
208 | Write-Host ""
209 |
210 | # Step 2: Unit Tests
211 | if (!$SkipTests) {
212 | Write-Emoji "🧪" "Step 2: Running Complete Unit Test Suite" -Color Cyan
213 | Write-ColorText "---------------------------------------------" -Color Cyan
214 |
215 | try {
216 | Write-Emoji "🏃" "Running unit tests (excluding integration tests)..." -Color Yellow
217 |
218 | $pytestArgs = @("tests/", "-v", "-x", "-m", "not integration")
219 | if ($VerboseOutput) {
220 | $pytestArgs += "--verbose"
221 | }
222 |
223 | & $pythonCmd -m pytest @pytestArgs
224 | if ($LASTEXITCODE -ne 0) {
225 | throw "Unit tests failed"
226 | }
227 |
228 | Write-Emoji "✅" "Step 2 Complete: All unit tests passed!" -Color Green
229 | } catch {
230 | Write-Emoji "❌" "Step 2 Failed: Unit tests failed" -Color Red
231 | Write-ColorText "Error: $_" -Color Red
232 | exit 1
233 | }
234 | } else {
235 | Write-Emoji "⏭️" "Skipping unit tests" -Color Yellow
236 | }
237 |
238 | Write-Host ""
239 |
240 | # Step 3: Final Summary
241 | Write-Emoji "🎉" "All Code Quality Checks Passed!" -Color Green
242 | Write-ColorText "==================================" -Color Green
243 |
244 | if (!$SkipLinting) {
245 | Write-Emoji "✅" "Linting (ruff): PASSED" -Color Green
246 | Write-Emoji "✅" "Formatting (black): PASSED" -Color Green
247 | Write-Emoji "✅" "Import sorting (isort): PASSED" -Color Green
248 | } else {
249 | Write-Emoji "⏭️" "Linting: SKIPPED" -Color Yellow
250 | }
251 |
252 | if (!$SkipTests) {
253 | Write-Emoji "✅" "Unit tests: PASSED" -Color Green
254 | } else {
255 | Write-Emoji "⏭️" "Unit tests: SKIPPED" -Color Yellow
256 | }
257 |
258 | Write-Host ""
259 | Write-Emoji "🚀" "Your code is ready for commit and GitHub Actions!" -Color Green
260 | Write-Emoji "💡" "Remember to add simulator tests if you modified tools" -Color Yellow
261 |
```
--------------------------------------------------------------------------------
/docs/tools/analyze.md:
--------------------------------------------------------------------------------
```markdown
1 | # Analyze Tool - Smart File Analysis
2 |
3 | **General-purpose code understanding and exploration through workflow-driven investigation**
4 |
5 | The `analyze` tool provides comprehensive code analysis and understanding capabilities, helping you explore codebases, understand architecture, and identify patterns across files and directories. This workflow tool guides Claude through systematic investigation of code structure, patterns, and architectural decisions across multiple steps, gathering comprehensive insights before providing expert analysis.
6 |
7 | ## Thinking Mode
8 |
9 | **Default is `medium` (8,192 tokens).** Use `high` for architecture analysis (comprehensive insights worth the cost) or `low` for quick file overviews (save ~6k tokens).
10 |
11 | ## How the Workflow Works
12 |
13 | The analyze tool implements a **structured workflow** for thorough code understanding:
14 |
15 | **Investigation Phase (Claude-Led):**
16 | 1. **Step 1**: Claude describes the analysis plan and begins examining code structure
17 | 2. **Step 2+**: Claude investigates architecture, patterns, dependencies, and design decisions
18 | 3. **Throughout**: Claude tracks findings, relevant files, insights, and confidence levels
19 | 4. **Completion**: Once analysis is comprehensive, Claude signals completion
20 |
21 | **Expert Analysis Phase:**
22 | After Claude completes the investigation (unless confidence is **certain**):
23 | - Complete analysis summary with all findings
24 | - Architectural insights and pattern identification
25 | - Strategic improvement recommendations
26 | - Final expert assessment based on investigation
27 |
28 | This workflow ensures methodical analysis before expert insights, resulting in deeper understanding and more valuable recommendations.
29 |
30 | ## Example Prompts
31 |
32 | **Basic Usage:**
33 | ```
34 | "Use gemini to analyze main.py to understand how it works"
35 | "Get gemini to do an architecture analysis of the src/ directory"
36 | ```
37 |
38 | ## Key Features
39 |
40 | - **Analyzes single files or entire directories** with intelligent file filtering
41 | - **Supports specialized analysis types**: architecture, performance, security, quality, general
42 | - **Uses file paths (not content) for clean terminal output** while processing full content
43 | - **Can identify patterns, anti-patterns, and refactoring opportunities**
44 | - **Large codebase support**: Handle massive codebases with 1M token context models
45 | - **Cross-file relationship mapping**: Understand dependencies and interactions
46 | - **Architecture visualization**: Describe system structure and component relationships
47 | - **Image support**: Analyze architecture diagrams, UML charts, flowcharts: `"Analyze this system diagram with gemini to understand the data flow and identify bottlenecks"`
48 | - **Web search capability**: Automatically requests Claude to perform web searches when fresh documentation, patterns, or best practices are needed, ensuring the analysis stays current
49 |
50 | ## Tool Parameters
51 |
52 | **Workflow Investigation Parameters (used during step-by-step process):**
53 | - `step`: Current investigation step description (required for each step)
54 | - `step_number`: Current step number in analysis sequence (required)
55 | - `total_steps`: Estimated total investigation steps (adjustable)
56 | - `next_step_required`: Whether another investigation step is needed
57 | - `findings`: Discoveries and insights collected in this step (required)
58 | - `files_checked`: All files examined during investigation
59 | - `relevant_files`: Files directly relevant to the analysis (required in step 1)
60 | - `relevant_context`: Methods/functions/classes central to analysis findings
61 | - `issues_found`: Issues or concerns identified with severity levels
62 | - `confidence`: Confidence level in analysis completeness (exploring/low/medium/high/certain)
63 | - `images`: Visual references for analysis context
64 |
65 | **Initial Configuration (used in step 1):**
66 | - `prompt`: What to analyze or look for (required)
67 | - `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
68 | - `analysis_type`: architecture|performance|security|quality|general (default: general)
69 | - `output_format`: summary|detailed|actionable (default: detailed)
70 | - `temperature`: Temperature for analysis (0-1, default 0.2)
71 | - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
72 | - `use_assistant_model`: Whether to use expert analysis phase (default: true, set to false to use Claude only)
73 | - `continuation_id`: Continue previous analysis sessions
74 |
75 | ## Analysis Types
76 |
77 | **General Analysis (default):**
78 | - Overall code structure and organization
79 | - Key components and their responsibilities
80 | - Data flow and control flow
81 | - Design patterns and architectural decisions
82 |
83 | **Architecture Analysis:**
84 | - System-level design and component relationships
85 | - Module dependencies and coupling
86 | - Separation of concerns and layering
87 | - Scalability and maintainability considerations
88 |
89 | **Performance Analysis:**
90 | - Potential bottlenecks and optimization opportunities
91 | - Algorithmic complexity assessment
92 | - Memory usage patterns
93 | - I/O and database interaction efficiency
94 |
95 | **Security Analysis:**
96 | - Security patterns and potential vulnerabilities
97 | - Input validation and sanitization
98 | - Authentication and authorization mechanisms
99 | - Data protection and privacy considerations
100 |
101 | **Quality Analysis:**
102 | - Code quality metrics and maintainability
103 | - Testing coverage and patterns
104 | - Documentation completeness
105 | - Best practices adherence
106 |
107 | ## Usage Examples
108 |
109 | **Single File Analysis:**
110 | ```
111 | "Analyze user_controller.py to understand the authentication flow with gemini"
112 | ```
113 |
114 | **Directory Architecture Analysis:**
115 | ```
116 | "Use pro to analyze the src/ directory architecture and identify the main components"
117 | ```
118 |
119 | **Performance-Focused Analysis:**
120 | ```
121 | "Analyze backend/api/ for performance bottlenecks with o3, focus on database queries"
122 | ```
123 |
124 | **Security Assessment:**
125 | ```
126 | "Use gemini pro to analyze the authentication module for security patterns and potential issues"
127 | ```
128 |
129 | **Visual + Code Analysis:**
130 | ```
131 | "Analyze this system architecture diagram along with the src/core/ implementation to understand the data flow"
132 | ```
133 |
134 | **Large Codebase Analysis:**
135 | ```
136 | "Analyze the entire project structure with gemini pro to understand how all components work together"
137 | ```
138 |
139 | ## Output Formats
140 |
141 | **Summary Format:**
142 | - High-level overview with key findings
143 | - Main components and their purposes
144 | - Critical insights and recommendations
145 |
146 | **Detailed Format (default):**
147 | - Comprehensive analysis with specific examples
148 | - Code snippets and file references
149 | - Detailed explanations of patterns and structures
150 |
151 | **Actionable Format:**
152 | - Specific recommendations and next steps
153 | - Prioritized list of improvements
154 | - Implementation guidance and examples
155 |
156 | ## Best Practices
157 |
158 | - **Be specific about goals**: Clearly state what you want to understand or discover
159 | - **Use appropriate analysis types**: Choose the type that matches your needs
160 | - **Include related files**: Analyze modules together for better context understanding
161 | - **Leverage large context models**: Use Gemini Pro for comprehensive codebase analysis
162 | - **Combine with visual context**: Include architecture diagrams or documentation
163 | - **Use continuation**: Build on previous analysis for deeper understanding
164 |
165 | ## Advanced Features
166 |
167 | **Large Codebase Support:**
168 | With models like Gemini Pro (1M context), you can analyze extensive codebases:
169 | ```
170 | "Analyze the entire microservices architecture across all service directories"
171 | ```
172 |
173 | **Cross-File Relationship Mapping:**
174 | Understand how components interact across multiple files:
175 | ```
176 | "Analyze the data processing pipeline across input/, processing/, and output/ directories"
177 | ```
178 |
179 | **Pattern Recognition:**
180 | Identify design patterns, anti-patterns, and architectural decisions:
181 | ```
182 | "Analyze src/ to identify all design patterns used and assess their implementation quality"
183 | ```
184 |
185 | **Web Search Enhancement:**
186 | The tool can recommend searches for current best practices and documentation:
187 | ```
188 | After analysis: "Recommended searches for Claude: 'FastAPI async best practices 2024', 'SQLAlchemy ORM performance optimization patterns'"
189 | ```
190 |
191 | ## When to Use Analyze vs Other Tools
192 |
193 | - **Use `analyze`** for: Understanding code structure, exploring unfamiliar codebases, architecture assessment
194 | - **Use `codereview`** for: Finding bugs and security issues with actionable fixes
195 | - **Use `debug`** for: Diagnosing specific runtime errors or performance problems
196 | - **Use `refactor`** for: Getting specific refactoring recommendations and implementation plans
197 | - **Use `chat`** for: Open-ended discussions about code without structured analysis
198 |
```
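The parameter list documented above maps directly onto a workflow tool call. Below is a minimal sketch of what a first `analyze` step could look like as a request payload; the parameter names come from the documentation above, while `mcp_client.call_tool` and the file paths are illustrative placeholders, not part of this repository.

```python
# Hypothetical step-1 "analyze" arguments assembled from the documented parameters.
# The client object and paths are assumptions for illustration only.
step_one_arguments = {
    "step": "Map the src/ package layout and identify entry points",
    "step_number": 1,
    "total_steps": 3,                       # estimate; adjustable in later steps
    "next_step_required": True,
    "findings": "Initial survey: three sub-packages and one CLI entry point",
    "relevant_files": ["/abs/path/src/"],   # required in step 1
    "prompt": "Understand the overall architecture of src/",
    "analysis_type": "architecture",
    "output_format": "detailed",
    "model": "pro",
}

# response = mcp_client.call_tool("analyze", step_one_arguments)  # assumed client API
```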
--------------------------------------------------------------------------------
/systemprompts/testgen_prompt.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | TestGen tool system prompt
3 | """
4 |
5 | TESTGEN_PROMPT = """
6 | ROLE
7 | You are a principal software engineer who specialises in writing bullet-proof production code **and** surgical,
8 | high-signal test suites. You reason about control flow, data flow, mutation, concurrency, failure modes, and security
9 | in equal measure. Your mission: design and write tests that surface real-world defects before code ever leaves CI.
10 |
11 | CRITICAL LINE NUMBER INSTRUCTIONS
12 | Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be
 13 | included in any code you generate. Always reference specific line numbers in your replies when you need to point
 14 | to exact locations, and include a very short code excerpt alongside for clarity.
 15 | Include context_start_text and context_end_text as backup references. Never include "LINE│" markers in generated code
 16 | snippets.
17 |
18 | IF MORE INFORMATION IS NEEDED
19 | If you need additional context (e.g., test framework details, dependencies, existing test patterns) to provide
20 | accurate test generation, you MUST respond ONLY with this JSON format (and nothing else). Do NOT ask for the
21 | same file you've been provided unless for some reason its content is missing or incomplete:
22 | {
23 | "status": "files_required_to_continue",
24 | "mandatory_instructions": "<your critical instructions for the agent>",
25 | "files_needed": ["[file name here]", "[or some folder/]"]
26 | }
27 |
28 | MULTI-AGENT WORKFLOW
29 | You sequentially inhabit five expert personas—each passes a concise artefact to the next:
30 |
31 | 1. **Context Profiler** – derives language(s), test framework(s), build tooling, domain constraints, and existing
32 | test idioms from the code snapshot provided.
33 | 2. **Path Analyzer** – builds a map of reachable code paths (happy, error, exceptional) plus any external interactions
34 | that are directly involved (network, DB, file-system, IPC).
35 | 3. **Adversarial Thinker** – enumerates realistic failures, boundary conditions, race conditions, and misuse patterns
36 | that historically break similar systems.
37 | 4. **Risk Prioritizer** – ranks findings by production impact and likelihood; discards speculative or
38 | out-of-scope cases.
39 | 5. **Test Scaffolder** – produces deterministic, isolated tests that follow the *project's* conventions (assert style,
40 | fixture layout, naming, any mocking strategy, language and tooling etc).
41 |
42 | TEST-GENERATION STRATEGY
43 | - If a specific test, function, class, or scenario is **explicitly** requested by the agent, focus ONLY on that specific
44 | request and do not generate broader test coverage unless explicitly asked to do so.
45 | - Start from public API / interface boundaries, then walk inward to critical private helpers.
46 | - Analyze function signatures, parameters, return types, and side effects
47 | - Map all code paths including happy paths and error conditions
48 | - Test behaviour, not implementation details, unless white-box inspection is required to reach untestable paths.
49 | - Include both positive and negative test cases
50 | - Prefer property-based or table-driven tests where inputs form simple algebraic domains.
51 | - Stub or fake **only** the minimal surface area needed; prefer in-memory fakes over mocks when feasible.
52 | - Flag any code that cannot be tested deterministically and suggest realistic refactors (seams, dependency injection,
53 | pure functions).
54 | - Surface concurrency hazards with stress or fuzz tests when the language/runtime supports them.
55 | - Focus on realistic failure modes that actually occur in production
56 | - Remain within scope of language, framework, project. Do not over-step. Do not add unnecessary dependencies.
57 | - No bogus, fake tests that seemingly pass for no reason at all
58 |
59 | EDGE-CASE TAXONOMY (REAL-WORLD, HIGH-VALUE)
60 | - **Data Shape Issues**: `null` / `undefined`, zero-length, surrogate-pair emojis, malformed UTF-8, mixed EOLs.
61 | - **Numeric Boundaries**: −1, 0, 1, `MAX_…`, floating-point rounding, 64-bit truncation.
62 | - **Temporal Pitfalls**: DST shifts, leap seconds, 29 Feb, Unix epoch 2038, timezone conversions.
63 | - **Collections & Iteration**: off-by-one, concurrent modification, empty vs singleton vs large (>10⁶ items).
64 | - **State & Sequence**: API calls out of order, idempotency violations, replay attacks.
65 | - **External Dependencies**: slow responses, 5xx, malformed JSON/XML, TLS errors, retry storms, cancelled promises.
66 | - **Concurrency / Async**: race conditions, deadlocks, promise rejection leaks, thread starvation.
67 | - **Resource Exhaustion**: memory spikes, file-descriptor leaks, connection-pool saturation.
68 | - **Locale & Encoding**: RTL scripts, uncommon locales, locale-specific formatting.
69 | - **Security Surfaces**: injection (SQL, shell, LDAP), path traversal, privilege escalation on shared state.
70 |
71 | TEST QUALITY PRINCIPLES
72 | - Clear Arrange-Act-Assert sections (or given/when/then per project style) but retain and apply project norms, language
73 | norms and framework norms and best practices.
74 | - One behavioural assertion per test unless grouping is conventional.
75 | - Fast: sub-100 ms/unit test; parallelisable; no remote calls.
76 | - Deterministic: seeded randomness only; fixed stable clocks when time matters.
77 | - Self-documenting: names read like specs; failures explain *why*, not just *what*.
78 |
79 | FRAMEWORK SELECTION
80 | Always autodetect from the repository. When a test framework or existing tests are not found, detect from existing
81 | code; examples:
82 | - **Swift / Objective-C** → XCTest (Xcode default) or Swift Testing (Apple provided frameworks)
83 | - **C# / .NET** → xUnit.net preferred; fall back to NUnit or MSTest if they dominate the repo.
84 | - **C / C++** → GoogleTest (gtest/gmock) or Catch2, matching existing tooling.
85 | - **JS/TS** → Jest, Vitest, Mocha, or project-specific wrapper.
86 | - **Python** → pytest, unittest.
87 | - **Java/Kotlin** → JUnit 5, TestNG.
88 | - **Go** → built-in `testing`, `testify`.
89 | - **Rust** → `#[test]`, `proptest`.
90 | - **Anything Else** → follow existing conventions; never introduce a new framework without strong justification.
91 |
92 | IF FRAMEWORK SELECTION FAILS
93 | If you are unable to confidently determine which framework to use based on the existing test samples supplied, or if
94 | additional test samples would help in making a final decision, you MUST respond ONLY with this JSON
95 | format (and nothing else). Do NOT ask for the same file you've been provided unless for some reason its content
96 | is missing or incomplete:
97 | {"status": "test_sample_needed", "reason": "<brief reason why additional sampling is required>"}
98 |
99 | SCOPE CONTROL
100 | Stay strictly within the presented codebase, tech stack, and domain.
101 | Do **not** invent features, frameworks, or speculative integrations.
102 | Do **not** write tests for functions or classes that do not exist.
103 | If a test idea falls outside project scope, discard it.
104 | If a test would be "good to have" but seems impossible given the current structure and setup of the project, highlight
105 | it but do not attempt it or offer refactoring ideas.
106 |
107 | DELIVERABLE
108 | Return only the artefacts (analysis summary, coverage plan, and generated tests) that fit the detected framework
109 | and code / project layout.
110 | Group related tests but separate them into files where this is the convention and most suitable for the project at hand.
111 | Prefer adding tests to an existing test file if one was provided and grouping these tests makes sense.
112 | Must document logic, test reason/hypothesis in delivered code.
113 | MUST NOT add any additional information, introduction, or summaries around generated code. Deliver only the essentials
114 | relevant to the test.
115 |
116 | IF ADDITIONAL TEST CASES ARE REQUIRED
117 | If you determine that comprehensive test coverage requires generating multiple test files or a large number of
118 | test cases for each file that would risk exceeding context limits, you MUST follow this structured approach:
119 |
120 | 1. **Generate Essential Tests First**: Create only the most critical and high-impact tests (typically 3-5 key test
121 | cases covering the most important paths and failure modes). Clearly state the file these tests belong to, even if
122 | these should be added to an existing test file.
123 |
124 | 2. **Request Continuation**: You MUST end your message with the following added in JSON format (and nothing
125 | more after this). This will list the pending tests and their respective files (even if they belong to the same or
126 | an existing test file) as this will be used for the next follow-up test generation request.
127 | {"status": "more_tests_required",
128 | "pending_tests": "test_name (file_name), another_test_name (file_name)"}
129 |
130 | This approach ensures comprehensive test coverage while maintaining quality and avoiding context overflow.
131 |
132 | Remember: your value is catching the hard bugs—not inflating coverage numbers.
133 | """
134 |
```
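The prompt above defines three machine-readable status payloads (`files_required_to_continue`, `test_sample_needed`, `more_tests_required`). A minimal sketch of how a caller might branch on these statuses is shown below, assuming the model reply has already been captured as a string; the helper function is illustrative and not taken from this repository.

```python
import json


def classify_testgen_reply(reply_text: str) -> str:
    """Classify a TestGen reply using the JSON status protocols defined in
    TESTGEN_PROMPT; plain (non-JSON) output is treated as generated tests."""
    try:
        payload = json.loads(reply_text)
    except json.JSONDecodeError:
        return "tests_generated"  # normal case: reply contains tests/analysis

    if not isinstance(payload, dict):
        return "tests_generated"

    status = payload.get("status")
    if status == "files_required_to_continue":
        # payload["files_needed"] lists the extra files or folders to supply
        return status
    if status == "test_sample_needed":
        # payload["reason"] explains why additional test samples are required
        return status
    if status == "more_tests_required":
        # payload["pending_tests"] lists remaining tests and their files
        return status
    return "tests_generated"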
--------------------------------------------------------------------------------
/tests/test_providers.py:
--------------------------------------------------------------------------------
```python
1 | """Tests for the model provider abstraction system"""
2 |
3 | import os
4 | from unittest.mock import Mock, patch
5 |
6 | import pytest
7 |
8 | from providers import ModelProviderRegistry, ModelResponse
9 | from providers.gemini import GeminiModelProvider
10 | from providers.openai import OpenAIModelProvider
11 | from providers.shared import ProviderType
12 |
13 |
14 | class TestModelProviderRegistry:
15 | """Test the model provider registry"""
16 |
17 | def setup_method(self):
18 | """Clear registry before each test"""
19 | # Store the original providers to restore them later
20 | registry = ModelProviderRegistry()
21 | self._original_providers = registry._providers.copy()
22 | registry._providers.clear()
23 | registry._initialized_providers.clear()
24 |
25 | def teardown_method(self):
26 | """Restore original providers after each test"""
27 | # Restore the original providers that were registered in conftest.py
28 | registry = ModelProviderRegistry()
29 | registry._providers.clear()
30 | registry._initialized_providers.clear()
31 | registry._providers.update(self._original_providers)
32 |
33 | def test_register_provider(self):
34 | """Test registering a provider"""
35 | ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
36 |
37 | registry = ModelProviderRegistry()
38 | assert ProviderType.GOOGLE in registry._providers
39 | assert registry._providers[ProviderType.GOOGLE] == GeminiModelProvider
40 |
41 | @patch.dict(os.environ, {"GEMINI_API_KEY": "test-key"})
42 | def test_get_provider(self):
43 | """Test getting a provider instance"""
44 | ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
45 |
46 | provider = ModelProviderRegistry.get_provider(ProviderType.GOOGLE)
47 |
48 | assert provider is not None
49 | assert isinstance(provider, GeminiModelProvider)
50 | assert provider.api_key == "test-key"
51 |
52 | @patch.dict(os.environ, {}, clear=True)
53 | def test_get_provider_no_api_key(self):
54 | """Test getting provider without API key returns None"""
55 | ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
56 |
57 | provider = ModelProviderRegistry.get_provider(ProviderType.GOOGLE)
58 |
59 | assert provider is None
60 |
61 | @patch.dict(os.environ, {"GEMINI_API_KEY": "test-key"})
62 | @pytest.mark.no_mock_provider
63 | def test_get_provider_for_model(self):
64 | """Test getting provider for a specific model"""
65 | ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
66 |
67 | provider = ModelProviderRegistry.get_provider_for_model("gemini-2.5-flash")
68 |
69 | assert provider is not None
70 | assert isinstance(provider, GeminiModelProvider)
71 |
72 | def test_get_available_providers(self):
73 | """Test getting list of available providers"""
74 | ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
75 | ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
76 |
77 | providers = ModelProviderRegistry.get_available_providers()
78 |
79 | assert len(providers) == 2
80 | assert ProviderType.GOOGLE in providers
81 | assert ProviderType.OPENAI in providers
82 |
83 |
84 | class TestGeminiProvider:
85 | """Test Gemini model provider"""
86 |
87 | def test_provider_initialization(self):
88 | """Test provider initialization"""
89 | provider = GeminiModelProvider(api_key="test-key")
90 |
91 | assert provider.api_key == "test-key"
92 | assert provider.get_provider_type() == ProviderType.GOOGLE
93 |
94 | def test_get_capabilities(self):
95 | """Test getting model capabilities"""
96 | provider = GeminiModelProvider(api_key="test-key")
97 |
98 | capabilities = provider.get_capabilities("gemini-2.5-flash")
99 |
100 | assert capabilities.provider == ProviderType.GOOGLE
101 | assert capabilities.model_name == "gemini-2.5-flash"
102 | assert capabilities.context_window == 1_048_576
103 | assert capabilities.supports_extended_thinking
104 |
105 | def test_get_capabilities_pro_model(self):
106 | """Test getting capabilities for Pro model with thinking support"""
107 | provider = GeminiModelProvider(api_key="test-key")
108 |
109 | capabilities = provider.get_capabilities("gemini-2.5-pro")
110 |
111 | assert capabilities.supports_extended_thinking
112 |
113 | def test_model_shorthand_resolution(self):
114 | """Test model shorthand resolution"""
115 | provider = GeminiModelProvider(api_key="test-key")
116 |
117 | assert provider.validate_model_name("flash")
118 | assert provider.validate_model_name("pro")
119 |
120 | capabilities = provider.get_capabilities("flash")
121 | assert capabilities.model_name == "gemini-2.5-flash"
122 |
123 | @patch("google.genai.Client")
124 | def test_generate_content(self, mock_client_class):
125 | """Test content generation"""
126 | # Mock the client
127 | mock_client = Mock()
128 | mock_response = Mock()
129 | mock_response.text = "Generated content"
130 | # Mock candidates for finish_reason
131 | mock_candidate = Mock()
132 | mock_candidate.finish_reason = "STOP"
133 | mock_response.candidates = [mock_candidate]
134 | # Mock usage metadata
135 | mock_usage = Mock()
136 | mock_usage.prompt_token_count = 10
137 | mock_usage.candidates_token_count = 20
138 | mock_response.usage_metadata = mock_usage
139 | mock_client.models.generate_content.return_value = mock_response
140 | mock_client_class.return_value = mock_client
141 |
142 | provider = GeminiModelProvider(api_key="test-key")
143 |
144 | response = provider.generate_content(prompt="Test prompt", model_name="gemini-2.5-flash", temperature=0.7)
145 |
146 | assert isinstance(response, ModelResponse)
147 | assert response.content == "Generated content"
148 | assert response.model_name == "gemini-2.5-flash"
149 | assert response.provider == ProviderType.GOOGLE
150 | assert response.usage["input_tokens"] == 10
151 | assert response.usage["output_tokens"] == 20
152 | assert response.usage["total_tokens"] == 30
153 |
154 |
155 | class TestOpenAIProvider:
156 | """Test OpenAI model provider"""
157 |
158 | def setup_method(self):
159 | """Clear restriction service cache before each test"""
160 | import utils.model_restrictions
161 |
162 | utils.model_restrictions._restriction_service = None
163 |
164 | def teardown_method(self):
165 | """Clear restriction service cache after each test"""
166 | import utils.model_restrictions
167 |
168 | utils.model_restrictions._restriction_service = None
169 |
170 | def test_provider_initialization(self):
171 | """Test provider initialization"""
172 | provider = OpenAIModelProvider(api_key="test-key", organization="test-org")
173 |
174 | assert provider.api_key == "test-key"
175 | assert provider.organization == "test-org"
176 | assert provider.get_provider_type() == ProviderType.OPENAI
177 |
178 | def test_get_capabilities_o3(self):
179 | """Test getting O3 model capabilities"""
180 | provider = OpenAIModelProvider(api_key="test-key")
181 |
182 | capabilities = provider.get_capabilities("o3-mini")
183 |
184 | assert capabilities.provider == ProviderType.OPENAI
185 | assert capabilities.model_name == "o3-mini"
186 | assert capabilities.context_window == 200_000
187 | assert not capabilities.supports_extended_thinking
188 |
189 | def test_get_capabilities_o4_mini(self):
190 | """Test getting O4-mini model capabilities"""
191 | provider = OpenAIModelProvider(api_key="test-key")
192 |
193 | capabilities = provider.get_capabilities("o4-mini")
194 |
195 | assert capabilities.provider == ProviderType.OPENAI
196 | assert capabilities.model_name == "o4-mini"
197 | assert capabilities.context_window == 200_000
198 | assert not capabilities.supports_extended_thinking
199 | # Check temperature constraint is fixed at 1.0
200 | assert capabilities.temperature_constraint.value == 1.0
201 |
202 | def test_validate_model_names(self):
203 | """Test model name validation"""
204 | provider = OpenAIModelProvider(api_key="test-key")
205 |
206 | assert provider.validate_model_name("o3")
207 | assert provider.validate_model_name("o3mini")
208 | assert provider.validate_model_name("o3-mini") # Backwards compatibility
209 | assert provider.validate_model_name("o4-mini")
210 | assert provider.validate_model_name("o4mini")
211 | assert provider.validate_model_name("o4-mini")
212 | assert not provider.validate_model_name("gpt-4o")
213 | assert not provider.validate_model_name("invalid-model")
214 |
215 | def test_openai_models_do_not_support_extended_thinking(self):
216 | """OpenAI catalogue exposes extended thinking capability via ModelCapabilities."""
217 | provider = OpenAIModelProvider(api_key="test-key")
218 |
219 | aliases = ["o3", "o3mini", "o3-mini", "o4-mini", "o4mini"]
220 | for alias in aliases:
221 | assert not provider.get_capabilities(alias).supports_extended_thinking
222 |
```
--------------------------------------------------------------------------------
/docs/tools/codereview.md:
--------------------------------------------------------------------------------
```markdown
1 | # CodeReview Tool - Professional Code Review
2 |
3 | **Comprehensive code analysis with prioritized feedback through workflow-driven investigation**
4 |
5 | The `codereview` tool provides professional code review capabilities with actionable feedback, severity-based issue prioritization, and support for various review types from quick style checks to comprehensive security audits. This workflow tool guides Claude through systematic investigation steps with forced pauses between each step to ensure thorough code examination, issue identification, and quality assessment before providing expert analysis.
6 |
7 | ## Thinking Mode
8 |
9 | **Default is `medium` (8,192 tokens).** Use `high` for security-critical code (worth the extra tokens) or `low` for quick style checks (saves ~6k tokens).
10 |
11 | ## How the Workflow Works
12 |
13 | The codereview tool implements a **structured workflow** that ensures thorough code examination:
14 |
15 | **Investigation Phase (Claude-Led):**
16 | 1. **Step 1**: Claude describes the review plan and begins systematic analysis of code structure
17 | 2. **Step 2+**: Claude examines code quality, security implications, performance concerns, and architectural patterns
18 | 3. **Throughout**: Claude tracks findings, relevant files, issues, and confidence levels
19 | 4. **Completion**: Once review is comprehensive, Claude signals completion
20 |
21 | **Expert Analysis Phase:**
22 | After Claude completes the investigation (unless confidence is **certain**):
23 | - Complete review summary with all findings and evidence
24 | - Relevant files and code patterns identified
25 | - Issues categorized by severity levels
26 | - Final recommendations based on investigation
27 |
28 | **Special Note**: If you want Claude to perform the entire review without calling another model, you can include "don't use any other model" in your prompt, and Claude will complete the full workflow independently.
29 |
30 | ## Model Recommendation
31 |
32 | This tool particularly benefits from Gemini Pro or Flash models due to their 1M context window, which allows comprehensive analysis of large codebases. Claude's context limitations make it challenging to see the "big picture" in complex projects - this is a concrete example where utilizing a secondary model with larger context provides significant value beyond just experimenting with different AI capabilities.
33 |
34 | ## Example Prompts
35 |
36 | ```
37 | Perform a codereview with gemini pro and review auth.py for security issues and potential vulnerabilities.
38 | I need an actionable plan but break it down into smaller quick-wins that we can implement and test rapidly
39 | ```
40 |
41 | ## Pro Tip: Multiple Parallel Reviews
42 |
43 | **You can start more than one codereview session with Claude:**
44 |
45 | ```
46 | Start separate sub-tasks for codereview one with o3 finding critical issues and one with flash finding low priority issues
47 | and quick-wins and give me the final single combined review highlighting only the critical issues
48 | ```
49 |
50 | The above prompt will simultaneously run two separate `codereview` tools with two separate models and combine the output into a single summary for you to consume.
51 |
52 | ## Key Features
53 |
54 | - **Issues prioritized by severity** (🔴 CRITICAL → 🟢 LOW)
55 | - **Supports specialized reviews**: security, performance, quick
56 | - **Coding standards enforcement**: `"Use gemini to review src/ against PEP8 standards"`
57 | - **Severity filtering**: `"Get gemini to review auth/ - only report critical vulnerabilities"`
58 | - **Image support**: Review code from screenshots, error dialogs, or visual bug reports: `"Review this error screenshot and the related auth.py file for potential security issues"`
59 | - **Multi-file analysis**: Comprehensive review of entire directories or codebases
60 | - **Actionable feedback**: Specific recommendations with line numbers and code examples
61 | - **Language-specific expertise**: Tailored analysis for Python, JavaScript, Java, C#, Swift, and more
62 | - **Integration issue detection**: Identifies cross-file dependencies and architectural problems
63 | - **Security vulnerability scanning**: Focused on common security patterns and anti-patterns
64 |
65 | ## Tool Parameters
66 |
67 | **Workflow Investigation Parameters (used during step-by-step process):**
68 | - `step`: Current investigation step description (required for each step)
69 | - `step_number`: Current step number in review sequence (required)
70 | - `total_steps`: Estimated total investigation steps (adjustable)
71 | - `next_step_required`: Whether another investigation step is needed
72 | - `findings`: Discoveries and evidence collected in this step (required)
73 | - `files_checked`: All files examined during investigation
74 | - `relevant_files`: Files directly relevant to the review (required in step 1)
75 | - `relevant_context`: Methods/functions/classes central to review findings
76 | - `issues_found`: Issues identified with severity levels
77 | - `confidence`: Confidence level in review completeness (exploring/low/medium/high/certain)
78 | - `images`: Visual references for review context
79 |
80 | **Initial Review Configuration (used in step 1):**
81 | - `prompt`: User's summary of what the code does, expected behavior, constraints, and review objectives (required)
82 | - `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
83 | - `review_type`: full|security|performance|quick (default: full)
84 | - `focus_on`: Specific aspects to focus on (e.g., "security vulnerabilities", "performance bottlenecks")
85 | - `standards`: Coding standards to enforce (e.g., "PEP8", "ESLint", "Google Style Guide")
86 | - `severity_filter`: critical|high|medium|low|all (default: all)
87 | - `temperature`: Temperature for consistency (0-1, default 0.2)
88 | - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
89 | - `use_assistant_model`: Whether to use expert analysis phase (default: true, set to false to use Claude only)
90 | - `continuation_id`: Continue previous review discussions
91 |
92 | ## Review Types
93 |
94 | **Full Review (default):**
95 | - Comprehensive analysis including bugs, security, performance, maintainability
96 | - Best for new features or significant code changes
97 |
98 | **Security Review:**
99 | - Focused on security vulnerabilities and attack vectors
100 | - Checks for common security anti-patterns
101 | - Best for authentication, authorization, data handling code
102 |
103 | **Performance Review:**
104 | - Analyzes performance bottlenecks and optimization opportunities
105 | - Memory usage, algorithmic complexity, resource management
106 | - Best for performance-critical code paths
107 |
108 | **Quick Review:**
109 | - Fast style and basic issue check
110 | - Lower token usage for rapid feedback
111 | - Best for code formatting and simple validation
112 |
113 | ## Severity Levels
114 |
115 | Issues are categorized and prioritized:
116 |
117 | - **🔴 CRITICAL**: Security vulnerabilities, crashes, data corruption
118 | - **🟠 HIGH**: Logic errors, performance issues, reliability problems
119 | - **🟡 MEDIUM**: Code smells, maintainability issues, minor bugs
120 | - **🟢 LOW**: Style issues, documentation, minor improvements
121 |
122 | ## Usage Examples
123 |
124 | **Basic Security Review:**
125 | ```
126 | "Review the authentication module in auth/ for security vulnerabilities with gemini pro"
127 | ```
128 |
129 | **Performance-Focused Review:**
130 | ```
131 | "Use o3 to review backend/api.py for performance issues, focus on database queries and caching"
132 | ```
133 |
134 | **Quick Style Check:**
135 | ```
136 | "Quick review of utils.py with flash, only report critical and high severity issues"
137 | ```
138 |
139 | **Standards Enforcement:**
140 | ```
141 | "Review src/ directory against PEP8 standards with gemini, focus on code formatting and structure"
142 | ```
143 |
144 | **Visual Context Review:**
145 | ```
146 | "Review this authentication code along with the error dialog screenshot to understand the security implications"
147 | ```
148 |
149 | ## Best Practices
150 |
151 | - **Provide context**: Describe what the code is supposed to do and any constraints
152 | - **Use appropriate review types**: Security for auth code, performance for critical paths
153 | - **Set severity filters**: Focus on critical issues for quick wins
154 | - **Include relevant files**: Review related modules together for better context
155 | - **Use parallel reviews**: Run multiple reviews with different models for comprehensive coverage
156 | - **Follow up on findings**: Use the continuation feature to discuss specific issues in detail
157 |
158 | ## Output Format
159 |
160 | Reviews include:
161 | - **Executive Summary**: Overview of code quality and main concerns
162 | - **Detailed Findings**: Specific issues with severity levels, line numbers, and recommendations
163 | - **Quick Wins**: Easy-to-implement improvements with high impact
164 | - **Long-term Improvements**: Structural changes for better maintainability
165 | - **Security Considerations**: Specific security recommendations when relevant
166 |
167 | ## When to Use CodeReview vs Other Tools
168 |
169 | - **Use `codereview`** for: Finding bugs, security issues, performance problems, code quality assessment
170 | - **Use `analyze`** for: Understanding code structure without finding issues
171 | - **Use `debug`** for: Diagnosing specific runtime errors or exceptions
172 | - **Use `refactor`** for: Identifying structural improvements and modernization opportunities
173 |
```
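The severity scale above is an ordered taxonomy. If you post-process review output yourself (for example to honour a "critical and high only" request like the one in the examples), a tiny helper such as this purely illustrative sketch keeps the ordering explicit; it is not part of the tool:

```python
from enum import IntEnum


class Severity(IntEnum):
    """Ordering mirrors the LOW < MEDIUM < HIGH < CRITICAL scale above."""
    LOW = 1
    MEDIUM = 2
    HIGH = 3
    CRITICAL = 4


def filter_findings(findings: list[dict], minimum: Severity) -> list[dict]:
    """Keep only findings at or above the requested severity."""
    return [f for f in findings if Severity[f["severity"]] >= minimum]


urgent = filter_findings(
    [{"severity": "HIGH", "title": "Unvalidated input"},
     {"severity": "LOW", "title": "Line length"}],
    Severity.HIGH,
)  # only the HIGH finding remains
```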
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Tests for utility functions
3 | """
4 |
5 | from utils import check_token_limit, estimate_tokens, read_file_content, read_files
6 |
7 |
8 | class TestFileUtils:
9 | """Test file reading utilities"""
10 |
11 | def test_read_file_content_success(self, project_path):
12 | """Test successful file reading"""
13 | test_file = project_path / "test.py"
14 | test_file.write_text("def hello():\n return 'world'", encoding="utf-8")
15 |
16 | content, tokens = read_file_content(str(test_file))
17 | assert "--- BEGIN FILE:" in content
18 | assert "--- END FILE:" in content
19 | assert "def hello():" in content
20 | assert "return 'world'" in content
21 | assert tokens > 0 # Should have estimated tokens
22 |
23 | def test_read_file_content_not_found(self, project_path):
24 | """Test reading non-existent file"""
25 | # Use a non-existent file within the project path
26 | nonexistent = project_path / "nonexistent" / "file.py"
27 | content, tokens = read_file_content(str(nonexistent))
28 | assert "--- FILE NOT FOUND:" in content
29 | assert "Error: File does not exist" in content
30 | assert tokens > 0
31 |
32 | def test_read_file_content_safe_files_allowed(self):
33 | """Test that safe files outside the original project root are now allowed"""
34 | # In the new security model, safe files like /etc/passwd
35 | # can be read as they're not in the dangerous paths list
36 | content, tokens = read_file_content("/etc/passwd")
37 | # Should successfully read the file (with timestamp in header)
38 | assert "--- BEGIN FILE: /etc/passwd (Last modified:" in content
39 | assert "--- END FILE: /etc/passwd ---" in content
40 | assert tokens > 0
41 |
42 | def test_read_file_content_relative_path_rejected(self):
43 | """Test that relative paths are rejected"""
44 | # Try to use a relative path
45 | content, tokens = read_file_content("./some/relative/path.py")
46 | assert "--- ERROR ACCESSING FILE:" in content
47 | assert "Relative paths are not supported" in content
48 | assert tokens > 0
49 |
50 | def test_read_file_content_directory(self, project_path):
51 | """Test reading a directory"""
52 | content, tokens = read_file_content(str(project_path))
53 | assert "--- NOT A FILE:" in content
54 | assert "Error: Path is not a file" in content
55 | assert tokens > 0
56 |
57 | def test_read_files_multiple(self, project_path):
58 | """Test reading multiple files"""
59 | file1 = project_path / "file1.py"
60 | file1.write_text("print('file1')", encoding="utf-8")
61 | file2 = project_path / "file2.py"
62 | file2.write_text("print('file2')", encoding="utf-8")
63 |
64 | content = read_files([str(file1), str(file2)])
65 |
66 | assert "--- BEGIN FILE:" in content
67 | assert "file1.py" in content
68 | assert "file2.py" in content
69 | assert "print('file1')" in content
70 | assert "print('file2')" in content
71 |
72 | # Check that both files are included
73 | assert "file1.py" in content and "file2.py" in content
74 |
75 | def test_read_files_with_code(self):
76 | """Test reading with direct code"""
77 | code = "def test():\n pass"
78 | content = read_files([], code)
79 |
80 | assert "--- BEGIN DIRECT CODE ---" in content
81 | assert "--- END DIRECT CODE ---" in content
82 | assert code in content
83 |
84 | # Check that direct code is included
85 | assert code in content
86 |
87 | def test_read_files_directory_support(self, project_path):
88 | """Test reading all files from a directory"""
89 | # Create directory structure
90 | (project_path / "file1.py").write_text("print('file1')", encoding="utf-8")
91 | (project_path / "file2.js").write_text("console.log('file2')", encoding="utf-8")
92 | (project_path / "readme.md").write_text("# README", encoding="utf-8")
93 |
94 | # Create subdirectory
95 | subdir = project_path / "src"
96 | subdir.mkdir()
97 | (subdir / "module.py").write_text("class Module: pass", encoding="utf-8")
98 |
99 | # Create hidden file (should be skipped)
100 | (project_path / ".hidden").write_text("secret", encoding="utf-8")
101 |
102 | # Read the directory
103 | content = read_files([str(project_path)])
104 |
105 | # Check files are included
106 | assert "file1.py" in content
107 | assert "file2.js" in content
108 | assert "readme.md" in content
109 | # Handle both forward and backslashes for cross-platform compatibility
110 | assert "module.py" in content
111 | assert "class Module: pass" in content
112 |
113 | # Check content
114 | assert "print('file1')" in content
115 | assert "console.log('file2')" in content
116 | assert "# README" in content
117 | assert "class Module: pass" in content
118 |
119 | # Hidden file should not be included
120 | assert ".hidden" not in content
121 | assert "secret" not in content
122 |
123 | # Check that all files are included
124 | assert all(filename in content for filename in ["file1.py", "file2.js", "readme.md", "module.py"])
125 |
126 | def test_read_files_mixed_paths(self, project_path):
127 | """Test reading mix of files and directories"""
128 | # Create files
129 | file1 = project_path / "direct.py"
130 | file1.write_text("# Direct file", encoding="utf-8")
131 |
132 | # Create directory with files
133 | subdir = project_path / "subdir"
134 | subdir.mkdir()
135 | (subdir / "sub1.py").write_text("# Sub file 1", encoding="utf-8")
136 | (subdir / "sub2.py").write_text("# Sub file 2", encoding="utf-8")
137 |
138 | # Read mix of direct file and directory
139 | content = read_files([str(file1), str(subdir)])
140 |
141 | assert "direct.py" in content
142 | assert "sub1.py" in content
143 | assert "sub2.py" in content
144 | assert "# Direct file" in content
145 | assert "# Sub file 1" in content
146 | assert "# Sub file 2" in content
147 |
148 | # Check that all files are included
149 | assert all(filename in content for filename in ["direct.py", "sub1.py", "sub2.py"])
150 |
151 | def test_read_files_token_limit(self, project_path):
152 | """Test token limit handling"""
153 | # Create files with known token counts
154 | # ~250 tokens each (1000 chars)
155 | large_content = "x" * 1000
156 |
157 | for i in range(5):
158 | (project_path / f"file{i}.txt").write_text(large_content, encoding="utf-8")
159 |
160 | # Read with small token limit (should skip some files)
161 | # Reserve 50k tokens, limit to 51k total = 1k available
162 | # Each file ~250 tokens, so should read ~3-4 files
163 | content = read_files([str(project_path)], max_tokens=51_000)
164 |
165 | # Check that token limit handling is present
166 | assert "--- SKIPPED FILES (TOKEN LIMIT) ---" in content
167 |
168 | # Count how many files were read
169 | read_count = content.count("--- BEGIN FILE:")
170 | assert 2 <= read_count <= 4 # Should read some but not all
171 |
172 | def test_read_files_large_file(self, project_path):
173 | """Test handling of large files"""
174 | # Create a file larger than max_size (1MB)
175 | large_file = project_path / "large.txt"
176 | large_file.write_text("x" * 2_000_000, encoding="utf-8") # 2MB
177 |
178 | content = read_files([str(large_file)])
179 |
180 | assert "--- FILE TOO LARGE:" in content
181 | assert "2,000,000 bytes" in content
182 | # File too large message should be present
183 | assert "--- FILE TOO LARGE:" in content
184 |
185 | def test_read_files_file_extensions(self, project_path):
186 | """Test file extension filtering"""
187 | # Create various file types
188 | (project_path / "code.py").write_text("python", encoding="utf-8")
189 | (project_path / "style.css").write_text("css", encoding="utf-8")
190 | (project_path / "binary.exe").write_text("exe", encoding="utf-8")
191 | (project_path / "image.jpg").write_text("jpg", encoding="utf-8")
192 |
193 | content = read_files([str(project_path)])
194 |
195 | # Code files should be included
196 | assert "code.py" in content
197 | assert "style.css" in content
198 |
199 | # Binary files should not be included (not in CODE_EXTENSIONS)
200 | assert "binary.exe" not in content
201 | assert "image.jpg" not in content
202 |
203 |
204 | class TestTokenUtils:
205 | """Test token counting utilities"""
206 |
207 | def test_estimate_tokens(self):
208 | """Test token estimation"""
209 | # Rough estimate: 1 token ≈ 4 characters
210 | text = "a" * 400 # 400 characters
211 | assert estimate_tokens(text) == 100
212 |
213 | def test_check_token_limit_within(self):
214 | """Test token limit check - within limit"""
215 | text = "a" * 4000 # 1000 tokens
216 | within_limit, tokens = check_token_limit(text)
217 | assert within_limit is True
218 | assert tokens == 1000
219 |
220 | def test_check_token_limit_exceeded(self):
221 | """Test token limit check - exceeded"""
222 | text = "a" * 5_000_000 # 1.25M tokens
223 | within_limit, tokens = check_token_limit(text)
224 | assert within_limit is False
225 | assert tokens == 1_250_000
226 |
```
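The assertions in `TestTokenUtils` pin down the estimation heuristic fairly tightly: roughly one token per four characters, with a limit somewhere around one million tokens. A minimal sketch consistent with those assertions (the real `utils` module may differ, e.g. in the exact limit constant) would be:

```python
# Illustrative sketch inferred from the TestTokenUtils assertions above;
# not copied from the project's utils module.
MAX_CONTEXT_TOKENS = 1_000_000  # assumed ceiling; 1.25M estimated tokens must fail


def estimate_tokens(text: str) -> int:
    """Rough estimate: one token per four characters."""
    return len(text) // 4


def check_token_limit(text: str) -> tuple[bool, int]:
    """Return (within_limit, estimated_tokens) for the given text."""
    tokens = estimate_tokens(text)
    return tokens <= MAX_CONTEXT_TOKENS, tokens
```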
--------------------------------------------------------------------------------
/tests/test_openrouter_registry.py:
--------------------------------------------------------------------------------
```python
1 | """Tests for OpenRouter model registry functionality."""
2 |
3 | import json
4 | import os
5 | import tempfile
6 | from unittest.mock import patch
7 |
8 | import pytest
9 |
10 | from providers.registries.openrouter import OpenRouterModelRegistry
11 | from providers.shared import ModelCapabilities, ProviderType
12 |
13 |
14 | class TestOpenRouterModelRegistry:
15 | """Test cases for OpenRouter model registry."""
16 |
17 | def test_registry_initialization(self):
18 | """Test registry initializes with default config."""
19 | registry = OpenRouterModelRegistry()
20 |
21 | # Should load models from default location
22 | assert len(registry.list_models()) > 0
23 | assert len(registry.list_aliases()) > 0
24 |
25 | def test_custom_config_path(self):
26 | """Test registry with custom config path."""
27 | # Create temporary config
28 | config_data = {
29 | "models": [
30 | {
31 | "model_name": "test/model-1",
32 | "aliases": ["test1", "t1"],
33 | "context_window": 4096,
34 | "max_output_tokens": 2048,
35 | }
36 | ]
37 | }
38 |
39 | with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
40 | json.dump(config_data, f)
41 | temp_path = f.name
42 |
43 | try:
44 | registry = OpenRouterModelRegistry(config_path=temp_path)
45 | assert len(registry.list_models()) == 1
46 | assert "test/model-1" in registry.list_models()
47 | assert "test1" in registry.list_aliases()
48 | assert "t1" in registry.list_aliases()
49 | finally:
50 | os.unlink(temp_path)
51 |
52 | def test_environment_variable_override(self):
53 | """Test OPENROUTER_MODELS_CONFIG_PATH environment variable."""
54 | # Create custom config
55 | config_data = {
56 | "models": [
57 | {"model_name": "env/model", "aliases": ["envtest"], "context_window": 8192, "max_output_tokens": 4096}
58 | ]
59 | }
60 |
61 | with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
62 | json.dump(config_data, f)
63 | temp_path = f.name
64 |
65 | try:
66 | # Set environment variable
67 | original_env = os.environ.get("OPENROUTER_MODELS_CONFIG_PATH")
68 | os.environ["OPENROUTER_MODELS_CONFIG_PATH"] = temp_path
69 |
70 | # Create registry without explicit path
71 | registry = OpenRouterModelRegistry()
72 |
73 | # Should load from environment path
74 | assert "env/model" in registry.list_models()
75 | assert "envtest" in registry.list_aliases()
76 |
77 | finally:
78 | # Restore environment
79 | if original_env is not None:
80 | os.environ["OPENROUTER_MODELS_CONFIG_PATH"] = original_env
81 | else:
82 | del os.environ["OPENROUTER_MODELS_CONFIG_PATH"]
83 | os.unlink(temp_path)
84 |
85 | def test_alias_resolution(self):
86 | """Test alias resolution functionality."""
87 | registry = OpenRouterModelRegistry()
88 |
89 | # Test various aliases
90 | test_cases = [
91 | ("opus", "anthropic/claude-opus-4.1"),
92 | ("OPUS", "anthropic/claude-opus-4.1"), # Case insensitive
93 | ("sonnet", "anthropic/claude-sonnet-4.5"),
94 | ("o3", "openai/o3"),
95 | ("deepseek", "deepseek/deepseek-r1-0528"),
96 | ("mistral", "mistralai/mistral-large-2411"),
97 | ]
98 |
99 | for alias, expected_model in test_cases:
100 | config = registry.resolve(alias)
101 | assert config is not None, f"Failed to resolve alias '{alias}'"
102 | assert config.model_name == expected_model
103 |
104 | def test_direct_model_name_lookup(self):
105 | """Test looking up models by their full name."""
106 | registry = OpenRouterModelRegistry()
107 |
108 | # Should be able to look up by full model name
109 | config = registry.resolve("anthropic/claude-opus-4.1")
110 | assert config is not None
111 | assert config.model_name == "anthropic/claude-opus-4.1"
112 |
113 | config = registry.resolve("openai/o3")
114 | assert config is not None
115 | assert config.model_name == "openai/o3"
116 |
117 | def test_unknown_model_resolution(self):
118 | """Test resolution of unknown models."""
119 | registry = OpenRouterModelRegistry()
120 |
121 | # Unknown aliases should return None
122 | assert registry.resolve("unknown-alias") is None
123 | assert registry.resolve("") is None
124 | assert registry.resolve("non-existent") is None
125 |
126 | def test_model_capabilities_conversion(self):
127 | """Test that registry returns ModelCapabilities directly."""
128 | registry = OpenRouterModelRegistry()
129 |
130 | config = registry.resolve("opus")
131 | assert config is not None
132 |
133 | # Registry now returns ModelCapabilities objects directly
134 | assert config.provider == ProviderType.OPENROUTER
135 | assert config.model_name == "anthropic/claude-opus-4.1"
136 | assert config.friendly_name == "OpenRouter (anthropic/claude-opus-4.1)"
137 | assert config.context_window == 200000
138 | assert not config.supports_extended_thinking
139 |
140 | def test_duplicate_alias_detection(self):
141 | """Test that duplicate aliases are detected."""
142 | config_data = {
143 | "models": [
144 | {"model_name": "test/model-1", "aliases": ["dupe"], "context_window": 4096, "max_output_tokens": 2048},
145 | {
146 | "model_name": "test/model-2",
147 | "aliases": ["DUPE"], # Same alias, different case
148 | "context_window": 8192,
149 | "max_output_tokens": 2048,
150 | },
151 | ]
152 | }
153 |
154 | with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
155 | json.dump(config_data, f)
156 | temp_path = f.name
157 |
158 | try:
159 | with pytest.raises(ValueError, match="Duplicate alias"):
160 | OpenRouterModelRegistry(config_path=temp_path)
161 | finally:
162 | os.unlink(temp_path)
163 |
164 | def test_backwards_compatibility_max_tokens(self):
165 |         """Test that the legacy max_tokens field is rejected in favor of max_output_tokens."""
166 | config_data = {
167 | "models": [
168 | {
169 | "model_name": "test/old-model",
170 | "aliases": ["old"],
171 | "max_tokens": 16384, # Old field name should cause error
172 | "supports_extended_thinking": False,
173 | }
174 | ]
175 | }
176 |
177 | with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
178 | json.dump(config_data, f)
179 | temp_path = f.name
180 |
181 | try:
182 | with patch.dict("os.environ", {}, clear=True):
183 | with pytest.raises(ValueError, match="max_output_tokens"):
184 | OpenRouterModelRegistry(config_path=temp_path)
185 | finally:
186 | os.unlink(temp_path)
187 |
188 | def test_missing_config_file(self):
189 | """Test behavior with missing config file."""
190 | # Use a non-existent path
191 | with patch.dict("os.environ", {}, clear=True):
192 | registry = OpenRouterModelRegistry(config_path="/non/existent/path.json")
193 |
194 | # Should initialize with empty maps
195 | assert len(registry.list_models()) == 0
196 | assert len(registry.list_aliases()) == 0
197 | assert registry.resolve("anything") is None
198 |
199 | def test_invalid_json_config(self):
200 | """Test handling of invalid JSON."""
201 | with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
202 | f.write("{ invalid json }")
203 | temp_path = f.name
204 |
205 | try:
206 | registry = OpenRouterModelRegistry(config_path=temp_path)
207 | # Should handle gracefully and initialize empty
208 | assert len(registry.list_models()) == 0
209 | assert len(registry.list_aliases()) == 0
210 | finally:
211 | os.unlink(temp_path)
212 |
213 | def test_model_with_all_capabilities(self):
214 | """Test model with all capability flags."""
215 | from providers.shared import TemperatureConstraint
216 |
217 | caps = ModelCapabilities(
218 | provider=ProviderType.OPENROUTER,
219 | model_name="test/full-featured",
220 | friendly_name="OpenRouter (test/full-featured)",
221 | aliases=["full"],
222 | context_window=128000,
223 | max_output_tokens=8192,
224 | supports_extended_thinking=True,
225 | supports_system_prompts=True,
226 | supports_streaming=True,
227 | supports_function_calling=True,
228 | supports_json_mode=True,
229 | description="Fully featured test model",
230 | temperature_constraint=TemperatureConstraint.create("range"),
231 | )
232 | assert caps.context_window == 128000
233 | assert caps.supports_extended_thinking
234 | assert caps.supports_system_prompts
235 | assert caps.supports_streaming
236 | assert caps.supports_function_calling
237 | # Note: supports_json_mode is not in ModelCapabilities yet
238 |
```
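For reference, the behaviour exercised above boils down to a small lookup API. A typical interaction, using the same calls the tests make (the model id and alias below are made up for illustration):

```python
import json
import tempfile

from providers.registries.openrouter import OpenRouterModelRegistry

# Config shaped like the fixtures in the tests above; "example/my-model" is hypothetical.
config = {
    "models": [
        {
            "model_name": "example/my-model",
            "aliases": ["mymodel"],
            "context_window": 32768,
            "max_output_tokens": 4096,
        }
    ]
}

with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
    json.dump(config, f)

registry = OpenRouterModelRegistry(config_path=f.name)
caps = registry.resolve("MyModel")            # alias lookup is case-insensitive
print(caps.model_name, caps.context_window)   # -> example/my-model 32768
```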
--------------------------------------------------------------------------------
/docs/tools/docgen.md:
--------------------------------------------------------------------------------
```markdown
1 | # DocGen Tool - Comprehensive Documentation Generation
2 |
3 | **Generates comprehensive documentation with complexity analysis through workflow-driven investigation**
4 |
5 | The `docgen` tool creates thorough documentation by analyzing your code structure, understanding function complexity, and documenting gotchas and unexpected behaviors that developers need to know. This workflow tool guides Claude through systematic investigation of code functionality, architectural patterns, and documentation needs across multiple steps before generating comprehensive documentation with complexity analysis and call flow information.
6 |
7 | ## How the Workflow Works
8 |
9 | The docgen tool implements a **structured workflow** for comprehensive documentation generation:
10 |
11 | **Investigation Phase (Claude-Led):**
12 | 1. **Step 1 (Discovery)**: Claude discovers ALL files needing documentation and reports exact count
13 | 2. **Step 2+ (Documentation)**: Claude documents files one-by-one with complete coverage validation
14 | 3. **Throughout**: Claude tracks progress with counters and enforces modern documentation styles
15 | 4. **Completion**: Only when all files are documented (num_files_documented = total_files_to_document)
16 |
17 | **Documentation Generation Phase:**
18 | After Claude completes the investigation:
19 | - Complete documentation strategy with style consistency
20 | - Function/method documentation with complexity analysis
21 | - Call flow and dependency documentation
22 | - Gotchas and unexpected behavior documentation
23 | - Final polished documentation following project standards
24 |
25 | This workflow ensures methodical analysis before documentation generation, resulting in more comprehensive and valuable documentation.
26 |
27 | ## Model Recommendation
28 |
29 | Documentation generation excels with analytical models like Gemini Pro or O3, which can understand complex code relationships, identify non-obvious behaviors, and generate thorough documentation that covers gotchas and edge cases. The combination of large context windows and analytical reasoning enables generation of documentation that helps prevent integration issues and developer confusion.
30 |
31 | ## Example Prompts
32 |
33 | **Basic Usage:**
34 | ```
35 | "Use zen to generate documentation for the UserManager class"
36 | "Document the authentication module with complexity analysis using gemini pro"
37 | "Add comprehensive documentation to all methods in src/payment_processor.py"
38 | ```
39 |
40 | ## Key Features
41 |
42 | - **Systematic file-by-file approach** - Complete documentation with progress tracking and validation
43 | - **Modern documentation styles** - Enforces /// for Objective-C/Swift, /** */ for Java/JavaScript, etc.
44 | - **Complexity analysis** - Big O notation for algorithms and performance characteristics
45 | - **Call flow documentation** - Dependencies and method relationships
46 | - **Counter-based completion** - Prevents stopping until all files are documented
47 | - **Large file handling** - Systematic portion-by-portion documentation for comprehensive coverage
48 | - **Final verification scan** - Mandatory check to ensure no functions are missed
49 | - **Bug tracking** - Surfaces code issues without altering logic
50 | - **Configuration parameters** - Control complexity analysis, call flow, and inline comments
51 |
52 | ## Tool Parameters
53 |
54 | **Workflow Parameters (used during step-by-step process):**
55 | - `step`: Current step description - discovery phase (step 1) or documentation phase (step 2+)
56 | - `step_number`: Current step number in documentation sequence (required)
57 | - `total_steps`: Dynamically calculated as 1 + total_files_to_document
58 | - `next_step_required`: Whether another step is needed
59 | - `findings`: Discoveries about code structure and documentation needs (required)
60 | - `relevant_files`: Files being actively documented in current step
61 | - `num_files_documented`: Counter tracking completed files (required)
62 | - `total_files_to_document`: Total count of files needing documentation (required)
63 |
64 | **Configuration Parameters (required fields):**
65 | - `document_complexity`: Include Big O complexity analysis (default: true)
66 | - `document_flow`: Include call flow and dependency information (default: true)
67 | - `update_existing`: Update existing documentation when incorrect/incomplete (default: true)
68 | - `comments_on_complex_logic`: Add inline comments for complex algorithmic steps (default: true)
69 |
70 | ## Usage Examples
71 |
72 | **Class Documentation:**
73 | ```
74 | "Generate comprehensive documentation for the PaymentProcessor class including complexity analysis"
75 | ```
76 |
77 | **Module Documentation:**
78 | ```
79 | "Document all functions in the authentication module with call flow information"
80 | ```
81 |
82 | **API Documentation:**
83 | ```
84 | "Create documentation for the REST API endpoints in api/users.py with parameter gotchas"
85 | ```
86 |
87 | **Algorithm Documentation:**
88 | ```
89 | "Document the sorting algorithm in utils/sort.py with Big O analysis and edge cases"
90 | ```
91 |
92 | **Library Documentation:**
93 | ```
94 | "Add comprehensive documentation to the utility library with usage examples and warnings"
95 | ```
96 |
97 | ## Documentation Standards
98 |
99 | **Function/Method Documentation:**
100 | - Parameter types and descriptions
101 | - Return value documentation with types
102 | - Algorithmic complexity analysis (Big O notation)
103 | - Call flow and dependency information
104 | - Purpose and behavior explanation
105 | - Exception types and conditions
106 |
107 | **Gotchas and Edge Cases:**
108 | - Parameter combinations that produce unexpected results
109 | - Hidden dependencies on global state or environment
110 | - Order-dependent operations where sequence matters
111 | - Performance implications and bottlenecks
112 | - Thread safety considerations
113 | - Platform-specific behavior differences
114 |
115 | **Code Quality Documentation:**
116 | - Inline comments for complex logic
117 | - Design pattern explanations
118 | - Architectural decision rationale
119 | - Usage examples and best practices
120 |
121 | ## Documentation Features Generated
122 |
123 | **Complexity Analysis:**
124 | - Time complexity (Big O notation)
125 | - Space complexity when relevant
126 | - Worst-case, average-case, and best-case scenarios
127 | - Performance characteristics and bottlenecks
128 |
129 | **Call Flow Documentation:**
130 | - Which methods/functions this code calls
131 | - Which methods/functions call this code
132 | - Key dependencies and interactions
133 | - Side effects and state modifications
134 | - Data flow through functions
135 |
136 | **Gotchas Documentation:**
137 | - Non-obvious parameter interactions
138 | - Hidden state dependencies
139 | - Silent failure conditions
140 | - Resource management requirements
141 | - Version compatibility issues
142 | - Platform-specific behaviors
143 |
144 | ## Incremental Documentation Approach
145 |
146 | **Key Benefits:**
147 | - **Immediate value delivery** - Code becomes more maintainable right away
148 | - **Iterative improvement** - Pattern recognition across multiple analysis rounds
149 | - **Quality validation** - Testing documentation effectiveness during workflow
150 | - **Reduced cognitive load** - Focus on one function/method at a time
151 |
152 | **Workflow Process:**
153 | 1. **Analyze and Document**: Examine each function and immediately add documentation
154 | 2. **Continue Analyzing**: Move to next function while building understanding
155 | 3. **Refine and Standardize**: Review and improve previously added documentation
156 |
157 | ## Language Support
158 |
159 | **Modern Documentation Style Enforcement:**
160 | - **Python**: Triple-quote docstrings with type hints
161 | - **Objective-C**: /// comments
162 | - **Swift**: /// comments
163 | - **JavaScript/TypeScript**: /** */ JSDoc style
164 | - **Java**: /** */ Javadoc style
165 | - **C#**: /// XML documentation comments
166 | - **C/C++**: /// for documentation comments
167 | - **Go**: // comments above functions/types
168 | - **Rust**: /// for documentation comments
169 |
170 | ## Documentation Quality Features
171 |
172 | **Comprehensive Coverage:**
173 | - All public methods and functions
174 | - Complex private methods requiring explanation
175 | - Class and module-level documentation
176 | - Configuration and setup requirements
177 |
178 | **Developer-Focused:**
179 | - Clear explanations of non-obvious behavior
180 | - Usage examples for complex APIs
181 | - Warning about common pitfalls
182 | - Integration guidance and best practices
183 |
184 | **Maintainable Format:**
185 | - Consistent documentation style
186 | - Appropriate level of detail
187 | - Cross-references and links
188 | - Version and compatibility notes
189 |
190 | ## Best Practices
191 |
192 | - **Use systematic approach**: Tool now documents all files with progress tracking and validation
193 | - **Trust the counters**: Tool prevents premature completion until all files are documented
194 | - **Large files handled**: Tool automatically processes large files in systematic portions
195 | - **Modern styles enforced**: Tool ensures correct documentation style per language
196 | - **Configuration matters**: Enable complexity analysis and call flow for comprehensive docs
197 | - **Bug tracking**: Tool surfaces issues without altering code - review findings after completion
198 |
199 | ## When to Use DocGen vs Other Tools
200 |
201 | - **Use `docgen`** for: Creating comprehensive documentation, adding missing docs, improving existing documentation
202 | - **Use `analyze`** for: Understanding code structure without generating documentation
203 | - **Use `codereview`** for: Reviewing code quality including documentation completeness
204 | - **Use `refactor`** for: Restructuring code before documentation (cleaner code = better docs)
```
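To make the workflow parameters above concrete, a discovery step (step 1) could be reported with fields like the following. The field names come from the "Tool Parameters" section; the values are invented for illustration, and the authoritative schema lives in the tool itself:

```python
# Hypothetical step-1 payload for the docgen workflow. Field names follow the
# "Tool Parameters" section above; the values are invented for illustration.
docgen_step_1 = {
    "step": "Discovery: enumerate all files that need documentation",
    "step_number": 1,
    "total_steps": 4,                     # 1 discovery step + 3 files to document
    "next_step_required": True,
    "findings": "Found 3 undocumented modules under src/",
    "relevant_files": [],                 # populated once documentation starts
    "num_files_documented": 0,
    "total_files_to_document": 3,
    "document_complexity": True,
    "document_flow": True,
    "update_existing": True,
    "comments_on_complex_logic": True,
}
```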
--------------------------------------------------------------------------------
/clink/registry.py:
--------------------------------------------------------------------------------
```python
1 | """Configuration registry for clink CLI integrations."""
2 |
3 | from __future__ import annotations
4 |
5 | import json
6 | import logging
7 | import shlex
8 | from collections.abc import Iterable
9 | from pathlib import Path
10 |
11 | from clink.constants import (
12 | CONFIG_DIR,
13 | DEFAULT_TIMEOUT_SECONDS,
14 | INTERNAL_DEFAULTS,
15 | PROJECT_ROOT,
16 | USER_CONFIG_DIR,
17 | CLIInternalDefaults,
18 | )
19 | from clink.models import (
20 | CLIClientConfig,
21 | CLIRoleConfig,
22 | ResolvedCLIClient,
23 | ResolvedCLIRole,
24 | )
25 | from utils.env import get_env
26 | from utils.file_utils import read_json_file
27 |
28 | logger = logging.getLogger("clink.registry")
29 |
30 | CONFIG_ENV_VAR = "CLI_CLIENTS_CONFIG_PATH"
31 |
32 |
33 | class RegistryLoadError(RuntimeError):
34 | """Raised when configuration files are invalid or missing critical data."""
35 |
36 |
37 | class ClinkRegistry:
38 | """Loads CLI client definitions and exposes them for schema generation/runtime use."""
39 |
40 | def __init__(self) -> None:
41 | self._clients: dict[str, ResolvedCLIClient] = {}
42 | self._load()
43 |
44 | def _load(self) -> None:
45 | self._clients.clear()
46 | for config_path in self._iter_config_files():
47 | try:
48 | data = read_json_file(str(config_path))
49 | except json.JSONDecodeError as exc:
50 | raise RegistryLoadError(f"Invalid JSON in {config_path}: {exc}") from exc
51 |
52 | if not data:
53 | logger.debug("Skipping empty configuration file: %s", config_path)
54 | continue
55 |
56 | config = CLIClientConfig.model_validate(data)
57 | resolved = self._resolve_config(config, source_path=config_path)
58 | key = resolved.name.lower()
59 | if key in self._clients:
60 | logger.info("Overriding CLI configuration for '%s' from %s", resolved.name, config_path)
61 | else:
62 | logger.debug("Loaded CLI configuration for '%s' from %s", resolved.name, config_path)
63 | self._clients[key] = resolved
64 |
65 | if not self._clients:
66 | raise RegistryLoadError(
67 | "No CLI clients configured. Ensure conf/cli_clients contains at least one definition or set "
68 | f"{CONFIG_ENV_VAR}."
69 | )
70 |
71 | def reload(self) -> None:
72 | """Reload configurations from disk."""
73 | self._load()
74 |
75 | def list_clients(self) -> list[str]:
76 | return sorted(client.name for client in self._clients.values())
77 |
78 | def list_roles(self, cli_name: str) -> list[str]:
79 | config = self.get_client(cli_name)
80 | return sorted(config.roles.keys())
81 |
82 | def get_client(self, cli_name: str) -> ResolvedCLIClient:
83 | key = cli_name.lower()
84 | if key not in self._clients:
85 | available = ", ".join(self.list_clients())
86 | raise KeyError(f"CLI '{cli_name}' is not configured. Available clients: {available}")
87 | return self._clients[key]
88 |
89 | # ------------------------------------------------------------------
90 | # Internal helpers
91 | # ------------------------------------------------------------------
92 |
93 | def _iter_config_files(self) -> Iterable[Path]:
94 | search_paths: list[Path] = []
95 |
96 | # 1. Built-in configs
97 | search_paths.append(CONFIG_DIR)
98 |
99 | # 2. CLI_CLIENTS_CONFIG_PATH environment override (file or directory)
100 | env_path_raw = get_env(CONFIG_ENV_VAR)
101 | if env_path_raw:
102 | env_path = Path(env_path_raw).expanduser()
103 | search_paths.append(env_path)
104 |
105 | # 3. User overrides in ~/.zen/cli_clients
106 | search_paths.append(USER_CONFIG_DIR)
107 |
108 | seen: set[Path] = set()
109 |
110 | for base in search_paths:
111 | if not base:
112 | continue
113 | if base in seen:
114 | continue
115 | seen.add(base)
116 |
117 | if base.is_file() and base.suffix.lower() == ".json":
118 | yield base
119 | continue
120 |
121 | if base.is_dir():
122 | for path in sorted(base.glob("*.json")):
123 | if path.is_file():
124 | yield path
125 | else:
126 | logger.debug("Configuration path does not exist: %s", base)
127 |
128 | def _resolve_config(self, raw: CLIClientConfig, *, source_path: Path) -> ResolvedCLIClient:
129 | if not raw.name:
130 | raise RegistryLoadError(f"CLI configuration at {source_path} is missing a 'name' field")
131 |
132 | normalized_name = raw.name.strip()
133 | internal_defaults = INTERNAL_DEFAULTS.get(normalized_name.lower())
134 | if internal_defaults is None:
135 | raise RegistryLoadError(f"CLI '{raw.name}' is not supported by clink")
136 |
137 | executable = self._resolve_executable(raw, internal_defaults, source_path)
138 |
139 | internal_args = list(internal_defaults.additional_args) if internal_defaults else []
140 | config_args = list(raw.additional_args)
141 |
142 | timeout_seconds = raw.timeout_seconds or (
143 | internal_defaults.timeout_seconds if internal_defaults else DEFAULT_TIMEOUT_SECONDS
144 | )
145 |
146 | parser_name = internal_defaults.parser
147 | if not parser_name:
148 | raise RegistryLoadError(
149 | f"CLI '{raw.name}' must define a parser either in configuration or internal defaults"
150 | )
151 |
152 | runner_name = internal_defaults.runner if internal_defaults else None
153 |
154 | env = self._merge_env(raw, internal_defaults)
155 | working_dir = self._resolve_optional_path(raw.working_dir, source_path.parent)
156 | roles = self._resolve_roles(raw, internal_defaults, source_path)
157 |
158 | output_to_file = raw.output_to_file
159 |
160 | return ResolvedCLIClient(
161 | name=normalized_name,
162 | executable=executable,
163 | internal_args=internal_args,
164 | config_args=config_args,
165 | env=env,
166 | timeout_seconds=int(timeout_seconds),
167 | parser=parser_name,
168 | runner=runner_name,
169 | roles=roles,
170 | output_to_file=output_to_file,
171 | working_dir=working_dir,
172 | )
173 |
174 | def _resolve_executable(
175 | self,
176 | raw: CLIClientConfig,
177 | internal_defaults: CLIInternalDefaults | None,
178 | source_path: Path,
179 | ) -> list[str]:
180 | command = raw.command
181 | if not command:
182 | raise RegistryLoadError(f"CLI '{raw.name}' must specify a 'command' in configuration")
183 | return shlex.split(command)
184 |
185 | def _merge_env(
186 | self,
187 | raw: CLIClientConfig,
188 | internal_defaults: CLIInternalDefaults | None,
189 | ) -> dict[str, str]:
190 | merged: dict[str, str] = {}
191 | if internal_defaults and internal_defaults.env:
192 | merged.update(internal_defaults.env)
193 | merged.update(raw.env)
194 | return merged
195 |
196 | def _resolve_roles(
197 | self,
198 | raw: CLIClientConfig,
199 | internal_defaults: CLIInternalDefaults | None,
200 | source_path: Path,
201 | ) -> dict[str, ResolvedCLIRole]:
202 | roles: dict[str, CLIRoleConfig] = dict(raw.roles)
203 |
204 | default_role_prompt = internal_defaults.default_role_prompt if internal_defaults else None
205 | if "default" not in roles:
206 | roles["default"] = CLIRoleConfig(prompt_path=default_role_prompt)
207 | elif roles["default"].prompt_path is None and default_role_prompt:
208 | roles["default"].prompt_path = default_role_prompt
209 |
210 | resolved: dict[str, ResolvedCLIRole] = {}
211 | for role_name, role_config in roles.items():
212 | prompt_path_str = role_config.prompt_path or default_role_prompt
213 | if not prompt_path_str:
214 | raise RegistryLoadError(f"Role '{role_name}' for CLI '{raw.name}' must define a prompt_path")
215 | prompt_path = self._resolve_prompt_path(prompt_path_str, source_path.parent)
216 | resolved[role_name] = ResolvedCLIRole(
217 | name=role_name,
218 | prompt_path=prompt_path,
219 | role_args=list(role_config.role_args),
220 | description=role_config.description,
221 | )
222 | return resolved
223 |
224 | def _resolve_prompt_path(self, prompt_path: str, base_dir: Path) -> Path:
225 | resolved = self._resolve_path(prompt_path, base_dir)
226 | if not resolved.exists():
227 | raise RegistryLoadError(f"Prompt file not found: {resolved}")
228 | return resolved
229 |
230 | def _resolve_optional_path(self, candidate: str | None, base_dir: Path) -> Path | None:
231 | if not candidate:
232 | return None
233 | return self._resolve_path(candidate, base_dir)
234 |
235 | def _resolve_path(self, candidate: str, base_dir: Path) -> Path:
236 | path = Path(candidate)
237 | if path.is_absolute():
238 | return path
239 |
240 | candidate_path = (base_dir / path).resolve()
241 | if candidate_path.exists():
242 | return candidate_path
243 |
244 | project_relative = (PROJECT_ROOT / path).resolve()
245 | return project_relative
246 |
247 |
248 | _REGISTRY: ClinkRegistry | None = None
249 |
250 |
251 | def get_registry() -> ClinkRegistry:
252 | global _REGISTRY
253 | if _REGISTRY is None:
254 | _REGISTRY = ClinkRegistry()
255 | return _REGISTRY
256 |
```
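A minimal usage sketch of the registry above; which client names are available depends on the JSON files shipped in `conf/cli_clients`, so the `"gemini"` lookup here is illustrative:

```python
from clink.registry import get_registry

registry = get_registry()                 # lazily constructed singleton
print(registry.list_clients())            # e.g. ["claude", "codex", "gemini"]

client = registry.get_client("Gemini")    # lookup is case-insensitive
print(client.executable, client.timeout_seconds)

print(registry.list_roles("gemini"))      # always includes "default"
```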
--------------------------------------------------------------------------------
/docs/tools/testgen.md:
--------------------------------------------------------------------------------
```markdown
1 | # TestGen Tool - Comprehensive Test Generation
2 |
3 | **Generates thorough test suites with edge case coverage through workflow-driven investigation**
4 |
5 | The `testgen` tool creates comprehensive test suites by analyzing your code paths, understanding intricate dependencies, and identifying realistic edge cases and failure scenarios that need test coverage. This workflow tool guides Claude through systematic investigation of code functionality, critical paths, edge cases, and integration points across multiple steps before generating comprehensive tests with realistic failure mode analysis.
6 |
7 | ## Thinking Mode
8 |
9 | **Default is `medium` (8,192 tokens) for extended thinking models.** Use `high` for complex systems with many interactions or `max` for critical systems requiring exhaustive test coverage.
10 |
11 | ## How the Workflow Works
12 |
13 | The testgen tool implements a **structured workflow** for comprehensive test generation:
14 |
15 | **Investigation Phase (Claude-Led):**
16 | 1. **Step 1**: Claude describes the test generation plan and begins analyzing code functionality
17 | 2. **Step 2+**: Claude examines critical paths, edge cases, error handling, and integration points
18 | 3. **Throughout**: Claude tracks findings, test scenarios, and coverage gaps
19 | 4. **Completion**: Once investigation is thorough, Claude signals completion
20 |
21 | **Test Generation Phase:**
22 | After Claude completes the investigation:
23 | - Complete test scenario catalog with all edge cases
24 | - Framework-specific test generation
25 | - Realistic failure mode coverage
26 | - Final test suite with comprehensive coverage
27 |
28 | This workflow ensures methodical analysis before test generation, resulting in more thorough and valuable test suites.
29 |
30 | ## Model Recommendation
31 |
32 | Test generation excels with extended reasoning models like Gemini Pro or O3, which can analyze complex code paths, understand intricate dependencies, and identify comprehensive edge cases. The combination of large context windows and advanced reasoning enables generation of thorough test suites that cover realistic failure scenarios and integration points that shorter-context models might overlook.
33 |
34 | ## Example Prompts
35 |
36 | **Basic Usage:**
37 | ```
38 | "Use zen to generate tests for User.login() method"
39 | "Generate comprehensive tests for the sorting method in src/new_sort.py using o3"
40 | "Create tests for edge cases not already covered in our tests using gemini pro"
41 | ```
42 |
43 | ## Key Features
44 |
45 | - **Multi-agent workflow** analyzing code paths and identifying realistic failure modes
46 | - **Generates framework-specific tests** following project conventions
47 | - **Supports test pattern following** when examples are provided
48 | - **Dynamic token allocation** (25% for test examples, 75% for main code)
49 | - **Prioritizes smallest test files** for pattern detection
50 | - **Can reference existing test files**: `"Generate tests following patterns from tests/unit/"`
51 | - **Specific code coverage** - target specific functions/classes rather than testing everything
52 | - **Image support**: Test UI components, analyze visual requirements: `"Generate tests for this login form using the UI mockup screenshot"`
53 | - **Edge case identification**: Systematic discovery of boundary conditions and error states
54 | - **Realistic failure mode analysis**: Understanding what can actually go wrong in production
55 | - **Integration test support**: Tests that cover component interactions and system boundaries
56 |
57 | ## Tool Parameters
58 |
59 | **Workflow Investigation Parameters (used during step-by-step process):**
60 | - `step`: Current investigation step description (required for each step)
61 | - `step_number`: Current step number in test generation sequence (required)
62 | - `total_steps`: Estimated total investigation steps (adjustable)
63 | - `next_step_required`: Whether another investigation step is needed
64 | - `findings`: Discoveries about functionality and test scenarios (required)
65 | - `files_checked`: All files examined during investigation
66 | - `relevant_files`: Files directly needing tests (required in step 1)
67 | - `relevant_context`: Methods/functions/classes requiring test coverage
68 | - `confidence`: Confidence level in test plan completeness (exploring/low/medium/high/certain)
69 |
70 | **Initial Configuration (used in step 1):**
71 | - `prompt`: Description of what to test, testing objectives, and specific scope/focus areas (required)
72 | - `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
73 | - `test_examples`: Optional existing test files or directories to use as style/pattern reference (absolute paths)
74 | - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
75 | - `use_assistant_model`: Whether to use expert test generation phase (default: true, set to false to use Claude only)
76 |
77 | ## Usage Examples
78 |
79 | **Method-Specific Tests:**
80 | ```
81 | "Generate tests for User.login() method covering authentication success, failure, and edge cases"
82 | ```
83 |
84 | **Class Testing:**
85 | ```
86 | "Use pro to generate comprehensive tests for PaymentProcessor class with max thinking mode"
87 | ```
88 |
89 | **Following Existing Patterns:**
90 | ```
91 | "Generate tests for new authentication module following patterns from tests/unit/auth/"
92 | ```
93 |
94 | **UI Component Testing:**
95 | ```
96 | "Generate tests for this login form component using the UI mockup screenshot"
97 | ```
98 |
99 | **Algorithm Testing:**
100 | ```
101 | "Create thorough tests for the sorting algorithm in utils/sort.py, focus on edge cases and performance"
102 | ```
103 |
104 | **Integration Testing:**
105 | ```
106 | "Generate integration tests for the payment processing pipeline from order creation to completion"
107 | ```
108 |
109 | ## Test Generation Strategy
110 |
111 | **Code Path Analysis:**
112 | - Identifies all execution paths through the code
113 | - Maps conditional branches and loops
114 | - Discovers error handling paths
115 | - Analyzes state transitions
116 |
117 | **Edge Case Discovery:**
118 | - Boundary value analysis (empty, null, max values)
119 | - Invalid input scenarios
120 | - Race conditions and timing issues
121 | - Resource exhaustion cases
122 |
123 | **Failure Mode Analysis:**
124 | - External dependency failures
125 | - Network and I/O errors
126 | - Authentication and authorization failures
127 | - Data corruption scenarios
128 |
129 | **Framework Detection:**
130 | The tool automatically detects and generates tests for:
131 | - **Python**: pytest, unittest, nose2
132 | - **JavaScript**: Jest, Mocha, Jasmine, Vitest
133 | - **Java**: JUnit 4/5, TestNG, Mockito
134 | - **C#**: NUnit, MSTest, xUnit
135 | - **Swift**: XCTest
136 | - **Go**: testing package
137 | - **And more**: Adapts to project conventions
138 |
139 | ## Test Categories Generated
140 |
141 | **Unit Tests:**
142 | - Function/method behavior validation
143 | - Input/output verification
144 | - Error condition handling
145 | - State change verification
146 |
147 | **Integration Tests:**
148 | - Component interaction testing
149 | - API endpoint validation
150 | - Database integration
151 | - External service mocking
152 |
153 | **Edge Case Tests:**
154 | - Boundary conditions
155 | - Invalid inputs
156 | - Resource limits
157 | - Concurrent access
158 |
159 | **Performance Tests:**
160 | - Response time validation
161 | - Memory usage checks
162 | - Load handling
163 | - Scalability verification
164 |
165 | ## Best Practices
166 |
167 | - **Be specific about scope**: Target specific functions/classes rather than requesting tests for everything
168 | - **Provide test examples**: Include existing test files for pattern consistency
169 | - **Focus on critical paths**: Prioritize testing of business-critical functionality
170 | - **Include visual context**: Screenshots or mockups for UI component testing
171 | - **Describe testing objectives**: Explain what aspects are most important to test
172 | - **Consider test maintenance**: Request readable, maintainable test code
173 |
174 | ## Test Quality Features
175 |
176 | **Realistic Test Data:**
177 | - Generates meaningful test data that represents real-world scenarios
178 | - Avoids trivial test cases that don't add value
179 | - Creates data that exercises actual business logic
180 |
181 | **Comprehensive Coverage:**
182 | - Happy path scenarios
183 | - Error conditions and exceptions
184 | - Edge cases and boundary conditions
185 | - Integration points and dependencies
186 |
187 | **Maintainable Code:**
188 | - Clear test names that describe what's being tested
189 | - Well-organized test structure
190 | - Appropriate use of setup/teardown
191 | - Minimal test data and mocking
192 |
193 | ## Advanced Features
194 |
195 | **Pattern Following:**
196 | When test examples are provided, the tool analyzes:
197 | - Naming conventions and structure
198 | - Assertion patterns and style
199 | - Mocking and setup approaches
200 | - Test data organization
201 |
202 | **Large Context Analysis:**
203 | With models like Gemini Pro, the tool can:
204 | - Analyze extensive codebases for comprehensive test coverage
205 | - Understand complex interactions across multiple modules
206 | - Generate integration tests that span multiple components
207 |
208 | **Visual Testing:**
209 | For UI components and visual elements:
210 | - Generate tests based on visual requirements
211 | - Create accessibility testing scenarios
212 | - Test responsive design behaviors
213 |
214 | ## When to Use TestGen vs Other Tools
215 |
216 | - **Use `testgen`** for: Creating comprehensive test suites, filling test coverage gaps, testing new features
217 | - **Use `debug`** for: Diagnosing specific test failures or runtime issues
218 | - **Use `codereview`** for: Reviewing existing test quality and coverage
219 | - **Use `analyze`** for: Understanding existing test structure without generating new tests
220 |
```
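As with `docgen`, the investigation parameters can be visualised as a step payload. The sketch below uses the field names from the "Tool Parameters" section with invented values (paths are hypothetical; the tool defines the real schema):

```python
# Hypothetical step-1 payload for the testgen workflow. Field names follow the
# "Tool Parameters" section above; paths and values are invented for illustration.
testgen_step_1 = {
    "step": "Plan test generation for the login flow and inspect User.login()",
    "step_number": 1,
    "total_steps": 3,
    "next_step_required": True,
    "findings": "login() branches on success, bad password, and locked account",
    "files_checked": ["/abs/path/src/auth/user.py"],
    "relevant_files": ["/abs/path/src/auth/user.py"],   # required in step 1
    "relevant_context": ["User.login"],
    "confidence": "low",
    "prompt": "Generate pytest tests for User.login() covering auth failures",
    "test_examples": ["/abs/path/tests/unit/auth/"],    # absolute paths
    "thinking_mode": "medium",
    "use_assistant_model": True,
}
```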
--------------------------------------------------------------------------------
/utils/client_info.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Client Information Utility for MCP Server
3 |
4 | This module provides utilities to extract and format client information
5 | from the MCP protocol's clientInfo sent during initialization.
6 |
7 | It also provides friendly name mapping and caching for consistent client
8 | identification across the application.
9 | """
10 |
11 | import logging
12 | from typing import Any, Optional
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 | # Global cache for client information
17 | _client_info_cache: Optional[dict[str, Any]] = None
18 |
19 | # Mapping of known client names to friendly names
20 | # This is case-insensitive and checks if the key is contained in the client name
21 | CLIENT_NAME_MAPPINGS = {
22 | # Claude variants
23 | "claude-ai": "Claude",
24 | "claude": "Claude",
25 | "claude-desktop": "Claude",
26 | "claude-code": "Claude",
27 | "anthropic": "Claude",
28 | # Gemini variants
29 | "gemini-cli-mcp-client": "Gemini",
30 | "gemini-cli": "Gemini",
31 | "gemini": "Gemini",
32 | "google": "Gemini",
33 | # Other known clients
34 | "cursor": "Cursor",
35 | "vscode": "VS Code",
36 | "codeium": "Codeium",
37 | "copilot": "GitHub Copilot",
38 | # Generic MCP clients
39 | "mcp-client": "MCP Client",
40 | "test-client": "Test Client",
41 | }
42 |
43 | # Default friendly name when no match is found
44 | DEFAULT_FRIENDLY_NAME = "Claude"
45 |
46 |
47 | def get_friendly_name(client_name: str) -> str:
48 | """
49 | Map a client name to a friendly name.
50 |
51 | Args:
52 | client_name: The raw client name from clientInfo
53 |
54 | Returns:
55 | A friendly name for display (e.g., "Claude", "Gemini")
56 | """
57 | if not client_name:
58 | return DEFAULT_FRIENDLY_NAME
59 |
60 | # Convert to lowercase for case-insensitive matching
61 | client_name_lower = client_name.lower()
62 |
63 | # Check each mapping - using 'in' to handle partial matches
64 | for key, friendly_name in CLIENT_NAME_MAPPINGS.items():
65 | if key.lower() in client_name_lower:
66 | return friendly_name
67 |
68 | # If no match found, return the default
69 | return DEFAULT_FRIENDLY_NAME
70 |
71 |
72 | def get_cached_client_info() -> Optional[dict[str, Any]]:
73 | """
74 | Get cached client information if available.
75 |
76 | Returns:
77 | Cached client info dictionary or None
78 | """
79 | global _client_info_cache
80 | return _client_info_cache
81 |
82 |
83 | def get_client_info_from_context(server: Any) -> Optional[dict[str, Any]]:
84 | """
85 | Extract client information from the MCP server's request context.
86 |
87 | The MCP protocol sends clientInfo during initialization containing:
88 | - name: The client application name (e.g., "Claude Code", "Claude Desktop")
89 | - version: The client version string
90 |
91 | This function also adds a friendly_name field and caches the result.
92 |
93 | Args:
94 | server: The MCP server instance
95 |
96 | Returns:
97 | Dictionary with client info or None if not available:
98 | {
99 | "name": "claude-ai",
100 | "version": "1.0.0",
101 | "friendly_name": "Claude"
102 | }
103 | """
104 | global _client_info_cache
105 |
106 | # Return cached info if available
107 | if _client_info_cache is not None:
108 | return _client_info_cache
109 |
110 | try:
111 | # Try to access the request context and session
112 | if not server:
113 | return None
114 |
115 | # Check if server has request_context property
116 | request_context = None
117 | try:
118 | request_context = server.request_context
119 | except AttributeError:
120 | logger.debug("Server does not have request_context property")
121 | return None
122 |
123 | if not request_context:
124 | logger.debug("Request context is None")
125 | return None
126 |
127 | # Try to access session from request context
128 | session = None
129 | try:
130 | session = request_context.session
131 | except AttributeError:
132 | logger.debug("Request context does not have session property")
133 | return None
134 |
135 | if not session:
136 | logger.debug("Session is None")
137 | return None
138 |
139 | # Try to access client params from session
140 | client_params = None
141 | try:
142 | # The clientInfo is stored in _client_params.clientInfo
143 | client_params = session._client_params
144 | except AttributeError:
145 | logger.debug("Session does not have _client_params property")
146 | return None
147 |
148 | if not client_params:
149 | logger.debug("Client params is None")
150 | return None
151 |
152 | # Try to extract clientInfo
153 | client_info = None
154 | try:
155 | client_info = client_params.clientInfo
156 | except AttributeError:
157 | logger.debug("Client params does not have clientInfo property")
158 | return None
159 |
160 | if not client_info:
161 | logger.debug("Client info is None")
162 | return None
163 |
164 | # Extract name and version
165 | result = {}
166 |
167 | try:
168 | result["name"] = client_info.name
169 | except AttributeError:
170 | logger.debug("Client info does not have name property")
171 |
172 | try:
173 | result["version"] = client_info.version
174 | except AttributeError:
175 | logger.debug("Client info does not have version property")
176 |
177 | if not result:
178 | return None
179 |
180 | # Add friendly name
181 | raw_name = result.get("name", "")
182 | result["friendly_name"] = get_friendly_name(raw_name)
183 |
184 | # Cache the result
185 | _client_info_cache = result
186 | logger.debug(f"Cached client info: {result}")
187 |
188 | return result
189 |
190 | except Exception as e:
191 | logger.debug(f"Error extracting client info: {e}")
192 | return None
193 |
194 |
195 | def format_client_info(client_info: Optional[dict[str, Any]], use_friendly_name: bool = True) -> str:
196 | """
197 | Format client information for display.
198 |
199 | Args:
200 | client_info: Dictionary with client info or None
201 | use_friendly_name: If True, use the friendly name instead of raw name
202 |
203 | Returns:
204 | Formatted string like "Claude v1.0.0" or "Claude"
205 | """
206 | if not client_info:
207 | return DEFAULT_FRIENDLY_NAME
208 |
209 | if use_friendly_name:
210 | name = client_info.get("friendly_name", client_info.get("name", DEFAULT_FRIENDLY_NAME))
211 | else:
212 | name = client_info.get("name", "Unknown")
213 |
214 | version = client_info.get("version", "")
215 |
216 | if version and not use_friendly_name:
217 | return f"{name} v{version}"
218 | else:
219 | # For friendly names, we just return the name without version
220 | return name
221 |
222 |
223 | def get_client_friendly_name() -> str:
224 | """
225 | Get the cached client's friendly name.
226 |
227 | This is a convenience function that returns just the friendly name
228 | from the cached client info, defaulting to "Claude" if not available.
229 |
230 | Returns:
231 | The friendly name (e.g., "Claude", "Gemini")
232 | """
233 | cached_info = get_cached_client_info()
234 | if cached_info:
235 | return cached_info.get("friendly_name", DEFAULT_FRIENDLY_NAME)
236 | return DEFAULT_FRIENDLY_NAME
237 |
238 |
239 | def log_client_info(server: Any, logger_instance: Optional[logging.Logger] = None) -> None:
240 | """
241 | Log client information extracted from the server.
242 |
243 | Args:
244 | server: The MCP server instance
245 | logger_instance: Optional logger to use (defaults to module logger)
246 | """
247 | log = logger_instance or logger
248 |
249 | client_info = get_client_info_from_context(server)
250 | if client_info:
251 | # Log with both raw and friendly names for debugging
252 | raw_name = client_info.get("name", "Unknown")
253 | friendly_name = client_info.get("friendly_name", DEFAULT_FRIENDLY_NAME)
254 | version = client_info.get("version", "")
255 |
256 | if raw_name != friendly_name:
257 | log.info(f"MCP Client Connected: {friendly_name} (raw: {raw_name} v{version})")
258 | else:
259 | log.info(f"MCP Client Connected: {friendly_name} v{version}")
260 |
261 | # Log to activity logger as well
262 | try:
263 | activity_logger = logging.getLogger("mcp_activity")
264 | activity_logger.info(f"CLIENT_IDENTIFIED: {friendly_name} (name={raw_name}, version={version})")
265 | except Exception:
266 | pass
267 | else:
268 | log.debug("Could not extract client info from MCP protocol")
269 |
270 |
271 | # Example usage in tools:
272 | #
273 | # from utils.client_info import get_client_friendly_name, get_cached_client_info
274 | #
275 | # # In a tool's execute method:
276 | # def execute(self, arguments: dict[str, Any]) -> list[TextContent]:
277 | # # Get the friendly name of the connected client
278 | # client_name = get_client_friendly_name() # Returns "Claude" or "Gemini" etc.
279 | #
280 | # # Or get full cached info if needed
281 | # client_info = get_cached_client_info()
282 | # if client_info:
283 | # raw_name = client_info['name'] # e.g., "claude-ai"
284 | # version = client_info['version'] # e.g., "1.0.0"
285 | # friendly = client_info['friendly_name'] # e.g., "Claude"
286 | #
287 | # # Customize response based on client
288 | # if client_name == "Claude":
289 | # response = f"Hello from Zen MCP Server to {client_name}!"
290 | # elif client_name == "Gemini":
291 | # response = f"Greetings {client_name}, welcome to Zen MCP Server!"
292 | # else:
293 | # response = f"Welcome {client_name}!"
294 |
```
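Because `CLIENT_NAME_MAPPINGS` is matched by case-insensitive substring, a few quick examples follow directly from the code above:

```python
from utils.client_info import get_friendly_name

assert get_friendly_name("Claude-Code/2.0") == "Claude"        # matches "claude"
assert get_friendly_name("gemini-cli-mcp-client") == "Gemini"  # matches the Gemini entry
assert get_friendly_name("some-unknown-tool") == "Claude"      # falls back to the default
```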
--------------------------------------------------------------------------------
/docs/tools/clink.md:
--------------------------------------------------------------------------------
```markdown
1 | # Clink Tool - CLI-to-CLI Bridge
2 |
3 | **Spawn AI subagents, connect external CLIs, orchestrate isolated contexts – all without leaving your session**
4 |
5 | The `clink` tool transforms your CLI into a multi-agent orchestrator. Launch isolated Codex instances from _within_ Codex, delegate to Gemini's 1M context, or run specialized Claude agents—all while preserving conversation continuity. Instead of context-switching or token bloat, spawn fresh subagents that handle complex tasks in isolation and return only the results you need.
6 |
7 | > **CAUTION**: Clink launches real CLI agents with relaxed permission flags (Gemini ships with `--yolo`, Codex with `--dangerously-bypass-approvals-and-sandbox`, Claude with `--permission-mode acceptEdits`) so they can edit files and run tools autonomously via MCP. If that’s more access than you want, remove those flags—the CLI can still open/read files and report findings, it just won’t auto-apply edits. You can also tighten role prompts or system prompts with stop-words/guardrails, or disable clink entirely. Otherwise, keep the shipped presets confined to workspaces you fully trust.
8 |
9 | ## Why Use Clink (CLI + Link)?
10 |
11 | ### Codex-within-Codex: The Ultimate Context Management
12 |
13 | **The Problem**: You're deep in a Codex session debugging authentication. Now you need a comprehensive security audit, but that'll consume 50K tokens of context you can't spare.
14 |
15 | **The Solution**: Spawn a fresh Codex subagent in an isolated context:
16 | ```bash
17 | clink with codex codereviewer to audit auth/ for OWASP Top 10 vulnerabilities
18 | ```
19 |
20 | The subagent:
21 | - Launches in a **pristine context** with full token budget
22 | - Performs deep analysis using its own MCP tools and web search
23 | - Returns **only the final security report** (not intermediate steps)
24 | - Your main session stays **laser-focused** on debugging
25 |
26 | **Works with any supported CLI**: Codex can spawn Codex / Claude Code / Gemini CLI subagents, or mix and match between different CLIs.
27 |
28 | ---
29 |
30 | ### Cross-CLI Orchestration
31 |
32 | **Scenario 1**: You're in Codex and need Gemini's 1M context window to analyze a massive legacy codebase.
33 |
34 | **Without clink**: Open new terminal → run `gemini` → lose conversation context → manually copy/paste findings → context mismatch hell.
35 |
36 | **With clink**: `"clink with gemini to map dependencies across this 500-file monorepo"` – Gemini processes, returns insights, conversation flows seamlessly.
37 |
38 | **Scenario 2**: Use [`consensus`](consensus.md) to debate features with multiple models, then hand off to Gemini for implementation.
39 |
40 | ```
41 | "Use consensus with pro and gpt5 to decide whether to add dark mode or offline support next"
42 | [consensus runs, models deliberate, recommendation emerges]
43 |
44 | Use continuation with clink - implement the recommended feature
45 | ```
46 |
47 | Gemini receives the full conversation context from `consensus` including the consensus prompt + replies, understands the chosen feature, technical constraints discussed, and can start implementation immediately. No re-explaining, no context loss - true conversation continuity across tools and models.
48 |
49 | ## Key Features
50 |
51 | - **Stay in one CLI**: No switching between terminal sessions or losing context
52 | - **Full conversation continuity**: Gemini's responses participate in the same conversation thread
53 | - **Role-based prompts**: Pre-configured roles for planning, code review, or general questions
54 | - **Full CLI capabilities**: Gemini can use its own web search, file tools, and latest features
55 | - **Token efficiency**: File references (not full content) to conserve tokens
56 | - **Cross-tool collaboration**: Combine with other Zen tools like `planner` → `clink` → `codereview`
57 | - **Free tier available**: Gemini offers 1,000 requests/day free with a personal Google account - great for cost savings across tools
58 |
59 | ## Available Roles
60 |
61 | **Default Role** - General questions, summaries, quick answers
62 | ```
63 | Use clink to ask gemini about the latest React 19 features
64 | ```
65 |
66 | **Planner Role** - Strategic planning with multi-phase approach
67 | ```
68 | clink with gemini with planner role to map out our microservices migration strategy
69 | ```
70 |
71 | **Code Reviewer Role** - Focused code analysis with severity levels
72 | ```
73 | Use clink codereviewer role to review auth.py for security issues
74 | ```
75 |
76 | You can make your own custom roles in `conf/cli_clients/` or tweak any of the shipped presets.
77 |
78 | ## Tool Parameters
79 |
80 | - `prompt`: Your question or task for the external CLI (required)
81 | - `cli_name`: Which CLI to use - `gemini` (default), `claude`, `codex`, or add your own in `conf/cli_clients/`
82 | - `role`: Preset role - `default`, `planner`, `codereviewer` (default: `default`)
83 | - `files`: Optional file paths for context (references only, CLI opens files itself)
84 | - `images`: Optional image paths for visual context
85 | - `continuation_id`: Continue previous clink conversations
86 |
87 | ## Usage Examples
88 |
89 | **Architecture Planning:**
90 | ```
91 | Use clink with gemini planner to design a 3-phase rollout plan for our feature flags system
92 | ```
93 |
94 | **Code Review with Context:**
95 | ```
96 | clink to gemini codereviewer: Review payment_service.py for race conditions and concurrency issues
97 | ```
98 |
99 | **Codex Code Review:**
100 | ```
101 | "clink with codex cli and perform a full code review using the codereview role"
102 | ```
103 |
104 | **Quick Research Question:**
105 | ```
106 | "Ask gemini via clink: What are the breaking changes in TypeScript 5.5?"
107 | ```
108 |
109 | **Multi-Tool Workflow:**
110 | ```
111 | "Use planner to outline the refactor, then clink gemini planner for validation,
112 | then codereview to verify the implementation"
113 | ```
114 |
115 | **Leveraging Gemini's Web Search:**
116 | ```
117 | "Clink gemini to research current best practices for Kubernetes autoscaling in 2025"
118 | ```
119 |
120 | ## How Clink Works
121 |
122 | 1. **Your request** - You ask your current CLI to use `clink` with a specific CLI and role
123 | 2. **Background execution** - Zen spawns the configured CLI (e.g., `gemini --output-format json`)
124 | 3. **Context forwarding** - Your prompt, file references, and relevant conversation history are bundled into the prompt sent to the spawned CLI
125 | 4. **CLI processing** - Gemini (or other CLI) uses its own tools: web search, file access, thinking modes
126 | 5. **Seamless return** - Results flow back into your conversation with full context preserved
127 | 6. **Continuation support** - Future tools and models can build on the CLI's findings via [context revival](../context-revival.md) within Zen.
128 |
129 | ## Best Practices
130 |
131 | - **Pre-authenticate CLIs**: Install and authenticate each CLI you plan to call (e.g., `npm install -g @google/gemini-cli` for Gemini)
132 | - **Choose appropriate roles**: Use `planner` for strategy, `codereviewer` for code, `default` for general questions
133 | - **Leverage CLI strengths**: Gemini's 1M context for large codebases, web search for current docs
134 | - **Combine with Zen tools**: Chain `clink` with `planner`, `codereview`, `debug` for powerful workflows
135 | - **File efficiency**: Pass file paths, let the CLI decide what to read (saves tokens)
136 |
137 | ## Configuration
138 |
139 | Clink configurations live in `conf/cli_clients/`. We ship presets for the supported CLIs:
140 |
141 | - `gemini.json` – runs `gemini --telemetry false --yolo -o json`
142 | - `claude.json` – runs `claude --print --output-format json --permission-mode acceptEdits --model sonnet`
143 | - `codex.json` – runs `codex exec --json --dangerously-bypass-approvals-and-sandbox`
144 |
145 | > **CAUTION**: These flags intentionally bypass each CLI's safety prompts so they can edit files or launch tools autonomously via MCP. Only enable them in trusted sandboxes and tailor role prompts or CLI configs if you need more guardrails.
146 |
147 | Each preset points to role-specific prompts in `systemprompts/clink/`. Duplicate those files to add more roles or adjust CLI flags.
148 |
149 | > **Why `--yolo` for Gemini?** The Gemini CLI currently requires automatic approvals to execute its own tools (for example `run_shell_command`). Without the flag it errors with `Tool "run_shell_command" not found in registry`. See [issue #5382](https://github.com/google-gemini/gemini-cli/issues/5382) for more details.
150 |
151 | **Adding new CLIs**: Drop a JSON config into `conf/cli_clients/`, create role prompts in `systemprompts/clink/`, and register a parser/agent if the CLI outputs a new format.
152 |
153 | ## When to Use Clink vs Other Tools
154 |
155 | - **Use `clink`** for: Leveraging external CLI capabilities (Gemini's web search, 1M context), specialized CLI features, cross-CLI collaboration
156 | - **Use `chat`** for: Direct model-to-model conversations within Zen
157 | - **Use `planner`** for: Zen's native planning workflows with step validation
158 | - **Use `codereview`** for: Zen's structured code review with severity levels
159 |
160 | ## Setup Requirements
161 |
162 | Ensure the relevant CLI is installed and configured:
163 |
164 | - [Claude Code](https://www.anthropic.com/claude-code)
165 | - [Gemini CLI](https://github.com/google-gemini/gemini-cli)
166 | - [Codex CLI](https://github.com/openai/codex)
167 |
168 | ## Related Guides
169 |
170 | - [Chat Tool](chat.md) - Direct model conversations
171 | - [Planner Tool](planner.md) - Zen's native planning workflows
172 | - [CodeReview Tool](codereview.md) - Structured code reviews
173 | - [Context Revival](../context-revival.md) - Continuing conversations across tools
174 | - [Advanced Usage](../advanced-usage.md) - Complex multi-tool workflows
175 |
```
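
As a rough illustration of the flow described in "How Clink Works", the sketch below spawns the Gemini preset command listed in the Configuration section and extracts its reply. It is not the actual clink agent implementation: how the prompt is delivered (here via stdin) and the `"response"` key in the JSON output are assumptions; only the command-line flags come from the preset above.

```python
import json
import subprocess


def run_gemini_once(prompt: str) -> str:
    """Illustrative sketch only - not the actual clink agent code."""
    # Mirrors the shipped preset: gemini --telemetry false --yolo -o json
    cmd = ["gemini", "--telemetry", "false", "--yolo", "-o", "json"]
    result = subprocess.run(cmd, input=prompt, capture_output=True, text=True, check=True)

    payload = json.loads(result.stdout)
    # The top-level key holding the reply depends on the CLI's JSON schema;
    # "response" is an assumption here.
    return payload.get("response", result.stdout)
```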
--------------------------------------------------------------------------------
/systemprompts/precommit_prompt.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Precommit tool system prompt
3 | """
4 |
5 | PRECOMMIT_PROMPT = """
6 | ROLE
7 | You are an expert pre-commit reviewer and senior engineering partner,
8 | conducting a pull-request style review as the final gatekeeper for
9 | production code.
10 | As a polyglot programming expert with an encyclopedic knowledge of design patterns,
11 | anti-patterns, and language-specific idioms, your responsibility goes beyond
12 | surface-level correctness to rigorous, predictive analysis. Your review must
13 | assess whether the changes:
14 | - Introduce patterns or decisions that may become future technical debt.
15 | - Create brittle dependencies or tight coupling that will hinder maintenance.
16 | - Omit critical validation, error handling, or test scaffolding that will
17 | cause future failures.
18 | - Interact negatively with other parts of the codebase, even those not
19 | directly touched.
20 |
21 | Your task is to perform rigorous mental static analysis, simulating how new
22 | inputs and edge cases flow through the changed code to predict failures. Think
23 | like an engineer responsible for this code months from now, debugging a
24 | production incident.
25 |
26 | In addition to reviewing correctness, completeness, and quality of the change,
27 | apply long-term architectural thinking. Your feedback helps ensure this code
28 | won't cause silent regressions, developer confusion, or downstream side effects
29 | later.
30 |
31 | CRITICAL LINE NUMBER INSTRUCTIONS
32 | Code is presented with line number markers "LINE│ code". These markers are for
33 | reference ONLY and MUST NOT be included in any code you generate.
34 | Always reference specific line numbers in your replies to locate exact
35 | positions. Include a very short code excerpt alongside each finding for clarity.
36 | Never include "LINE│" markers in generated code snippets.
37 |
38 | INPUTS PROVIDED
39 | 1. Git diff (staged or branch comparison)
40 | 2. Original request / acceptance criteria or context around what changed
41 | 3. File names and related code
42 |
43 | SCOPE & FOCUS
44 | - Review ONLY the changes in the diff and their immediate context.
45 | - Reconstruct what changed, why it was changed, and what outcome it is supposed to deliver.
46 | - Classify the diff (bug fix, improvement, new feature, refactor, etc.) and
47 | confirm the implementation matches that intent.
48 | - If the change is a bug fix, determine whether it addresses the root cause and
49 | whether a materially safer or more maintainable fix was available.
50 | - Evaluate whether the change achieves its stated goals without introducing
51 | regressions, especially when new methods, public APIs, or behavioral fixes are
52 | involved.
53 | - Assess potential repercussions: downstream consumers, compatibility
54 | contracts, documentation, dependencies, and operational impact.
55 | - Anchor every observation in the provided request, commit message, tests, and
56 | diff evidence; avoid speculation beyond available context.
57 | - Surface any assumptions or missing context explicitly. If clarity is
58 | impossible without more information, use the structured response to request it.
59 | - Ensure the changes correctly implement the request and are secure, performant, and maintainable.
60 | - Do not propose broad refactors or unrelated improvements. Stay strictly within the boundaries of the provided changes.
61 |
62 | REVIEW PROCESS & MENTAL MODEL
63 | 1. **Identify Context:** Note the tech stack, frameworks, and existing patterns.
64 | 2. **Infer Intent & Change Type:** Determine what changed, why it changed, how
65 | it is expected to behave, and categorize it (bug fix, feature, improvement,
66 | refactor, etc.). Tie this back to the stated request, commit message, and
67 | available tests so conclusions stay grounded; for bug fixes, confirm the root
68 | cause is resolved and note if a materially better remedy exists.
69 | 3. **Perform Deep Static Analysis of the Diff:**
70 | - **Verify Objectives:** Confirm the modifications actually deliver the
71 | intended behavior and align with the inferred goals.
72 | - **Trace Data Flow:** Follow variables and data structures through the
73 | new/modified logic.
74 | - **Simulate Edge Cases:** Mentally test with `null`/`nil`, empty
75 | collections, zero, negative numbers, and extremely large values.
76 | - **Assess Side Effects:** Consider the impact on callers, downstream
77 | consumers, and shared state (e.g., databases, caches).
78 | 4. **Assess Ripple Effects:** Identify compatibility shifts, documentation
79 | impacts, regression risks, and untested surfaces introduced by the change.
80 | 5. **Prioritize Issues:** Detect and rank issues by severity (CRITICAL → HIGH → MEDIUM → LOW).
81 | 6. **Recommend Fixes:** Provide specific, actionable solutions for each issue.
82 | 7. **Acknowledge Positives:** Reinforce sound patterns and well-executed code.
83 | 8. **Avoid Over-engineering:** Do not suggest solutions that add unnecessary
84 | complexity for hypothetical future problems.
85 |
86 | CORE ANALYSIS (Applied to the diff)
87 | - **Security:** Does this change introduce injection risks, auth flaws, data
88 | exposure, or unsafe dependencies?
89 | - **Bugs & Logic Errors:** Does this change introduce off-by-one errors, null
90 | dereferences, incorrect logic, or race conditions?
91 | - **Performance:** Does this change introduce inefficient loops, blocking I/O on
92 | critical paths, or resource leaks?
93 | - **Code Quality:** Does this change add unnecessary complexity, duplicate logic
94 | (DRY), or violate architectural principles (SOLID)?
95 |
96 | ADDITIONAL ANALYSIS (only when relevant)
97 | - Language/runtime concerns – memory management, concurrency, exception
98 | handling
99 | - Carefully assess the code's context and purpose before raising
100 | concurrency-related concerns. Confirm the presence of shared state, race
101 | conditions, or unsafe access patterns before flagging any issues to avoid
102 | false positives.
103 | - Also carefully evaluate concurrency and parallelism risks only after
104 | confirming that the code runs in an environment where such concerns are
105 | applicable. Avoid flagging issues unless shared state, asynchronous
106 | execution, or multi-threaded access are clearly possible based on
107 | context.
108 | - System/integration – config handling, external calls, operational impact
109 | - Testing – coverage gaps for new logic
110 | - If no tests are found in the project, do not flag test coverage as an issue unless the change introduces logic
111 | that is high-risk or complex.
112 | - In such cases, offer a low-severity suggestion encouraging basic tests, rather than marking it as a required fix.
113 | - Change-specific pitfalls – unused new functions, partial enum updates, scope creep, risky deletions
114 | - Determine if there are any new dependencies added but not declared, or new functionality added but not used
115 | - Determine unintended side effects: could changes in file_A break module_B even if module_B wasn't changed?
116 | - Flag changes unrelated to the original request that may introduce needless complexity or an anti-pattern
117 | - Determine if there are code removal risks: was removed code truly dead, or could removal break functionality?
118 | - Missing documentation around new methods / parameters, or missing comments around complex logic and code that
119 | requires it
120 |
121 | OUTPUT FORMAT
122 |
123 | ### Repository Summary
124 | **Repository:** /path/to/repo
125 | - Files changed: X
126 | - Overall assessment: brief statement with critical issue count
127 |
128 | MANDATORY: You must ONLY respond in the following format. List issues by
129 | severity and include ONLY the severities that apply:
130 |
131 | [CRITICAL] Short title
132 | - File: /absolute/path/to/file.py:line
133 | - Description: what & why
134 | - Fix: specific change (code snippet if helpful)
135 |
136 | [HIGH] ...
137 |
138 | [MEDIUM] ...
139 |
140 | [LOW] ...
141 |
142 | GIVE RECOMMENDATIONS:
143 | Make a final, short, and focused statement or bullet list:
144 | - Top priority fixes that MUST IMMEDIATELY be addressed before commit
145 | - Notable positives to retain
146 |
147 | Be thorough yet actionable. Focus on the diff, map every issue to a concrete
148 | fix, and keep comments aligned with the stated implementation goals. Your goal
149 | is to help flag anything that could potentially slip through and break
150 | critical, production quality code.
151 |
152 | STRUCTURED RESPONSES FOR SPECIAL CASES
153 | To ensure predictable interactions, use the following JSON formats for specific
154 | scenarios. Your entire response in these cases must be the JSON object and
155 | nothing else.
156 |
157 | 1. IF MORE INFORMATION IS NEEDED
158 | If you need additional context (e.g., related files, configuration,
159 | dependencies) to provide a complete and accurate review, you MUST respond ONLY
160 | with this JSON format (and nothing else). Do NOT ask for the same file you've
161 | been provided unless its content is missing or incomplete:
162 | {
163 | "status": "files_required_to_continue",
164 | "mandatory_instructions": "<your critical instructions for the agent>",
165 | "files_needed": ["[file name here]", "[or some folder/]"]
166 | }
167 |
168 | 2. IF SCOPE TOO LARGE FOR FOCUSED REVIEW
169 | If the codebase is too large or complex to review effectively in a single
170 | response, you MUST request the agent to provide smaller, more focused subsets
171 | for review. Respond ONLY with this JSON format (and nothing else):
172 | {
173 | "status": "focused_review_required",
174 | "reason": "<brief explanation of why the scope is too large>",
175 | "suggestion": "<e.g., 'Review authentication module (auth.py, login.py)' or
176 | 'Focus on data layer (models/)' or
177 | 'Review payment processing functionality'>"
178 | }
179 | """
180 |
```
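
The two structured-response formats defined at the end of the prompt are what the calling tool keys on. Below is a minimal sketch of how a caller might branch on them; the helper function, its return shape, and the error handling are hypothetical, and only the `status` values and field names come from the prompt itself.

```python
import json


def interpret_precommit_reply(reply_text: str) -> dict:
    """Hypothetical helper: detect the special JSON statuses defined in PRECOMMIT_PROMPT."""
    try:
        payload = json.loads(reply_text)
    except (json.JSONDecodeError, TypeError):
        # Not JSON, so treat it as a normal severity-ordered review.
        return {"kind": "review", "text": reply_text}

    status = payload.get("status")
    if status == "files_required_to_continue":
        return {"kind": "needs_files", "files": payload.get("files_needed", [])}
    if status == "focused_review_required":
        return {"kind": "narrow_scope", "suggestion": payload.get("suggestion", "")}
    return {"kind": "review", "text": reply_text}
```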
--------------------------------------------------------------------------------
/simulator_tests/test_xai_models.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | X.AI GROK Model Tests
4 |
5 | Tests that verify X.AI GROK functionality including:
6 | - Model alias resolution (grok, grok3, grokfast map to actual GROK models)
7 | - GROK-3 and GROK-3-fast models work correctly
8 | - Conversation continuity works with GROK models
9 | - API integration and response validation
10 | """
11 |
12 |
13 | from .base_test import BaseSimulatorTest
14 |
15 |
16 | class XAIModelsTest(BaseSimulatorTest):
17 | """Test X.AI GROK model functionality and integration"""
18 |
19 | @property
20 | def test_name(self) -> str:
21 | return "xai_models"
22 |
23 | @property
24 | def test_description(self) -> str:
25 | return "X.AI GROK model functionality and integration"
26 |
27 | def run_test(self) -> bool:
28 | """Test X.AI GROK model functionality"""
29 | try:
30 | self.logger.info("Test: X.AI GROK model functionality and integration")
31 |
32 | # Check if X.AI API key is configured and not empty
33 | import os
34 |
35 | xai_key = os.environ.get("XAI_API_KEY", "")
36 | is_valid = bool(xai_key and xai_key != "your_xai_api_key_here" and xai_key.strip())
37 |
38 | if not is_valid:
39 | self.logger.info(" ⚠️ X.AI API key not configured or empty - skipping test")
40 | self.logger.info(" ℹ️ This test requires XAI_API_KEY to be set in .env with a valid key")
41 | return True # Return True to indicate test is skipped, not failed
42 |
43 | # Setup test files for later use
44 | self.setup_test_files()
45 |
46 | # Test 1: 'grok' alias (should map to grok-4)
47 | self.logger.info(" 1: Testing 'grok' alias (should map to grok-4)")
48 |
49 | response1, continuation_id = self.call_mcp_tool(
50 | "chat",
51 | {
52 | "prompt": "Say 'Hello from GROK model!' and nothing else.",
53 | "model": "grok",
54 | "temperature": 0.1,
55 | },
56 | )
57 |
58 | if not response1:
59 | self.logger.error(" ❌ GROK alias test failed")
60 | return False
61 |
62 | self.logger.info(" ✅ GROK alias call completed")
63 | if continuation_id:
64 | self.logger.info(f" ✅ Got continuation_id: {continuation_id}")
65 |
66 | # Test 2: Direct grok-3 model name
67 | self.logger.info(" 2: Testing direct model name (grok-3)")
68 |
69 | response2, _ = self.call_mcp_tool(
70 | "chat",
71 | {
72 | "prompt": "Say 'Hello from GROK-3!' and nothing else.",
73 | "model": "grok-3",
74 | "temperature": 0.1,
75 | },
76 | )
77 |
78 | if not response2:
79 | self.logger.error(" ❌ Direct GROK-3 model test failed")
80 | return False
81 |
82 | self.logger.info(" ✅ Direct GROK-3 model call completed")
83 |
84 | # Test 3: grok-3-fast model
85 | self.logger.info(" 3: Testing GROK-3-fast model")
86 |
87 | response3, _ = self.call_mcp_tool(
88 | "chat",
89 | {
90 | "prompt": "Say 'Hello from GROK-3-fast!' and nothing else.",
91 | "model": "grok-3-fast",
92 | "temperature": 0.1,
93 | },
94 | )
95 |
96 | if not response3:
97 | self.logger.error(" ❌ GROK-3-fast model test failed")
98 | return False
99 |
100 | self.logger.info(" ✅ GROK-3-fast model call completed")
101 |
102 | # Test 4: Shorthand aliases
103 | self.logger.info(" 4: Testing shorthand aliases (grok3, grokfast)")
104 |
105 | response4, _ = self.call_mcp_tool(
106 | "chat",
107 | {
108 | "prompt": "Say 'Hello from grok3 alias!' and nothing else.",
109 | "model": "grok3",
110 | "temperature": 0.1,
111 | },
112 | )
113 |
114 | if not response4:
115 | self.logger.error(" ❌ grok3 alias test failed")
116 | return False
117 |
118 | response5, _ = self.call_mcp_tool(
119 | "chat",
120 | {
121 | "prompt": "Say 'Hello from grokfast alias!' and nothing else.",
122 | "model": "grokfast",
123 | "temperature": 0.1,
124 | },
125 | )
126 |
127 | if not response5:
128 | self.logger.error(" ❌ grokfast alias test failed")
129 | return False
130 |
131 | self.logger.info(" ✅ Shorthand aliases work correctly")
132 |
133 | # Test 5: Conversation continuity with GROK models
134 | self.logger.info(" 5: Testing conversation continuity with GROK")
135 |
136 | response6, new_continuation_id = self.call_mcp_tool(
137 | "chat",
138 | {
139 | "prompt": "Remember this number: 87. What number did I just tell you?",
140 | "model": "grok",
141 | "temperature": 0.1,
142 | },
143 | )
144 |
145 | if not response6 or not new_continuation_id:
146 | self.logger.error(" ❌ Failed to start conversation with continuation_id")
147 | return False
148 |
149 | # Continue the conversation
150 | response7, _ = self.call_mcp_tool(
151 | "chat",
152 | {
153 | "prompt": "What was the number I told you earlier?",
154 | "model": "grok",
155 | "continuation_id": new_continuation_id,
156 | "temperature": 0.1,
157 | },
158 | )
159 |
160 | if not response7:
161 | self.logger.error(" ❌ Failed to continue conversation")
162 | return False
163 |
164 | # Check if the model remembered the number
165 | if "87" in response7:
166 | self.logger.info(" ✅ Conversation continuity working with GROK")
167 | else:
168 | self.logger.warning(" ⚠️ Model may not have remembered the number")
169 |
170 | # Test 6: Validate X.AI API usage from logs
171 | self.logger.info(" 6: Validating X.AI API usage in logs")
172 | logs = self.get_recent_server_logs()
173 |
174 | # Check for X.AI API calls
175 | xai_logs = [line for line in logs.split("\n") if "x.ai" in line.lower()]
176 | xai_api_logs = [line for line in logs.split("\n") if "api.x.ai" in line]
177 | grok_logs = [line for line in logs.split("\n") if "grok" in line.lower()]
178 |
179 | # Check for specific model resolution
180 | grok_resolution_logs = [
181 | line
182 | for line in logs.split("\n")
183 | if ("Resolved model" in line and "grok" in line.lower()) or ("grok" in line and "->" in line)
184 | ]
185 |
186 | # Check for X.AI provider usage
187 | xai_provider_logs = [line for line in logs.split("\n") if "XAI" in line or "X.AI" in line]
188 |
189 | # Log findings
190 | self.logger.info(f" X.AI-related logs: {len(xai_logs)}")
191 | self.logger.info(f" X.AI API logs: {len(xai_api_logs)}")
192 | self.logger.info(f" GROK-related logs: {len(grok_logs)}")
193 | self.logger.info(f" Model resolution logs: {len(grok_resolution_logs)}")
194 | self.logger.info(f" X.AI provider logs: {len(xai_provider_logs)}")
195 |
196 | # Sample log output for debugging
197 | if self.verbose and xai_logs:
198 | self.logger.debug(" 📋 Sample X.AI logs:")
199 | for log in xai_logs[:3]:
200 | self.logger.debug(f" {log}")
201 |
202 | if self.verbose and grok_logs:
203 | self.logger.debug(" 📋 Sample GROK logs:")
204 | for log in grok_logs[:3]:
205 | self.logger.debug(f" {log}")
206 |
207 | # Success criteria
208 | grok_mentioned = len(grok_logs) > 0
209 | api_used = len(xai_api_logs) > 0 or len(xai_logs) > 0
210 | provider_used = len(xai_provider_logs) > 0
211 |
212 | success_criteria = [
213 | ("GROK models mentioned in logs", grok_mentioned),
214 | ("X.AI API calls made", api_used),
215 | ("X.AI provider used", provider_used),
216 | ("All model calls succeeded", True), # We already checked this above
217 | ("Conversation continuity works", True), # We already tested this
218 | ]
219 |
220 | passed_criteria = sum(1 for _, passed in success_criteria if passed)
221 | self.logger.info(f" Success criteria met: {passed_criteria}/{len(success_criteria)}")
222 |
223 | for criterion, passed in success_criteria:
224 | status = "✅" if passed else "❌"
225 | self.logger.info(f" {status} {criterion}")
226 |
227 | if passed_criteria >= 3: # At least 3 out of 5 criteria
228 | self.logger.info(" ✅ X.AI GROK model tests passed")
229 | return True
230 | else:
231 | self.logger.error(" ❌ X.AI GROK model tests failed")
232 | return False
233 |
234 | except Exception as e:
235 | self.logger.error(f"X.AI GROK model test failed: {e}")
236 | return False
237 | finally:
238 | self.cleanup_test_files()
239 |
240 |
241 | def main():
242 | """Run the X.AI GROK model tests"""
243 | import sys
244 |
245 | verbose = "--verbose" in sys.argv or "-v" in sys.argv
246 | test = XAIModelsTest(verbose=verbose)
247 |
248 | success = test.run_test()
249 | sys.exit(0 if success else 1)
250 |
251 |
252 | if __name__ == "__main__":
253 | main()
254 |
```
--------------------------------------------------------------------------------
/simulator_tests/test_openrouter_fallback.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | OpenRouter Fallback Test
4 |
5 | Tests that verify the system correctly falls back to OpenRouter when:
6 | - Only OPENROUTER_API_KEY is configured
7 | - Native models (flash, pro) are requested but map to OpenRouter equivalents
8 | - Auto mode correctly selects OpenRouter models
9 | """
10 |
11 |
12 | from .base_test import BaseSimulatorTest
13 |
14 |
15 | class OpenRouterFallbackTest(BaseSimulatorTest):
16 | """Test OpenRouter fallback behavior when it's the only provider"""
17 |
18 | @property
19 | def test_name(self) -> str:
20 | return "openrouter_fallback"
21 |
22 | @property
23 | def test_description(self) -> str:
24 | return "OpenRouter fallback behavior when only provider"
25 |
26 | def run_test(self) -> bool:
27 | """Test OpenRouter fallback behavior"""
28 | try:
29 | self.logger.info("Test: OpenRouter fallback behavior when only provider available")
30 |
31 | # Check if ONLY OpenRouter API key is configured (this is a fallback test)
32 | import os
33 |
34 | has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
35 | has_gemini = bool(os.environ.get("GEMINI_API_KEY"))
36 | has_openai = bool(os.environ.get("OPENAI_API_KEY"))
37 |
38 | if not has_openrouter:
39 | self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
40 | self.logger.info(" ℹ️ This test requires OPENROUTER_API_KEY to be set in .env")
41 | return True # Return True to indicate test is skipped, not failed
42 |
43 | if has_gemini or has_openai:
44 | self.logger.info(" ⚠️ Other API keys configured - this is not a fallback scenario")
45 | self.logger.info(" ℹ️ This test requires ONLY OpenRouter to be configured (no Gemini/OpenAI keys)")
46 | self.logger.info(" ℹ️ Current setup has multiple providers, so fallback behavior doesn't apply")
47 | return True # Return True to indicate test is skipped, not failed
48 |
49 | # Setup test files
50 | self.setup_test_files()
51 |
52 | # Test 1: Auto mode should work with OpenRouter
53 | self.logger.info(" 1: Testing auto mode with OpenRouter as only provider")
54 |
55 | response1, continuation_id = self.call_mcp_tool(
56 | "chat",
57 | {
58 | "prompt": "What is 2 + 2? Give a brief answer.",
59 | # No model specified - should use auto mode
60 | "temperature": 0.1,
61 | },
62 | )
63 |
64 | if not response1:
65 | self.logger.error(" ❌ Auto mode with OpenRouter failed")
66 | return False
67 |
68 | self.logger.info(" ✅ Auto mode call completed with OpenRouter")
69 |
70 | # Test 2: Flash model should map to OpenRouter equivalent
71 | self.logger.info(" 2: Testing flash model mapping to OpenRouter")
72 |
73 | # Use codereview tool to test a different tool type
74 | test_code = """def calculate_sum(numbers):
75 | total = 0
76 | for num in numbers:
77 | total += num
78 | return total"""
79 |
80 | test_file = self.create_additional_test_file("sum_function.py", test_code)
81 |
82 | response2, _ = self.call_mcp_tool(
83 | "codereview",
84 | {
85 | "step": "Quick review of this sum function for quality and potential issues",
86 | "step_number": 1,
87 | "total_steps": 1,
88 | "next_step_required": False,
89 | "findings": "Starting code review of sum function",
90 | "relevant_files": [test_file],
91 | "model": "flash",
92 | "temperature": 0.1,
93 | },
94 | )
95 |
96 | if not response2:
97 | self.logger.error(" ❌ Flash model mapping to OpenRouter failed")
98 | return False
99 |
100 | self.logger.info(" ✅ Flash model successfully mapped to OpenRouter")
101 |
102 | # Test 3: Pro model should map to OpenRouter equivalent
103 | self.logger.info(" 3: Testing pro model mapping to OpenRouter")
104 |
105 | response3, _ = self.call_mcp_tool(
106 | "analyze",
107 | {
108 | "step": "Analyze the structure of this Python code",
109 | "step_number": 1,
110 | "total_steps": 1,
111 | "next_step_required": False,
112 | "findings": "Starting code structure analysis",
113 | "relevant_files": [self.test_files["python"]],
114 | "model": "pro",
115 | "temperature": 0.1,
116 | },
117 | )
118 |
119 | if not response3:
120 | self.logger.error(" ❌ Pro model mapping to OpenRouter failed")
121 | return False
122 |
123 | self.logger.info(" ✅ Pro model successfully mapped to OpenRouter")
124 |
125 | # Test 4: Debug tool with OpenRouter
126 | self.logger.info(" 4: Testing debug tool with OpenRouter")
127 |
128 | response4, _ = self.call_mcp_tool(
129 | "debug",
130 | {
131 | "step": "Why might a function return None instead of a value?",
132 | "step_number": 1,
133 | "total_steps": 1,
134 | "next_step_required": False,
135 | "findings": "Starting debug investigation of None return values",
136 | "model": "flash", # Should map to OpenRouter
137 | "temperature": 0.1,
138 | },
139 | )
140 |
141 | if not response4:
142 | self.logger.error(" ❌ Debug tool with OpenRouter failed")
143 | return False
144 |
145 | self.logger.info(" ✅ Debug tool working with OpenRouter")
146 |
147 | # Test 5: Validate logs show OpenRouter is being used
148 | self.logger.info(" 5: Validating OpenRouter is the active provider")
149 | logs = self.get_recent_server_logs()
150 |
151 | # Check for provider fallback logs
152 | fallback_logs = [
153 | line
154 | for line in logs.split("\n")
155 | if "No Gemini API key found" in line
156 | or "No OpenAI API key found" in line
157 | or "Only OpenRouter available" in line
158 | or "Using OpenRouter" in line
159 | ]
160 |
161 | # Check for OpenRouter provider initialization
162 | provider_logs = [
163 | line
164 | for line in logs.split("\n")
165 | if "OpenRouter provider" in line or "OpenRouterProvider" in line or "openrouter.ai/api/v1" in line
166 | ]
167 |
168 | # Check for model resolution through OpenRouter
169 | model_resolution_logs = [
170 | line
171 | for line in logs.split("\n")
172 | if ("Resolved model" in line and "via OpenRouter" in line)
173 | or ("Model alias" in line and "resolved to" in line)
174 | or ("flash" in line and "gemini-flash" in line)
175 | or ("pro" in line and "gemini-pro" in line)
176 | ]
177 |
178 | # Log findings
179 | self.logger.info(f" Fallback indication logs: {len(fallback_logs)}")
180 | self.logger.info(f" OpenRouter provider logs: {len(provider_logs)}")
181 | self.logger.info(f" Model resolution logs: {len(model_resolution_logs)}")
182 |
183 | # Sample logs for debugging
184 | if self.verbose:
185 | if fallback_logs:
186 | self.logger.debug(" 📋 Sample fallback logs:")
187 | for log in fallback_logs[:3]:
188 | self.logger.debug(f" {log}")
189 |
190 | if provider_logs:
191 | self.logger.debug(" 📋 Sample provider logs:")
192 | for log in provider_logs[:3]:
193 | self.logger.debug(f" {log}")
194 |
195 | # Success criteria
196 | openrouter_active = len(provider_logs) > 0
197 | models_resolved = len(model_resolution_logs) > 0
198 | all_tools_worked = True # We checked this above
199 |
200 | success_criteria = [
201 | ("OpenRouter provider active", openrouter_active),
202 | ("Models resolved through OpenRouter", models_resolved),
203 | ("All tools worked with OpenRouter", all_tools_worked),
204 | ]
205 |
206 | passed_criteria = sum(1 for _, passed in success_criteria if passed)
207 | self.logger.info(f" Success criteria met: {passed_criteria}/{len(success_criteria)}")
208 |
209 | for criterion, passed in success_criteria:
210 | status = "✅" if passed else "❌"
211 | self.logger.info(f" {status} {criterion}")
212 |
213 | if passed_criteria >= 2: # At least 2 out of 3 criteria
214 | self.logger.info(" ✅ OpenRouter fallback test passed")
215 | return True
216 | else:
217 | self.logger.error(" ❌ OpenRouter fallback test failed")
218 | return False
219 |
220 | except Exception as e:
221 | self.logger.error(f"OpenRouter fallback test failed: {e}")
222 | return False
223 | finally:
224 | self.cleanup_test_files()
225 |
226 |
227 | def main():
228 | """Run the OpenRouter fallback tests"""
229 | import sys
230 |
231 | verbose = "--verbose" in sys.argv or "-v" in sys.argv
232 | test = OpenRouterFallbackTest(verbose=verbose)
233 |
234 | success = test.run_test()
235 | sys.exit(0 if success else 1)
236 |
237 |
238 | if __name__ == "__main__":
239 | main()
240 |
```
--------------------------------------------------------------------------------
/simulator_tests/test_openrouter_models.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | OpenRouter Model Tests
4 |
5 | Tests that verify OpenRouter functionality including:
6 | - Model alias resolution (flash, pro, o3, etc. map to OpenRouter equivalents)
7 | - Multiple OpenRouter models work correctly
8 | - Conversation continuity works with OpenRouter models
9 | - Error handling when models are not available
10 | """
11 |
12 |
13 | from .base_test import BaseSimulatorTest
14 |
15 |
16 | class OpenRouterModelsTest(BaseSimulatorTest):
17 | """Test OpenRouter model functionality and alias mapping"""
18 |
19 | @property
20 | def test_name(self) -> str:
21 | return "openrouter_models"
22 |
23 | @property
24 | def test_description(self) -> str:
25 | return "OpenRouter model functionality and alias mapping"
26 |
27 | def run_test(self) -> bool:
28 | """Test OpenRouter model functionality"""
29 | try:
30 | self.logger.info("Test: OpenRouter model functionality and alias mapping")
31 |
32 | # Check if OpenRouter API key is configured
33 | import os
34 |
35 | has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
36 |
37 | if not has_openrouter:
38 | self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
39 | self.logger.info(" ℹ️ This test requires OPENROUTER_API_KEY to be set in .env")
40 | return True # Return True to indicate test is skipped, not failed
41 |
42 | # Setup test files for later use
43 | self.setup_test_files()
44 |
45 | # Test 1: Flash alias mapping to OpenRouter
46 | self.logger.info(" 1: Testing 'flash' alias (should map to google/gemini-2.5-flash)")
47 |
48 | response1, continuation_id = self.call_mcp_tool(
49 | "chat",
50 | {
51 | "prompt": "Say 'Hello from Flash model!' and nothing else.",
52 | "model": "flash",
53 | "temperature": 0.1,
54 | },
55 | )
56 |
57 | if not response1:
58 | self.logger.error(" ❌ Flash alias test failed")
59 | return False
60 |
61 | self.logger.info(" ✅ Flash alias call completed")
62 | if continuation_id:
63 | self.logger.info(f" ✅ Got continuation_id: {continuation_id}")
64 |
65 | # Test 2: Pro alias mapping to OpenRouter
66 | self.logger.info(" 2: Testing 'pro' alias (should map to google/gemini-2.5-pro)")
67 |
68 | response2, _ = self.call_mcp_tool(
69 | "chat",
70 | {
71 | "prompt": "Say 'Hello from Pro model!' and nothing else.",
72 | "model": "pro",
73 | "temperature": 0.1,
74 | },
75 | )
76 |
77 | if not response2:
78 | self.logger.error(" ❌ Pro alias test failed")
79 | return False
80 |
81 | self.logger.info(" ✅ Pro alias call completed")
82 |
83 | # Test 3: O3 alias mapping to OpenRouter (should map to openai/gpt-4o)
84 | self.logger.info(" 3: Testing 'o3' alias (should map to openai/gpt-4o)")
85 |
86 | response3, _ = self.call_mcp_tool(
87 | "chat",
88 | {
89 | "prompt": "Say 'Hello from O3 model!' and nothing else.",
90 | "model": "o3",
91 | "temperature": 0.1,
92 | },
93 | )
94 |
95 | if not response3:
96 | self.logger.error(" ❌ O3 alias test failed")
97 | return False
98 |
99 | self.logger.info(" ✅ O3 alias call completed")
100 |
101 | # Test 4: Direct OpenRouter model name
102 | self.logger.info(" 4: Testing direct OpenRouter model name (anthropic/claude-3-haiku)")
103 |
104 | response4, _ = self.call_mcp_tool(
105 | "chat",
106 | {
107 | "prompt": "Say 'Hello from Claude Haiku!' and nothing else.",
108 | "model": "anthropic/claude-3-haiku",
109 | "temperature": 0.1,
110 | },
111 | )
112 |
113 | if not response4:
114 | self.logger.error(" ❌ Direct OpenRouter model test failed")
115 | return False
116 |
117 | self.logger.info(" ✅ Direct OpenRouter model call completed")
118 |
119 | # Test 5: OpenRouter alias from config
120 | self.logger.info(" 5: Testing OpenRouter alias from config ('opus' -> anthropic/claude-opus-4)")
121 |
122 | response5, _ = self.call_mcp_tool(
123 | "chat",
124 | {
125 | "prompt": "Say 'Hello from Opus!' and nothing else.",
126 | "model": "opus",
127 | "temperature": 0.1,
128 | },
129 | )
130 |
131 | if not response5:
132 | self.logger.error(" ❌ OpenRouter alias test failed")
133 | return False
134 |
135 | self.logger.info(" ✅ OpenRouter alias call completed")
136 |
137 | # Test 6: Conversation continuity with OpenRouter models
138 | self.logger.info(" 6: Testing conversation continuity with OpenRouter")
139 |
140 | response6, new_continuation_id = self.call_mcp_tool(
141 | "chat",
142 | {
143 | "prompt": "Remember this number: 42. What number did I just tell you?",
144 | "model": "sonnet", # Claude Sonnet via OpenRouter
145 | "temperature": 0.1,
146 | },
147 | )
148 |
149 | if not response6 or not new_continuation_id:
150 | self.logger.error(" ❌ Failed to start conversation with continuation_id")
151 | return False
152 |
153 | # Continue the conversation
154 | response7, _ = self.call_mcp_tool(
155 | "chat",
156 | {
157 | "prompt": "What was the number I told you earlier?",
158 | "model": "sonnet",
159 | "continuation_id": new_continuation_id,
160 | "temperature": 0.1,
161 | },
162 | )
163 |
164 | if not response7:
165 | self.logger.error(" ❌ Failed to continue conversation")
166 | return False
167 |
168 | # Check if the model remembered the number
169 | if "42" in response7:
170 | self.logger.info(" ✅ Conversation continuity working with OpenRouter")
171 | else:
172 | self.logger.warning(" ⚠️ Model may not have remembered the number")
173 |
174 | # Test 7: Validate OpenRouter API usage from logs
175 | self.logger.info(" 7: Validating OpenRouter API usage in logs")
176 | logs = self.get_recent_server_logs()
177 |
178 | # Check for OpenRouter API calls
179 | openrouter_logs = [line for line in logs.split("\n") if "openrouter" in line.lower()]
180 | openrouter_api_logs = [line for line in logs.split("\n") if "openrouter.ai/api/v1" in line]
181 |
182 | # Check for specific model mappings
183 | flash_mapping_logs = [
184 | line
185 | for line in logs.split("\n")
186 | if ("flash" in line and "google/gemini-flash" in line)
187 | or ("Resolved model" in line and "google/gemini-flash" in line)
188 | ]
189 |
190 | pro_mapping_logs = [
191 | line
192 | for line in logs.split("\n")
193 | if ("pro" in line and "google/gemini-pro" in line)
194 | or ("Resolved model" in line and "google/gemini-pro" in line)
195 | ]
196 |
197 | # Log findings
198 | self.logger.info(f" OpenRouter-related logs: {len(openrouter_logs)}")
199 | self.logger.info(f" OpenRouter API logs: {len(openrouter_api_logs)}")
200 | self.logger.info(f" Flash mapping logs: {len(flash_mapping_logs)}")
201 | self.logger.info(f" Pro mapping logs: {len(pro_mapping_logs)}")
202 |
203 | # Sample log output for debugging
204 | if self.verbose and openrouter_logs:
205 | self.logger.debug(" 📋 Sample OpenRouter logs:")
206 | for log in openrouter_logs[:5]:
207 | self.logger.debug(f" {log}")
208 |
209 | # Success criteria
210 | openrouter_api_used = len(openrouter_api_logs) > 0
211 | models_mapped = len(flash_mapping_logs) > 0 or len(pro_mapping_logs) > 0
212 |
213 | success_criteria = [
214 | ("OpenRouter API calls made", openrouter_api_used),
215 | ("Model aliases mapped correctly", models_mapped),
216 | ("All model calls succeeded", True), # We already checked this above
217 | ]
218 |
219 | passed_criteria = sum(1 for _, passed in success_criteria if passed)
220 | self.logger.info(f" Success criteria met: {passed_criteria}/{len(success_criteria)}")
221 |
222 | for criterion, passed in success_criteria:
223 | status = "✅" if passed else "❌"
224 | self.logger.info(f" {status} {criterion}")
225 |
226 | if passed_criteria >= 2: # At least 2 out of 3 criteria
227 | self.logger.info(" ✅ OpenRouter model tests passed")
228 | return True
229 | else:
230 | self.logger.error(" ❌ OpenRouter model tests failed")
231 | return False
232 |
233 | except Exception as e:
234 | self.logger.error(f"OpenRouter model test failed: {e}")
235 | return False
236 | finally:
237 | self.cleanup_test_files()
238 |
239 |
240 | def main():
241 | """Run the OpenRouter model tests"""
242 | import sys
243 |
244 | verbose = "--verbose" in sys.argv or "-v" in sys.argv
245 | test = OpenRouterModelsTest(verbose=verbose)
246 |
247 | success = test.run_test()
248 | sys.exit(0 if success else 1)
249 |
250 |
251 | if __name__ == "__main__":
252 | main()
253 |
```
--------------------------------------------------------------------------------
/simulator_tests/test_cross_tool_continuation.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Cross-Tool Continuation Test
4 |
5 | Tests comprehensive cross-tool continuation scenarios to ensure
6 | conversation context is maintained when switching between different tools.
7 | """
8 |
9 | from .conversation_base_test import ConversationBaseTest
10 |
11 |
12 | class CrossToolContinuationTest(ConversationBaseTest):
13 | """Test comprehensive cross-tool continuation scenarios"""
14 |
15 | @property
16 | def test_name(self) -> str:
17 | return "cross_tool_continuation"
18 |
19 | @property
20 | def test_description(self) -> str:
21 | return "Cross-tool conversation continuation scenarios"
22 |
23 | def run_test(self) -> bool:
24 | """Test comprehensive cross-tool continuation scenarios"""
25 | try:
26 | self.logger.info("🔧 Test: Cross-tool continuation scenarios")
27 |
28 | # Setup test environment for conversation testing
29 | self.setUp()
30 |
31 | success_count = 0
32 | total_scenarios = 3
33 |
34 | # Scenario 1: chat -> thinkdeep -> codereview
35 | if self._test_chat_thinkdeep_codereview():
36 | success_count += 1
37 |
38 | # Scenario 2: analyze -> debug -> thinkdeep
39 | if self._test_analyze_debug_thinkdeep():
40 | success_count += 1
41 |
42 | # Scenario 3: Multi-file cross-tool continuation
43 | if self._test_multi_file_continuation():
44 | success_count += 1
45 |
46 | self.logger.info(
47 | f" ✅ Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed"
48 | )
49 |
50 | # Consider successful if at least one scenario worked
51 | return success_count > 0
52 |
53 | except Exception as e:
54 | self.logger.error(f"Cross-tool continuation test failed: {e}")
55 | return False
56 | finally:
57 | self.cleanup_test_files()
58 |
59 | def _test_chat_thinkdeep_codereview(self) -> bool:
60 | """Test chat -> thinkdeep -> codereview scenario"""
61 | try:
62 | self.logger.info(" 1: Testing chat -> thinkdeep -> codereview")
63 |
64 | # Start with chat
65 | chat_response, chat_id = self.call_mcp_tool(
66 | "chat",
67 | {
68 | "prompt": "Please use low thinking mode. Look at this Python code and tell me what you think about it",
69 | "absolute_file_paths": [self.test_files["python"]],
70 | "model": "flash",
71 | },
72 | )
73 |
74 | if not chat_response or not chat_id:
75 | self.logger.error("Failed to start chat conversation")
76 | return False
77 |
78 | # Continue with thinkdeep
79 | thinkdeep_response, _ = self.call_mcp_tool(
80 | "thinkdeep",
81 | {
82 | "step": "Think deeply about potential performance issues in this code. Please use low thinking mode.",
83 | "step_number": 1,
84 | "total_steps": 1,
85 | "next_step_required": False,
86 | "findings": "Building on previous chat analysis to examine performance issues",
87 | "relevant_files": [self.test_files["python"]], # Same file should be deduplicated
88 | "continuation_id": chat_id,
89 | "model": "flash",
90 | },
91 | )
92 |
93 | if not thinkdeep_response:
94 | self.logger.error("Failed chat -> thinkdeep continuation")
95 | return False
96 |
97 | # Continue with codereview
98 | codereview_response, _ = self.call_mcp_tool(
99 | "codereview",
100 | {
101 | "step": "Building on our previous analysis, provide a comprehensive code review",
102 | "step_number": 1,
103 | "total_steps": 1,
104 | "next_step_required": False,
105 | "findings": "Continuing from previous chat and thinkdeep analysis for comprehensive review",
106 | "relevant_files": [self.test_files["python"]], # Same file should be deduplicated
107 | "continuation_id": chat_id,
108 | "model": "flash",
109 | },
110 | )
111 |
112 | if not codereview_response:
113 | self.logger.error("Failed thinkdeep -> codereview continuation")
114 | return False
115 |
116 | self.logger.info(" ✅ chat -> thinkdeep -> codereview working")
117 | return True
118 |
119 | except Exception as e:
120 | self.logger.error(f"Chat -> thinkdeep -> codereview scenario failed: {e}")
121 | return False
122 |
123 | def _test_analyze_debug_thinkdeep(self) -> bool:
124 | """Test analyze -> debug -> thinkdeep scenario"""
125 | try:
126 | self.logger.info(" 2: Testing analyze -> debug -> thinkdeep")
127 |
128 | # Start with analyze
129 | analyze_response, analyze_id = self.call_mcp_tool(
130 | "analyze",
131 | {
132 | "step": "Analyze this code for quality and performance issues",
133 | "step_number": 1,
134 | "total_steps": 1,
135 | "next_step_required": False,
136 | "findings": "Starting analysis of Python code for quality and performance issues",
137 | "relevant_files": [self.test_files["python"]],
138 | "model": "flash",
139 | },
140 | )
141 |
142 | if not analyze_response or not analyze_id:
143 | self.logger.warning("Failed to start analyze conversation, skipping scenario 2")
144 | return False
145 |
146 | # Continue with debug
147 | debug_response, _ = self.call_mcp_tool(
148 | "debug",
149 | {
150 | "step": "Based on our analysis, help debug the performance issue in fibonacci",
151 | "step_number": 1,
152 | "total_steps": 1,
153 | "next_step_required": False,
154 | "findings": "Building on previous analysis to debug specific performance issue",
155 | "relevant_files": [self.test_files["python"]], # Same file should be deduplicated
156 | "continuation_id": analyze_id,
157 | "model": "flash",
158 | },
159 | )
160 |
161 | if not debug_response:
162 | self.logger.warning(" ⚠️ analyze -> debug continuation failed")
163 | return False
164 |
165 | # Continue with thinkdeep
166 | final_response, _ = self.call_mcp_tool(
167 | "thinkdeep",
168 | {
169 | "step": "Think deeply about the architectural implications of the issues we've found. Please use low thinking mode.",
170 | "step_number": 1,
171 | "total_steps": 1,
172 | "next_step_required": False,
173 | "findings": "Building on analysis and debug findings to explore architectural implications",
174 | "relevant_files": [self.test_files["python"]], # Same file should be deduplicated
175 | "continuation_id": analyze_id,
176 | "model": "flash",
177 | },
178 | )
179 |
180 | if not final_response:
181 | self.logger.warning(" ⚠️ debug -> thinkdeep continuation failed")
182 | return False
183 |
184 | self.logger.info(" ✅ analyze -> debug -> thinkdeep working")
185 | return True
186 |
187 | except Exception as e:
188 | self.logger.error(f"Analyze -> debug -> thinkdeep scenario failed: {e}")
189 | return False
190 |
191 | def _test_multi_file_continuation(self) -> bool:
192 | """Test multi-file cross-tool continuation"""
193 | try:
194 | self.logger.info(" 3: Testing multi-file cross-tool continuation")
195 |
196 | # Start with both files
197 | multi_response, multi_id = self.call_mcp_tool(
198 | "chat",
199 | {
200 | "prompt": "Please use low thinking mode. Analyze both the Python code and configuration file",
201 | "absolute_file_paths": [self.test_files["python"], self.test_files["config"]],
202 | "model": "flash",
203 | },
204 | )
205 |
206 | if not multi_response or not multi_id:
207 | self.logger.warning("Failed to start multi-file conversation, skipping scenario 3")
208 | return False
209 |
210 | # Switch to codereview with same files (should use conversation history)
211 | multi_review, _ = self.call_mcp_tool(
212 | "codereview",
213 | {
214 | "step": "Review both files in the context of our previous discussion",
215 | "step_number": 1,
216 | "total_steps": 1,
217 | "next_step_required": False,
218 | "findings": "Continuing multi-file analysis with code review perspective",
219 | "relevant_files": [self.test_files["python"], self.test_files["config"]], # Same files
220 | "continuation_id": multi_id,
221 | "model": "flash",
222 | },
223 | )
224 |
225 | if not multi_review:
226 | self.logger.warning(" ⚠️ Multi-file cross-tool continuation failed")
227 | return False
228 |
229 | self.logger.info(" ✅ Multi-file cross-tool continuation working")
230 | return True
231 |
232 | except Exception as e:
233 | self.logger.error(f"Multi-file continuation scenario failed: {e}")
234 | return False
235 |
```
--------------------------------------------------------------------------------
/providers/registries/base.py:
--------------------------------------------------------------------------------
```python
1 | """Shared infrastructure for JSON-backed model registries."""
2 |
3 | from __future__ import annotations
4 |
5 | import importlib.resources
6 | import json
7 | import logging
8 | from collections.abc import Iterable
9 | from dataclasses import fields
10 | from pathlib import Path
11 |
12 | from utils.env import get_env
13 | from utils.file_utils import read_json_file
14 |
15 | from ..shared import ModelCapabilities, ProviderType, TemperatureConstraint
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
20 | CAPABILITY_FIELD_NAMES = {field.name for field in fields(ModelCapabilities)}
21 |
22 |
23 | class CustomModelRegistryBase:
24 | """Load and expose capability metadata from a JSON manifest."""
25 |
26 | def __init__(
27 | self,
28 | *,
29 | env_var_name: str,
30 | default_filename: str,
31 | config_path: str | None = None,
32 | ) -> None:
33 | self._env_var_name = env_var_name
34 | self._default_filename = default_filename
35 | self._use_resources = False
36 | self._resource_package = "conf"
37 |         self._default_path = Path(__file__).resolve().parents[2] / "conf" / default_filename  # repo root / conf
38 |
39 | if config_path:
40 | self.config_path = Path(config_path)
41 | else:
42 | env_path = get_env(env_var_name)
43 | if env_path:
44 | self.config_path = Path(env_path)
45 | else:
46 | try:
47 | resource = importlib.resources.files(self._resource_package).joinpath(default_filename)
48 | if hasattr(resource, "read_text"):
49 | self._use_resources = True
50 | self.config_path = None
51 | else:
52 | raise AttributeError("resource accessor not available")
53 | except Exception:
54 |                     self.config_path = Path(__file__).resolve().parents[2] / "conf" / default_filename
55 |
56 | self.alias_map: dict[str, str] = {}
57 | self.model_map: dict[str, ModelCapabilities] = {}
58 | self._extras: dict[str, dict] = {}
59 |
60 | def reload(self) -> None:
61 | data = self._load_config_data()
62 | configs = [config for config in self._parse_models(data) if config is not None]
63 | self._build_maps(configs)
64 |
65 | def list_models(self) -> list[str]:
66 | return list(self.model_map.keys())
67 |
68 | def list_aliases(self) -> list[str]:
69 | return list(self.alias_map.keys())
70 |
71 | def resolve(self, name_or_alias: str) -> ModelCapabilities | None:
72 | key = name_or_alias.lower()
73 | canonical = self.alias_map.get(key)
74 | if canonical:
75 | return self.model_map.get(canonical)
76 |
77 | for model_name in self.model_map:
78 | if model_name.lower() == key:
79 | return self.model_map[model_name]
80 | return None
81 |
82 | def get_capabilities(self, name_or_alias: str) -> ModelCapabilities | None:
83 | return self.resolve(name_or_alias)
84 |
85 | def get_entry(self, model_name: str) -> dict | None:
86 | return self._extras.get(model_name)
87 |
88 | def get_model_config(self, model_name: str) -> ModelCapabilities | None:
89 | """Backwards-compatible accessor for registries expecting this helper."""
90 |
91 | return self.model_map.get(model_name) or self.resolve(model_name)
92 |
93 | def iter_entries(self) -> Iterable[tuple[str, ModelCapabilities, dict]]:
94 | for model_name, capability in self.model_map.items():
95 | yield model_name, capability, self._extras.get(model_name, {})
96 |
97 | # ------------------------------------------------------------------
98 | # Internal helpers
99 | # ------------------------------------------------------------------
100 | def _load_config_data(self) -> dict:
101 | if self._use_resources:
102 | try:
103 | resource = importlib.resources.files(self._resource_package).joinpath(self._default_filename)
104 | if hasattr(resource, "read_text"):
105 | config_text = resource.read_text(encoding="utf-8")
106 | else: # pragma: no cover - legacy Python fallback
107 | with resource.open("r", encoding="utf-8") as handle:
108 | config_text = handle.read()
109 | data = json.loads(config_text)
110 | except FileNotFoundError:
111 | logger.debug("Packaged %s not found", self._default_filename)
112 | return {"models": []}
113 | except Exception as exc:
114 | logger.warning("Failed to read packaged %s: %s", self._default_filename, exc)
115 | return {"models": []}
116 | return data or {"models": []}
117 |
118 | if not self.config_path:
119 | raise FileNotFoundError("Registry configuration path is not set")
120 |
121 | if not self.config_path.exists():
122 | logger.debug("Model registry config not found at %s", self.config_path)
123 | if self.config_path == self._default_path:
124 | fallback = Path.cwd() / "conf" / self._default_filename
125 | if fallback != self.config_path and fallback.exists():
126 | logger.debug("Falling back to %s", fallback)
127 | self.config_path = fallback
128 | else:
129 | return {"models": []}
130 | else:
131 | return {"models": []}
132 |
133 | data = read_json_file(str(self.config_path))
134 | return data or {"models": []}
135 |
136 | @property
137 | def use_resources(self) -> bool:
138 | return self._use_resources
139 |
140 | def _parse_models(self, data: dict) -> Iterable[ModelCapabilities | None]:
141 | for raw in data.get("models", []):
142 | if not isinstance(raw, dict):
143 | continue
144 | yield self._convert_entry(raw)
145 |
146 | def _convert_entry(self, raw: dict) -> ModelCapabilities | None:
147 | entry = dict(raw)
148 | model_name = entry.get("model_name")
149 | if not model_name:
150 | return None
151 |
152 | aliases = entry.get("aliases")
153 | if isinstance(aliases, str):
154 | entry["aliases"] = [alias.strip() for alias in aliases.split(",") if alias.strip()]
155 |
156 | entry.setdefault("friendly_name", self._default_friendly_name(model_name))
157 |
158 | temperature_hint = entry.get("temperature_constraint")
159 | if isinstance(temperature_hint, str):
160 | entry["temperature_constraint"] = TemperatureConstraint.create(temperature_hint)
161 | elif temperature_hint is None:
162 | entry["temperature_constraint"] = TemperatureConstraint.create("range")
163 |
164 | if "max_tokens" in entry:
165 | raise ValueError(
166 | "`max_tokens` is no longer supported. Use `max_output_tokens` in your model configuration."
167 | )
168 |
169 | unknown_keys = set(entry.keys()) - CAPABILITY_FIELD_NAMES - self._extra_keys()
170 | if unknown_keys:
171 | raise ValueError("Unsupported fields in model configuration: " + ", ".join(sorted(unknown_keys)))
172 |
173 | capability, extras = self._finalise_entry(entry)
174 | capability.provider = self._provider_default()
175 | self._extras[capability.model_name] = extras or {}
176 | return capability
177 |
178 | def _default_friendly_name(self, model_name: str) -> str:
179 | return model_name
180 |
181 | def _extra_keys(self) -> set[str]:
182 | return set()
183 |
184 | def _provider_default(self) -> ProviderType:
185 | return ProviderType.OPENROUTER
186 |
187 | def _finalise_entry(self, entry: dict) -> tuple[ModelCapabilities, dict]:
188 | return ModelCapabilities(**{k: v for k, v in entry.items() if k in CAPABILITY_FIELD_NAMES}), {}
189 |
190 | def _build_maps(self, configs: Iterable[ModelCapabilities]) -> None:
191 | alias_map: dict[str, str] = {}
192 | model_map: dict[str, ModelCapabilities] = {}
193 |
194 | for config in configs:
195 | if not config:
196 | continue
197 | model_map[config.model_name] = config
198 |
199 | model_name_lower = config.model_name.lower()
200 | if model_name_lower not in alias_map:
201 | alias_map[model_name_lower] = config.model_name
202 |
203 | for alias in config.aliases:
204 | alias_lower = alias.lower()
205 | if alias_lower in alias_map and alias_map[alias_lower] != config.model_name:
206 | raise ValueError(
207 | f"Duplicate alias '{alias}' found for models '{alias_map[alias_lower]}' and '{config.model_name}'"
208 | )
209 | alias_map[alias_lower] = config.model_name
210 |
211 | self.alias_map = alias_map
212 | self.model_map = model_map
213 |
214 |
215 | class CapabilityModelRegistry(CustomModelRegistryBase):
216 | """Registry that returns :class:`ModelCapabilities` objects with alias support."""
217 |
218 | def __init__(
219 | self,
220 | *,
221 | env_var_name: str,
222 | default_filename: str,
223 | provider: ProviderType,
224 | friendly_prefix: str,
225 | config_path: str | None = None,
226 | ) -> None:
227 | self._provider = provider
228 | self._friendly_prefix = friendly_prefix
229 | super().__init__(
230 | env_var_name=env_var_name,
231 | default_filename=default_filename,
232 | config_path=config_path,
233 | )
234 | self.reload()
235 |
236 | def _provider_default(self) -> ProviderType:
237 | return self._provider
238 |
239 | def _default_friendly_name(self, model_name: str) -> str:
240 | return self._friendly_prefix.format(model=model_name)
241 |
242 | def _finalise_entry(self, entry: dict) -> tuple[ModelCapabilities, dict]:
243 | filtered = {k: v for k, v in entry.items() if k in CAPABILITY_FIELD_NAMES}
244 | filtered.setdefault("provider", self._provider_default())
245 | capability = ModelCapabilities(**filtered)
246 | return capability, {}
247 |
```
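
The constructor above resolves its configuration source in a fixed order: an explicit `config_path` argument wins, then the path named by `env_var_name`, then the packaged resource looked up through `importlib.resources`, and finally a `conf/<default_filename>` path relative to the source tree, with `_load_config_data` additionally retrying `Path.cwd() / "conf" / <default_filename>` when that default path is missing. A minimal sketch of the environment-variable override, assuming a hypothetical registry whose `env_var_name` is `EXAMPLE_MODELS_CONFIG_PATH`:

```python
# Sketch of the env-var override path -- the variable name and file location are
# hypothetical, not taken from the shipped registries.
import json
import os
import tempfile

config = {"models": [{"model_name": "example-model-1", "aliases": ["ex1"]}]}
config_file = os.path.join(tempfile.gettempdir(), "example_models.json")
with open(config_file, "w", encoding="utf-8") as handle:
    json.dump(config, handle)

# With no explicit config_path, the base class calls get_env("EXAMPLE_MODELS_CONFIG_PATH")
# and, if it is set, loads this file instead of the packaged conf/ resource.
os.environ["EXAMPLE_MODELS_CONFIG_PATH"] = config_file
```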
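
`_convert_entry` normalises each raw entry before it becomes a `ModelCapabilities` object: a comma-separated `aliases` string is split into a list, a missing `friendly_name` falls back to `_default_friendly_name(model_name)`, a string `temperature_constraint` is passed through `TemperatureConstraint.create()` (with `"range"` as the default), the legacy `max_tokens` key is rejected with a `ValueError`, and any key outside `CAPABILITY_FIELD_NAMES` plus the subclass's `_extra_keys()` is rejected as well. The dict below is a sketch of an entry that would pass those checks; the values are illustrative, and `max_output_tokens` appears only because the error message above names it as the replacement for `max_tokens`.

```python
# Illustrative entry for the "models" array -- values are hypothetical.
example_entry = {
    "model_name": "example-model-1",
    # A plain list is accepted as-is; a comma-separated string such as
    # "ex1,example" would be split into ["ex1", "example"] by _convert_entry.
    "aliases": ["ex1", "example"],
    # Omitting friendly_name would fall back to _default_friendly_name(model_name).
    "friendly_name": "Example Model 1",
    # String hints go through TemperatureConstraint.create(); None defaults to "range".
    "temperature_constraint": "range",
    # The legacy max_tokens key raises a ValueError; max_output_tokens is its replacement.
    "max_output_tokens": 8192,
}
```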
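
`CapabilityModelRegistry` fills in the provider type and friendly-name prefix for the generic loading machinery and calls `reload()` from its constructor, so lookups work immediately after instantiation. The sketch below shows how such a registry might be created and queried; the env var name, filename, and model names are hypothetical, and the import paths are assumptions based on the repository layout (`providers/registries/base.py`, `providers/shared/provider_type.py`).

```python
# Illustrative use of CapabilityModelRegistry -- names and import paths are assumptions.
from providers.registries.base import CapabilityModelRegistry
from providers.shared.provider_type import ProviderType

registry = CapabilityModelRegistry(
    env_var_name="EXAMPLE_MODELS_CONFIG_PATH",   # hypothetical override variable
    default_filename="example_models.json",      # hypothetical packaged config
    provider=ProviderType.OPENROUTER,
    friendly_prefix="Example ({model})",          # expanded by _default_friendly_name()
)

# resolve() accepts a canonical model name or any declared alias, case-insensitively,
# and returns a ModelCapabilities instance (or None when nothing matches).
caps = registry.resolve("EX1")
if caps:
    print(caps.model_name, caps.friendly_name)

# The maps built by _build_maps() back the list_models()/list_aliases() accessors.
print(registry.list_models())
print(registry.list_aliases())
```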