This is page 4 of 19. Use http://codebase.md/beehiveinnovations/gemini-mcp-server?lines=false&page={x} to view the full context.

# Directory Structure

```
├── .claude
│   ├── commands
│   │   └── fix-github-issue.md
│   └── settings.json
├── .coveragerc
├── .dockerignore
├── .env.example
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.yml
│   │   ├── config.yml
│   │   ├── documentation.yml
│   │   ├── feature_request.yml
│   │   └── tool_addition.yml
│   ├── pull_request_template.md
│   └── workflows
│       ├── docker-pr.yml
│       ├── docker-release.yml
│       ├── semantic-pr.yml
│       ├── semantic-release.yml
│       └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── AGENTS.md
├── CHANGELOG.md
├── claude_config_example.json
├── CLAUDE.md
├── clink
│   ├── __init__.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── claude.py
│   │   ├── codex.py
│   │   └── gemini.py
│   ├── constants.py
│   ├── models.py
│   ├── parsers
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── claude.py
│   │   ├── codex.py
│   │   └── gemini.py
│   └── registry.py
├── code_quality_checks.ps1
├── code_quality_checks.sh
├── communication_simulator_test.py
├── conf
│   ├── __init__.py
│   ├── azure_models.json
│   ├── cli_clients
│   │   ├── claude.json
│   │   ├── codex.json
│   │   └── gemini.json
│   ├── custom_models.json
│   ├── dial_models.json
│   ├── gemini_models.json
│   ├── openai_models.json
│   ├── openrouter_models.json
│   └── xai_models.json
├── config.py
├── docker
│   ├── README.md
│   └── scripts
│       ├── build.ps1
│       ├── build.sh
│       ├── deploy.ps1
│       ├── deploy.sh
│       └── healthcheck.py
├── docker-compose.yml
├── Dockerfile
├── docs
│   ├── adding_providers.md
│   ├── adding_tools.md
│   ├── advanced-usage.md
│   ├── ai_banter.md
│   ├── ai-collaboration.md
│   ├── azure_openai.md
│   ├── configuration.md
│   ├── context-revival.md
│   ├── contributions.md
│   ├── custom_models.md
│   ├── docker-deployment.md
│   ├── gemini-setup.md
│   ├── getting-started.md
│   ├── index.md
│   ├── locale-configuration.md
│   ├── logging.md
│   ├── model_ranking.md
│   ├── testing.md
│   ├── tools
│   │   ├── analyze.md
│   │   ├── apilookup.md
│   │   ├── challenge.md
│   │   ├── chat.md
│   │   ├── clink.md
│   │   ├── codereview.md
│   │   ├── consensus.md
│   │   ├── debug.md
│   │   ├── docgen.md
│   │   ├── listmodels.md
│   │   ├── planner.md
│   │   ├── precommit.md
│   │   ├── refactor.md
│   │   ├── secaudit.md
│   │   ├── testgen.md
│   │   ├── thinkdeep.md
│   │   ├── tracer.md
│   │   └── version.md
│   ├── troubleshooting.md
│   ├── vcr-testing.md
│   └── wsl-setup.md
├── examples
│   ├── claude_config_macos.json
│   └── claude_config_wsl.json
├── LICENSE
├── providers
│   ├── __init__.py
│   ├── azure_openai.py
│   ├── base.py
│   ├── custom.py
│   ├── dial.py
│   ├── gemini.py
│   ├── openai_compatible.py
│   ├── openai.py
│   ├── openrouter.py
│   ├── registries
│   │   ├── __init__.py
│   │   ├── azure.py
│   │   ├── base.py
│   │   ├── custom.py
│   │   ├── dial.py
│   │   ├── gemini.py
│   │   ├── openai.py
│   │   ├── openrouter.py
│   │   └── xai.py
│   ├── registry_provider_mixin.py
│   ├── registry.py
│   ├── shared
│   │   ├── __init__.py
│   │   ├── model_capabilities.py
│   │   ├── model_response.py
│   │   ├── provider_type.py
│   │   └── temperature.py
│   └── xai.py
├── pyproject.toml
├── pytest.ini
├── README.md
├── requirements-dev.txt
├── requirements.txt
├── run_integration_tests.ps1
├── run_integration_tests.sh
├── run-server.ps1
├── run-server.sh
├── scripts
│   └── sync_version.py
├── server.py
├── simulator_tests
│   ├── __init__.py
│   ├── base_test.py
│   ├── conversation_base_test.py
│   ├── log_utils.py
│   ├── test_analyze_validation.py
│   ├── test_basic_conversation.py
│   ├── test_chat_simple_validation.py
│   ├── test_codereview_validation.py
│   ├── test_consensus_conversation.py
│   ├── test_consensus_three_models.py
│   ├── test_consensus_workflow_accurate.py
│   ├── test_content_validation.py
│   ├── test_conversation_chain_validation.py
│   ├── test_cross_tool_comprehensive.py
│   ├── test_cross_tool_continuation.py
│   ├── test_debug_certain_confidence.py
│   ├── test_debug_validation.py
│   ├── test_line_number_validation.py
│   ├── test_logs_validation.py
│   ├── test_model_thinking_config.py
│   ├── test_o3_model_selection.py
│   ├── test_o3_pro_expensive.py
│   ├── test_ollama_custom_url.py
│   ├── test_openrouter_fallback.py
│   ├── test_openrouter_models.py
│   ├── test_per_tool_deduplication.py
│   ├── test_planner_continuation_history.py
│   ├── test_planner_validation_old.py
│   ├── test_planner_validation.py
│   ├── test_precommitworkflow_validation.py
│   ├── test_prompt_size_limit_bug.py
│   ├── test_refactor_validation.py
│   ├── test_secaudit_validation.py
│   ├── test_testgen_validation.py
│   ├── test_thinkdeep_validation.py
│   ├── test_token_allocation_validation.py
│   ├── test_vision_capability.py
│   └── test_xai_models.py
├── systemprompts
│   ├── __init__.py
│   ├── analyze_prompt.py
│   ├── chat_prompt.py
│   ├── clink
│   │   ├── codex_codereviewer.txt
│   │   ├── default_codereviewer.txt
│   │   ├── default_planner.txt
│   │   └── default.txt
│   ├── codereview_prompt.py
│   ├── consensus_prompt.py
│   ├── debug_prompt.py
│   ├── docgen_prompt.py
│   ├── generate_code_prompt.py
│   ├── planner_prompt.py
│   ├── precommit_prompt.py
│   ├── refactor_prompt.py
│   ├── secaudit_prompt.py
│   ├── testgen_prompt.py
│   ├── thinkdeep_prompt.py
│   └── tracer_prompt.py
├── tests
│   ├── __init__.py
│   ├── CASSETTE_MAINTENANCE.md
│   ├── conftest.py
│   ├── gemini_cassettes
│   │   ├── chat_codegen
│   │   │   └── gemini25_pro_calculator
│   │   │       └── mldev.json
│   │   ├── chat_cross
│   │   │   └── step1_gemini25_flash_number
│   │   │       └── mldev.json
│   │   └── consensus
│   │       └── step2_gemini25_flash_against
│   │           └── mldev.json
│   ├── http_transport_recorder.py
│   ├── mock_helpers.py
│   ├── openai_cassettes
│   │   ├── chat_cross_step2_gpt5_reminder.json
│   │   ├── chat_gpt5_continuation.json
│   │   ├── chat_gpt5_moon_distance.json
│   │   ├── consensus_step1_gpt5_for.json
│   │   └── o3_pro_basic_math.json
│   ├── pii_sanitizer.py
│   ├── sanitize_cassettes.py
│   ├── test_alias_target_restrictions.py
│   ├── test_auto_mode_comprehensive.py
│   ├── test_auto_mode_custom_provider_only.py
│   ├── test_auto_mode_model_listing.py
│   ├── test_auto_mode_provider_selection.py
│   ├── test_auto_mode.py
│   ├── test_auto_model_planner_fix.py
│   ├── test_azure_openai_provider.py
│   ├── test_buggy_behavior_prevention.py
│   ├── test_cassette_semantic_matching.py
│   ├── test_challenge.py
│   ├── test_chat_codegen_integration.py
│   ├── test_chat_cross_model_continuation.py
│   ├── test_chat_openai_integration.py
│   ├── test_chat_simple.py
│   ├── test_clink_claude_agent.py
│   ├── test_clink_claude_parser.py
│   ├── test_clink_codex_agent.py
│   ├── test_clink_gemini_agent.py
│   ├── test_clink_gemini_parser.py
│   ├── test_clink_integration.py
│   ├── test_clink_parsers.py
│   ├── test_clink_tool.py
│   ├── test_collaboration.py
│   ├── test_config.py
│   ├── test_consensus_integration.py
│   ├── test_consensus_schema.py
│   ├── test_consensus.py
│   ├── test_conversation_continuation_integration.py
│   ├── test_conversation_field_mapping.py
│   ├── test_conversation_file_features.py
│   ├── test_conversation_memory.py
│   ├── test_conversation_missing_files.py
│   ├── test_custom_openai_temperature_fix.py
│   ├── test_custom_provider.py
│   ├── test_debug.py
│   ├── test_deploy_scripts.py
│   ├── test_dial_provider.py
│   ├── test_directory_expansion_tracking.py
│   ├── test_disabled_tools.py
│   ├── test_docker_claude_desktop_integration.py
│   ├── test_docker_config_complete.py
│   ├── test_docker_healthcheck.py
│   ├── test_docker_implementation.py
│   ├── test_docker_mcp_validation.py
│   ├── test_docker_security.py
│   ├── test_docker_volume_persistence.py
│   ├── test_file_protection.py
│   ├── test_gemini_token_usage.py
│   ├── test_image_support_integration.py
│   ├── test_image_validation.py
│   ├── test_integration_utf8.py
│   ├── test_intelligent_fallback.py
│   ├── test_issue_245_simple.py
│   ├── test_large_prompt_handling.py
│   ├── test_line_numbers_integration.py
│   ├── test_listmodels_restrictions.py
│   ├── test_listmodels.py
│   ├── test_mcp_error_handling.py
│   ├── test_model_enumeration.py
│   ├── test_model_metadata_continuation.py
│   ├── test_model_resolution_bug.py
│   ├── test_model_restrictions.py
│   ├── test_o3_pro_output_text_fix.py
│   ├── test_o3_temperature_fix_simple.py
│   ├── test_openai_compatible_token_usage.py
│   ├── test_openai_provider.py
│   ├── test_openrouter_provider.py
│   ├── test_openrouter_registry.py
│   ├── test_parse_model_option.py
│   ├── test_per_tool_model_defaults.py
│   ├── test_pii_sanitizer.py
│   ├── test_pip_detection_fix.py
│   ├── test_planner.py
│   ├── test_precommit_workflow.py
│   ├── test_prompt_regression.py
│   ├── test_prompt_size_limit_bug_fix.py
│   ├── test_provider_retry_logic.py
│   ├── test_provider_routing_bugs.py
│   ├── test_provider_utf8.py
│   ├── test_providers.py
│   ├── test_rate_limit_patterns.py
│   ├── test_refactor.py
│   ├── test_secaudit.py
│   ├── test_server.py
│   ├── test_supported_models_aliases.py
│   ├── test_thinking_modes.py
│   ├── test_tools.py
│   ├── test_tracer.py
│   ├── test_utf8_localization.py
│   ├── test_utils.py
│   ├── test_uvx_resource_packaging.py
│   ├── test_uvx_support.py
│   ├── test_workflow_file_embedding.py
│   ├── test_workflow_metadata.py
│   ├── test_workflow_prompt_size_validation_simple.py
│   ├── test_workflow_utf8.py
│   ├── test_xai_provider.py
│   ├── transport_helpers.py
│   └── triangle.png
├── tools
│   ├── __init__.py
│   ├── analyze.py
│   ├── apilookup.py
│   ├── challenge.py
│   ├── chat.py
│   ├── clink.py
│   ├── codereview.py
│   ├── consensus.py
│   ├── debug.py
│   ├── docgen.py
│   ├── listmodels.py
│   ├── models.py
│   ├── planner.py
│   ├── precommit.py
│   ├── refactor.py
│   ├── secaudit.py
│   ├── shared
│   │   ├── __init__.py
│   │   ├── base_models.py
│   │   ├── base_tool.py
│   │   ├── exceptions.py
│   │   └── schema_builders.py
│   ├── simple
│   │   ├── __init__.py
│   │   └── base.py
│   ├── testgen.py
│   ├── thinkdeep.py
│   ├── tracer.py
│   ├── version.py
│   └── workflow
│       ├── __init__.py
│       ├── base.py
│       ├── schema_builders.py
│       └── workflow_mixin.py
├── utils
│   ├── __init__.py
│   ├── client_info.py
│   ├── conversation_memory.py
│   ├── env.py
│   ├── file_types.py
│   ├── file_utils.py
│   ├── image_utils.py
│   ├── model_context.py
│   ├── model_restrictions.py
│   ├── security_config.py
│   ├── storage_backend.py
│   └── token_utils.py
└── zen-mcp-server
```

# Files

--------------------------------------------------------------------------------
/tests/test_conversation_field_mapping.py:
--------------------------------------------------------------------------------

```python
"""
Test that conversation history is correctly mapped to tool-specific fields
"""

from datetime import datetime
from unittest.mock import patch

import pytest

from server import reconstruct_thread_context
from utils.conversation_memory import ConversationTurn, ThreadContext


@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_conversation_history_field_mapping():
    """Test that enhanced prompts are mapped to prompt field for all tools"""

    # Test data for different tools - all use 'prompt' now
    test_cases = [
        {
            "tool_name": "analyze",
            "original_value": "What does this code do?",
        },
        {
            "tool_name": "chat",
            "original_value": "Explain this concept",
        },
        {
            "tool_name": "debug",
            "original_value": "Getting undefined error",
        },
        {
            "tool_name": "codereview",
            "original_value": "Review this implementation",
        },
        {
            "tool_name": "thinkdeep",
            "original_value": "My analysis so far",
        },
    ]

    for test_case in test_cases:
        # Create real conversation context
        mock_context = ThreadContext(
            thread_id="test-thread-123",
            tool_name=test_case["tool_name"],
            created_at=datetime.now().isoformat(),
            last_updated_at=datetime.now().isoformat(),
            turns=[
                ConversationTurn(
                    role="user",
                    content="Previous user message",
                    timestamp=datetime.now().isoformat(),
                    files=["/test/file1.py"],
                ),
                ConversationTurn(
                    role="assistant",
                    content="Previous assistant response",
                    timestamp=datetime.now().isoformat(),
                ),
            ],
            initial_context={},
        )

        # Mock get_thread to return our test context
        with patch("utils.conversation_memory.get_thread", return_value=mock_context):
            with patch("utils.conversation_memory.add_turn", return_value=True):
                # Create arguments with continuation_id and use a test model
                arguments = {
                    "continuation_id": "test-thread-123",
                    "prompt": test_case["original_value"],
                    "absolute_file_paths": ["/test/file2.py"],
                    "model": "flash",  # Use test model to avoid provider errors
                }

                # Call reconstruct_thread_context
                enhanced_args = await reconstruct_thread_context(arguments)

                # Verify the enhanced prompt is in the prompt field
                assert "prompt" in enhanced_args
                enhanced_value = enhanced_args["prompt"]

                # Should contain conversation history
                assert "=== CONVERSATION HISTORY" in enhanced_value  # Allow for both formats
                assert "Previous user message" in enhanced_value
                assert "Previous assistant response" in enhanced_value

                # Should contain the new user input
                assert "=== NEW USER INPUT ===" in enhanced_value
                assert test_case["original_value"] in enhanced_value

                # Should have token budget
                assert "_remaining_tokens" in enhanced_args
                assert enhanced_args["_remaining_tokens"] > 0


@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_unknown_tool_defaults_to_prompt():
    """Test that unknown tools default to using 'prompt' field"""

    mock_context = ThreadContext(
        thread_id="test-thread-456",
        tool_name="unknown_tool",
        created_at=datetime.now().isoformat(),
        last_updated_at=datetime.now().isoformat(),
        turns=[
            ConversationTurn(
                role="user",
                content="First message",
                timestamp=datetime.now().isoformat(),
            ),
            ConversationTurn(
                role="assistant",
                content="First response",
                timestamp=datetime.now().isoformat(),
            ),
        ],
        initial_context={},
    )

    with patch("utils.conversation_memory.get_thread", return_value=mock_context):
        with patch("utils.conversation_memory.add_turn", return_value=True):
            arguments = {
                "continuation_id": "test-thread-456",
                "prompt": "User input",
                "model": "flash",  # Use test model for real integration
            }

            enhanced_args = await reconstruct_thread_context(arguments)

            # Should default to 'prompt' field
            assert "prompt" in enhanced_args
            assert "=== CONVERSATION HISTORY" in enhanced_args["prompt"]  # Allow for both formats
            assert "First message" in enhanced_args["prompt"]
            assert "First response" in enhanced_args["prompt"]
            assert "User input" in enhanced_args["prompt"]


@pytest.mark.asyncio
async def test_tool_parameter_standardization():
    """Test that workflow tools use standardized investigation pattern"""
    from tools.analyze import AnalyzeWorkflowRequest
    from tools.codereview import CodeReviewRequest
    from tools.debug import DebugInvestigationRequest
    from tools.precommit import PrecommitRequest
    from tools.thinkdeep import ThinkDeepWorkflowRequest

    # Test analyze tool uses workflow pattern
    analyze = AnalyzeWorkflowRequest(
        step="What does this do?",
        step_number=1,
        total_steps=1,
        next_step_required=False,
        findings="Initial analysis",
        relevant_files=["/test.py"],
    )
    assert analyze.step == "What does this do?"

    # Debug tool now uses self-investigation pattern with different fields
    debug = DebugInvestigationRequest(
        step="Investigating error",
        step_number=1,
        total_steps=3,
        next_step_required=True,
        findings="Initial error analysis",
    )
    assert debug.step == "Investigating error"
    assert debug.findings == "Initial error analysis"

    # Test codereview tool uses workflow fields
    review = CodeReviewRequest(
        step="Initial code review investigation",
        step_number=1,
        total_steps=2,
        next_step_required=True,
        findings="Initial review findings",
        relevant_files=["/test.py"],
    )
    assert review.step == "Initial code review investigation"
    assert review.findings == "Initial review findings"

    # Test thinkdeep tool uses workflow pattern
    think = ThinkDeepWorkflowRequest(
        step="My analysis", step_number=1, total_steps=1, next_step_required=False, findings="Initial thinking analysis"
    )
    assert think.step == "My analysis"

    # Test precommit tool uses workflow fields
    precommit = PrecommitRequest(
        step="Validating changes for commit",
        step_number=1,
        total_steps=2,
        next_step_required=True,
        findings="Initial validation findings",
        path="/repo",  # path only needed for step 1
    )
    assert precommit.step == "Validating changes for commit"
    assert precommit.findings == "Initial validation findings"

```
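
For reference, the assertions above expect the enhanced `prompt` to follow roughly this layout. The exact header text and turn formatting are produced by `utils.conversation_memory`, so this is a hypothetical rendering rather than the server's verbatim output:

```python
# Approximate shape of the enhanced prompt checked above (hypothetical rendering;
# the real headers and turn formatting come from utils.conversation_memory).
expected_shape = (
    "=== CONVERSATION HISTORY ===\n"
    "Previous user message\n"
    "Previous assistant response\n"
    "=== NEW USER INPUT ===\n"
    "What does this code do?"
)
```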

--------------------------------------------------------------------------------
/providers/custom.py:
--------------------------------------------------------------------------------

```python
"""Custom API provider implementation."""

import logging

from utils.env import get_env

from .openai_compatible import OpenAICompatibleProvider
from .registries.custom import CustomEndpointModelRegistry
from .registries.openrouter import OpenRouterModelRegistry
from .shared import ModelCapabilities, ProviderType


class CustomProvider(OpenAICompatibleProvider):
    """Adapter for self-hosted or local OpenAI-compatible endpoints.

    Role
        Provide a uniform bridge between the MCP server and user-managed
        OpenAI-compatible services (Ollama, vLLM, LM Studio, bespoke gateways).
        By subclassing :class:`OpenAICompatibleProvider` it inherits request and
        token handling, while the custom registry exposes locally defined model
        metadata.

    Notable behaviour
        * Uses :class:`CustomEndpointModelRegistry` to load locally declared
          model definitions and aliases, falling back to
          :class:`OpenRouterModelRegistry` so unresolved names still share the
          same metadata pipeline as OpenRouter itself.
        * Normalises version-tagged model names (``model:latest``) and applies
          restriction policies just like cloud providers, ensuring consistent
          behaviour across environments.
    """

    FRIENDLY_NAME = "Custom API"

    # Model registry for managing configurations and aliases
    _registry: CustomEndpointModelRegistry | None = None

    def __init__(self, api_key: str = "", base_url: str = "", **kwargs):
        """Initialize Custom provider for local/self-hosted models.

        This provider supports any OpenAI-compatible API endpoint including:
        - Ollama (typically no API key required)
        - vLLM (may require API key)
        - LM Studio (may require API key)
        - Text Generation WebUI (may require API key)
        - Enterprise/self-hosted APIs (typically require API key)

        Args:
            api_key: API key for the custom endpoint. Can be empty string for
                    providers that don't require authentication (like Ollama).
                    Falls back to CUSTOM_API_KEY environment variable if not provided.
            base_url: Base URL for the custom API endpoint (e.g., 'http://localhost:11434/v1').
                     Falls back to CUSTOM_API_URL environment variable if not provided.
            **kwargs: Additional configuration passed to parent OpenAI-compatible provider

        Raises:
            ValueError: If no base_url is provided via parameter or environment variable
        """
        # Fall back to environment variables only if not provided
        if not base_url:
            base_url = get_env("CUSTOM_API_URL", "") or ""
        if not api_key:
            api_key = get_env("CUSTOM_API_KEY", "") or ""

        if not base_url:
            raise ValueError(
                "Custom API URL must be provided via base_url parameter or CUSTOM_API_URL environment variable"
            )

        # For Ollama and other providers that don't require authentication,
        # set a dummy API key to avoid OpenAI client header issues
        if not api_key:
            api_key = "dummy-key-for-unauthenticated-endpoint"
            logging.debug("Using dummy API key for unauthenticated custom endpoint")

        logging.info(f"Initializing Custom provider with endpoint: {base_url}")

        self._alias_cache: dict[str, str] = {}

        super().__init__(api_key, base_url=base_url, **kwargs)

        # Initialize model registry
        if CustomProvider._registry is None:
            CustomProvider._registry = CustomEndpointModelRegistry()
            # Log loaded models and aliases only on first load
            models = self._registry.list_models()
            aliases = self._registry.list_aliases()
            logging.info(f"Custom provider loaded {len(models)} models with {len(aliases)} aliases")

    # ------------------------------------------------------------------
    # Capability surface
    # ------------------------------------------------------------------
    def _lookup_capabilities(
        self,
        canonical_name: str,
        requested_name: str | None = None,
    ) -> ModelCapabilities | None:
        """Return capabilities for models explicitly marked as custom."""

        builtin = super()._lookup_capabilities(canonical_name, requested_name)
        if builtin is not None:
            return builtin

        registry_entry = self._registry.resolve(canonical_name)
        if registry_entry:
            registry_entry.provider = ProviderType.CUSTOM
            return registry_entry

        logging.debug(
            "Custom provider cannot resolve model '%s'; ensure it is declared in custom_models.json",
            canonical_name,
        )
        return None

    def get_provider_type(self) -> ProviderType:
        """Identify this provider for restriction and logging logic."""

        return ProviderType.CUSTOM

    # ------------------------------------------------------------------
    # Registry helpers
    # ------------------------------------------------------------------

    def _resolve_model_name(self, model_name: str) -> str:
        """Resolve registry aliases and strip version tags for local models."""

        cache_key = model_name.lower()
        if cache_key in self._alias_cache:
            return self._alias_cache[cache_key]

        config = self._registry.resolve(model_name)
        if config:
            if config.model_name != model_name:
                logging.debug("Resolved model alias '%s' to '%s'", model_name, config.model_name)
            resolved = config.model_name
            self._alias_cache[cache_key] = resolved
            self._alias_cache.setdefault(resolved.lower(), resolved)
            return resolved

        if ":" in model_name:
            base_model = model_name.split(":")[0]
            logging.debug(f"Stripped version tag from '{model_name}' -> '{base_model}'")

            base_config = self._registry.resolve(base_model)
            if base_config:
                logging.debug("Resolved base model '%s' to '%s'", base_model, base_config.model_name)
                resolved = base_config.model_name
                self._alias_cache[cache_key] = resolved
                self._alias_cache.setdefault(resolved.lower(), resolved)
                return resolved
            self._alias_cache[cache_key] = base_model
            return base_model

        logging.debug(f"Model '{model_name}' not found in registry, using as-is")
        # Attempt to resolve via OpenRouter registry so aliases still map cleanly
        openrouter_registry = OpenRouterModelRegistry()
        openrouter_config = openrouter_registry.resolve(model_name)
        if openrouter_config:
            resolved = openrouter_config.model_name
            self._alias_cache[cache_key] = resolved
            self._alias_cache.setdefault(resolved.lower(), resolved)
            return resolved

        self._alias_cache[cache_key] = model_name
        return model_name

    def get_all_model_capabilities(self) -> dict[str, ModelCapabilities]:
        """Expose registry capabilities for models marked as custom."""

        if not self._registry:
            return {}

        capabilities = {}
        for model in self._registry.list_models():
            config = self._registry.resolve(model)
            if config:
                capabilities[model] = config
        return capabilities

```
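
A minimal usage sketch for the provider above, assuming a local Ollama server at `http://localhost:11434/v1` and a `llama3.2` entry (or alias) declared in `conf/custom_models.json`; both are illustrative assumptions:

```python
# Point the provider at a local Ollama endpoint (no API key required) and resolve
# a version-tagged model name. "llama3.2" is assumed to be declared locally.
from providers.custom import CustomProvider

provider = CustomProvider(base_url="http://localhost:11434/v1")
print(provider.get_provider_type())                      # ProviderType.CUSTOM
print(provider._resolve_model_name("llama3.2:latest"))   # alias resolved, ":latest" stripped
```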

--------------------------------------------------------------------------------
/simulator_tests/test_line_number_validation.py:
--------------------------------------------------------------------------------

```python
"""
Test to validate line number handling across different tools
"""

import json
import os

from .base_test import BaseSimulatorTest


class LineNumberValidationTest(BaseSimulatorTest):
    """Test that validates correct line number handling in chat, analyze, and refactor tools"""

    @property
    def test_name(self) -> str:
        return "line_number_validation"

    @property
    def test_description(self) -> str:
        return "Line number handling validation across tools"

    def run_test(self) -> bool:
        """Test line number handling in different tools"""
        try:
            self.logger.info("Test: Line number handling validation")

            # Setup test files
            self.setup_test_files()

            # Create a test file with known content
            test_file_content = '''# Example code with specific elements
def calculate_total(items):
    """Calculate total with tax"""
    subtotal = 0
    tax_rate = 0.08  # Line 5 - tax_rate defined

    for item in items:  # Line 7 - loop starts
        if item.price > 0:
            subtotal += item.price

    tax_amount = subtotal * tax_rate  # Line 11
    return subtotal + tax_amount

def validate_data(data):
    """Validate input data"""  # Line 15
    required_fields = ["name", "email", "age"]  # Line 16

    for field in required_fields:
        if field not in data:
            raise ValueError(f"Missing field: {field}")

    return True  # Line 22
'''

            test_file_path = os.path.join(self.test_dir, "line_test.py")
            with open(test_file_path, "w") as f:
                f.write(test_file_content)

            self.logger.info(f"Created test file: {test_file_path}")

            # Test 1: Chat tool asking about specific line
            self.logger.info("  1.1: Testing chat tool with line number question")
            content, continuation_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Where is tax_rate defined in this file? Please tell me the exact line number.",
                    "absolute_file_paths": [test_file_path],
                    "model": "flash",
                },
            )

            if content:
                # Check if the response mentions line 5
                if "line 5" in content.lower() or "line 5" in content:
                    self.logger.info("  ✅ Chat tool correctly identified tax_rate at line 5")
                else:
                    self.logger.warning(f"  ⚠️ Chat tool response didn't mention line 5: {content[:200]}...")
            else:
                self.logger.error("  ❌ Chat tool request failed")
                return False

            # Test 2: Analyze tool with line number reference
            self.logger.info("  1.2: Testing analyze tool with line number analysis")
            content, continuation_id = self.call_mcp_tool(
                "analyze",
                {
                    "prompt": "What happens between lines 7-11 in this code? Focus on the loop logic.",
                    "absolute_file_paths": [test_file_path],
                    "model": "flash",
                },
            )

            if content:
                # Check if the response references the loop
                if any(term in content.lower() for term in ["loop", "iterate", "line 7", "lines 7"]):
                    self.logger.info("  ✅ Analyze tool correctly analyzed the specified line range")
                else:
                    self.logger.warning("  ⚠️ Analyze tool response unclear about line range")
            else:
                self.logger.error("  ❌ Analyze tool request failed")
                return False

            # Test 3: Refactor tool with line number precision
            self.logger.info("  1.3: Testing refactor tool line number precision")
            content, continuation_id = self.call_mcp_tool(
                "refactor",
                {
                    "prompt": "Analyze this code for refactoring opportunities",
                    "absolute_file_paths": [test_file_path],
                    "refactor_type": "codesmells",
                    "model": "flash",
                },
            )

            if content:
                try:
                    # Parse the JSON response
                    result = json.loads(content)
                    if result.get("status") == "refactor_analysis_complete":
                        opportunities = result.get("refactor_opportunities", [])
                        if opportunities:
                            # Check if line numbers are precise
                            has_line_refs = any(
                                opp.get("start_line") is not None and opp.get("end_line") is not None
                                for opp in opportunities
                            )
                            if has_line_refs:
                                self.logger.info("  ✅ Refactor tool provided precise line number references")
                                # Log some examples
                                for opp in opportunities[:2]:
                                    if opp.get("start_line"):
                                        self.logger.info(
                                            f"    - Issue at lines {opp['start_line']}-{opp['end_line']}: {opp.get('issue', '')[:50]}..."
                                        )
                            else:
                                self.logger.warning("  ⚠️ Refactor tool response missing line numbers")
                        else:
                            self.logger.info("  ℹ️ No refactoring opportunities found (code might be too clean)")
                except json.JSONDecodeError:
                    self.logger.warning("  ⚠️ Refactor tool response not valid JSON")
            else:
                self.logger.error("  ❌ Refactor tool request failed")
                return False

            # Test 4: Validate log patterns
            self.logger.info("  1.4: Validating line number processing in logs")

            # Get logs from server
            try:
                log_file_path = "logs/mcp_server.log"
                with open(log_file_path) as f:
                    lines = f.readlines()
                    logs = "".join(lines[-500:])
            except Exception as e:
                self.logger.error(f"Failed to read server logs: {e}")
                logs = ""

            # Check for line number formatting patterns
            # "│" is the separator inserted when line numbers are added to file content
            line_number_patterns = ["Line numbers for", "enabled", "│", "line number"]

            found_patterns = 0
            for pattern in line_number_patterns:
                if pattern in logs:
                    found_patterns += 1

            self.logger.info(f"    Found {found_patterns}/{len(line_number_patterns)} line number patterns in logs")

            if found_patterns >= 2:
                self.logger.info("  ✅ Line number processing confirmed in logs")
            else:
                self.logger.warning("  ⚠️ Limited line number processing evidence in logs")

            self.logger.info("  ✅ Line number validation test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Line number validation test failed: {type(e).__name__}: {e}")
            return False

```
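
The log patterns this test greps for ("Line numbers for", the "│" separator) come from the server's file-rendering step. A minimal, hypothetical sketch of that kind of rendering, not the server's actual implementation:

```python
# Hypothetical line-numbered rendering using the "│" separator the test looks for;
# the real formatting lives in the server's file utilities.
def add_line_numbers(source: str) -> str:
    return "\n".join(f"{i:>4}│ {line}" for i, line in enumerate(source.splitlines(), start=1))

print(add_line_numbers("def calculate_total(items):\n    subtotal = 0"))
```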

--------------------------------------------------------------------------------
/utils/model_context.py:
--------------------------------------------------------------------------------

```python
"""
Model context management for dynamic token allocation.

This module provides a clean abstraction for model-specific token management,
ensuring that token limits are properly calculated based on the current model
being used, not global constants.

CONVERSATION MEMORY INTEGRATION:
This module works closely with the conversation memory system to provide
optimal token allocation for multi-turn conversations:

1. DUAL PRIORITIZATION STRATEGY SUPPORT:
   - Provides separate token budgets for conversation history vs. files
   - Enables the conversation memory system to apply newest-first prioritization
   - Ensures optimal balance between context preservation and new content

2. MODEL-SPECIFIC ALLOCATION:
   - Dynamic allocation based on model capabilities (context window size)
   - Conservative allocation for smaller models (O3: 200K context)
   - Generous allocation for larger models (Gemini: 1M+ context)
   - Adapts token distribution ratios based on model capacity

3. CROSS-TOOL CONSISTENCY:
   - Provides consistent token budgets across different tools
   - Enables seamless conversation continuation between tools
   - Supports conversation reconstruction with proper budget management
"""

import logging
from dataclasses import dataclass
from typing import Any, Optional

from config import DEFAULT_MODEL
from providers import ModelCapabilities, ModelProviderRegistry

logger = logging.getLogger(__name__)


@dataclass
class TokenAllocation:
    """Token allocation strategy for a model."""

    total_tokens: int
    content_tokens: int
    response_tokens: int
    file_tokens: int
    history_tokens: int

    @property
    def available_for_prompt(self) -> int:
        """Tokens available for the actual prompt after allocations."""
        return self.content_tokens - self.file_tokens - self.history_tokens


class ModelContext:
    """
    Encapsulates model-specific information and token calculations.

    This class provides a single source of truth for all model-related
    token calculations, ensuring consistency across the system.
    """

    def __init__(self, model_name: str, model_option: Optional[str] = None):
        self.model_name = model_name
        self.model_option = model_option  # Store optional model option (e.g., "for", "against", etc.)
        self._provider = None
        self._capabilities = None
        self._token_allocation = None

    @property
    def provider(self):
        """Get the model provider lazily."""
        if self._provider is None:
            self._provider = ModelProviderRegistry.get_provider_for_model(self.model_name)
            if not self._provider:
                available_models = ModelProviderRegistry.get_available_model_names()
                if available_models:
                    available_text = ", ".join(available_models)
                else:
                    available_text = (
                        "No models detected. Configure provider credentials or set DEFAULT_MODEL to a valid option."
                    )

                raise ValueError(
                    f"Model '{self.model_name}' is not available with current API keys. Available models: {available_text}."
                )
        return self._provider

    @property
    def capabilities(self) -> ModelCapabilities:
        """Get model capabilities lazily."""
        if self._capabilities is None:
            self._capabilities = self.provider.get_capabilities(self.model_name)
        return self._capabilities

    def calculate_token_allocation(self, reserved_for_response: Optional[int] = None) -> TokenAllocation:
        """
        Calculate token allocation based on model capacity and conversation requirements.

        This method implements the core token budget calculation that supports the
        dual prioritization strategy used in conversation memory and file processing:

        TOKEN ALLOCATION STRATEGY:
        1. CONTENT vs RESPONSE SPLIT:
           - Smaller models (< 300K): 60% content, 40% response (conservative)
           - Larger models (≥ 300K): 80% content, 20% response (generous)

        2. CONTENT SUB-ALLOCATION:
           - File tokens: 30-40% of content budget for newest file versions
           - History tokens: 40-50% of content budget for conversation context
           - Remaining: Available for tool-specific prompt content

        3. CONVERSATION MEMORY INTEGRATION:
           - History allocation enables conversation reconstruction in reconstruct_thread_context()
           - File allocation supports newest-first file prioritization in tools
           - Remaining budget passed to tools via _remaining_tokens parameter

        Args:
            reserved_for_response: Override response token reservation

        Returns:
            TokenAllocation with calculated budgets for dual prioritization strategy
        """
        total_tokens = self.capabilities.context_window

        # Dynamic allocation based on model capacity
        if total_tokens < 300_000:
            # Smaller context models (O3): Conservative allocation
            content_ratio = 0.6  # 60% for content
            response_ratio = 0.4  # 40% for response
            file_ratio = 0.3  # 30% of content for files
            history_ratio = 0.5  # 50% of content for history
        else:
            # Larger context models (Gemini): More generous allocation
            content_ratio = 0.8  # 80% for content
            response_ratio = 0.2  # 20% for response
            file_ratio = 0.4  # 40% of content for files
            history_ratio = 0.4  # 40% of content for history

        # Calculate allocations
        content_tokens = int(total_tokens * content_ratio)
        response_tokens = reserved_for_response or int(total_tokens * response_ratio)

        # Sub-allocations within content budget
        file_tokens = int(content_tokens * file_ratio)
        history_tokens = int(content_tokens * history_ratio)

        allocation = TokenAllocation(
            total_tokens=total_tokens,
            content_tokens=content_tokens,
            response_tokens=response_tokens,
            file_tokens=file_tokens,
            history_tokens=history_tokens,
        )

        logger.debug(f"Token allocation for {self.model_name}:")
        logger.debug(f"  Total: {allocation.total_tokens:,}")
        logger.debug(f"  Content: {allocation.content_tokens:,} ({content_ratio:.0%})")
        logger.debug(f"  Response: {allocation.response_tokens:,} ({response_ratio:.0%})")
        logger.debug(f"  Files: {allocation.file_tokens:,} ({file_ratio:.0%} of content)")
        logger.debug(f"  History: {allocation.history_tokens:,} ({history_ratio:.0%} of content)")

        return allocation

    def estimate_tokens(self, text: str) -> int:
        """
        Estimate token count for text using model-specific tokenizer.

        For now, uses simple estimation. Can be enhanced with model-specific
        tokenizers (tiktoken for OpenAI, etc.) in the future.
        """
        # TODO: Integrate model-specific tokenizers
        # For now, use conservative estimation
        return len(text) // 3  # Conservative estimate

    @classmethod
    def from_arguments(cls, arguments: dict[str, Any]) -> "ModelContext":
        """Create ModelContext from tool arguments."""
        model_name = arguments.get("model") or DEFAULT_MODEL
        return cls(model_name)

```
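
To make the allocation ratios above concrete, here is the arithmetic for a hypothetical 200,000-token model (the "smaller context" branch). `ModelContext` itself requires a configured provider, so the numbers are reproduced directly:

```python
# Worked example of calculate_token_allocation() for a 200,000-token context window
# (illustrative numbers only).
total = 200_000
content = int(total * 0.6)                         # 120,000 tokens for content
response = int(total * 0.4)                        #  80,000 tokens reserved for the response
files = int(content * 0.3)                         #  36,000 tokens for newest file versions
history = int(content * 0.5)                       #  60,000 tokens for conversation history
available_for_prompt = content - files - history   #  24,000 tokens left for the prompt
```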

--------------------------------------------------------------------------------
/tools/challenge.py:
--------------------------------------------------------------------------------

```python
"""
Challenge tool - Encourages critical thinking and thoughtful disagreement

This tool takes a user's statement and returns it wrapped in instructions that
encourage the CLI agent to challenge ideas and think critically before agreeing. It helps
avoid reflexive agreement by prompting deeper analysis and genuine evaluation.

This is a simple, self-contained tool that doesn't require AI model access.
"""

from typing import TYPE_CHECKING, Any, Optional

from pydantic import Field

if TYPE_CHECKING:
    from tools.models import ToolModelCategory

from config import TEMPERATURE_ANALYTICAL
from tools.shared.base_models import ToolRequest
from tools.shared.exceptions import ToolExecutionError

from .simple.base import SimpleTool

# Field descriptions for the Challenge tool
CHALLENGE_FIELD_DESCRIPTIONS = {
    "prompt": (
        "Statement to scrutinize. If you invoke `challenge` manually, strip the word 'challenge' and pass just the statement. "
        "Automatic invocations send the full user message as-is; do not modify it."
    ),
}


class ChallengeRequest(ToolRequest):
    """Request model for Challenge tool"""

    prompt: str = Field(..., description=CHALLENGE_FIELD_DESCRIPTIONS["prompt"])


class ChallengeTool(SimpleTool):
    """
    Challenge tool for encouraging critical thinking and avoiding automatic agreement.

    This tool wraps user statements in instructions that encourage the CLI agent to:
    - Challenge ideas and think critically before responding
    - Evaluate whether they actually agree or disagree
    - Provide thoughtful analysis rather than reflexive agreement

    The tool is self-contained and doesn't require AI model access - it simply
    transforms the input prompt into a structured critical thinking challenge.
    """

    def get_name(self) -> str:
        return "challenge"

    def get_description(self) -> str:
        return (
            "Prevents reflexive agreement by forcing critical thinking and reasoned analysis when a statement is challenged. "
            "Trigger automatically when a user critically questions, disagrees or appears to push back on earlier answers, and use it manually to sanity-check contentious claims."
        )

    def get_system_prompt(self) -> str:
        # Challenge tool doesn't need a system prompt since it doesn't call AI
        return ""

    def get_default_temperature(self) -> float:
        return TEMPERATURE_ANALYTICAL

    def get_model_category(self) -> "ToolModelCategory":
        """Challenge doesn't need a model category since it doesn't use AI"""
        from tools.models import ToolModelCategory

        return ToolModelCategory.FAST_RESPONSE  # Default, but not used

    def requires_model(self) -> bool:
        """
        Challenge tool doesn't require model resolution at the MCP boundary.

        Like the planner tool, this is a pure data processing tool that transforms
        the input without calling external AI models.

        Returns:
            bool: False - challenge doesn't need AI model access
        """
        return False

    def get_request_model(self):
        """Return the Challenge-specific request model"""
        return ChallengeRequest

    def get_input_schema(self) -> dict[str, Any]:
        """
        Generate input schema for the challenge tool.

        Since this tool doesn't require a model, we exclude model-related fields.
        """
        schema = {
            "type": "object",
            "properties": {
                "prompt": {
                    "type": "string",
                    "description": CHALLENGE_FIELD_DESCRIPTIONS["prompt"],
                },
            },
            "required": ["prompt"],
        }

        return schema

    async def execute(self, arguments: dict[str, Any]) -> list:
        """
        Execute the challenge tool by wrapping the prompt in critical thinking instructions.

        This is the main execution method that transforms the user's statement into
        a structured challenge that encourages thoughtful re-evaluation.
        """
        import json

        from mcp.types import TextContent

        try:
            # Validate request
            request = self.get_request_model()(**arguments)

            # Wrap the prompt in challenge instructions
            wrapped_prompt = self._wrap_prompt_for_challenge(request.prompt)

            # Return the wrapped prompt as the response
            response_data = {
                "status": "challenge_accepted",
                "original_statement": request.prompt,
                "challenge_prompt": wrapped_prompt,
                "instructions": (
                    "Present the challenge_prompt to yourself and follow its instructions. "
                    "Reassess the statement carefully and critically before responding. "
                    "If, after reflection, you find reasons to disagree or qualify it, explain your reasoning. "
                    "Likewise, if you find reasons to agree, articulate them clearly and justify your agreement."
                ),
            }

            return [TextContent(type="text", text=json.dumps(response_data, indent=2, ensure_ascii=False))]

        except ToolExecutionError:
            raise
        except Exception as e:
            import logging

            logger = logging.getLogger(__name__)
            logger.error(f"Error in challenge tool execution: {e}", exc_info=True)

            error_data = {
                "status": "error",
                "error": str(e),
                "content": f"Failed to create challenge prompt: {str(e)}",
            }

            raise ToolExecutionError(json.dumps(error_data, ensure_ascii=False)) from e

    def _wrap_prompt_for_challenge(self, prompt: str) -> str:
        """
        Wrap the user's statement in instructions that encourage critical challenge.

        Args:
            prompt: The original user statement to wrap

        Returns:
            The statement wrapped in challenge instructions
        """
        return (
            f"CRITICAL REASSESSMENT – Do not automatically agree:\n\n"
            f'"{prompt}"\n\n'
            f"Carefully evaluate the statement above. Is it accurate, complete, and well-reasoned? "
            f"Investigate if needed before replying, and stay focused. If you identify flaws, gaps, or misleading "
            f"points, explain them clearly. Likewise, if you find the reasoning sound, explain why it holds up. "
            f"Respond with thoughtful analysis—stay to the point and avoid reflexive agreement."
        )

    # Required method implementations from SimpleTool

    async def prepare_prompt(self, request: ChallengeRequest) -> str:
        """Not used since challenge doesn't call AI models"""
        return ""

    def format_response(self, response: str, request: ChallengeRequest, model_info: Optional[dict] = None) -> str:
        """Not used since challenge doesn't call AI models"""
        return response

    def get_tool_fields(self) -> dict[str, dict[str, Any]]:
        """Tool-specific field definitions for Challenge"""
        return {
            "prompt": {
                "type": "string",
                "description": CHALLENGE_FIELD_DESCRIPTIONS["prompt"],
            },
        }

    def get_required_fields(self) -> list[str]:
        """Required fields for Challenge tool"""
        return ["prompt"]

```
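
A minimal invocation sketch for the tool above; since `execute()` only wraps the prompt, no model provider or API key is needed (assuming `ChallengeTool` can be constructed with no arguments, like the other tools in this repository):

```python
# Run the challenge tool directly and inspect the wrapped prompt it returns.
import asyncio
import json

from tools.challenge import ChallengeTool

blocks = asyncio.run(ChallengeTool().execute({"prompt": "Python is always faster than C."}))
payload = json.loads(blocks[0].text)
print(payload["status"])            # "challenge_accepted"
print(payload["challenge_prompt"])  # the statement wrapped in critical-thinking instructions
```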

--------------------------------------------------------------------------------
/tests/test_precommit_workflow.py:
--------------------------------------------------------------------------------

```python
"""
Unit tests for the workflow-based PrecommitTool

Tests the core functionality of the precommit workflow tool including:
- Tool metadata and configuration
- Request model validation
- Workflow step handling
- Tool categorization
"""

import pytest

from tools.models import ToolModelCategory
from tools.precommit import PrecommitRequest, PrecommitTool


class TestPrecommitWorkflowTool:
    """Test suite for the workflow-based PrecommitTool"""

    def test_tool_metadata(self):
        """Test basic tool metadata"""
        tool = PrecommitTool()

        assert tool.get_name() == "precommit"
        assert "git changes" in tool.get_description()
        assert "systematic analysis" in tool.get_description()

    def test_tool_model_category(self):
        """Test that precommit tool uses extended reasoning category"""
        tool = PrecommitTool()
        assert tool.get_model_category() == ToolModelCategory.EXTENDED_REASONING

    def test_default_temperature(self):
        """Test analytical temperature setting"""
        tool = PrecommitTool()
        temp = tool.get_default_temperature()
        # Should be analytical temperature (0.2)
        assert temp == 0.2

    def test_request_model_basic_validation(self):
        """Test basic request model validation"""
        # Valid minimal workflow request
        request = PrecommitRequest(
            step="Initial validation step",
            step_number=1,
            total_steps=3,
            next_step_required=True,
            findings="Initial findings",
            path="/test/repo",  # Required for step 1
        )

        assert request.step == "Initial validation step"
        assert request.step_number == 1
        assert request.total_steps == 3
        assert request.next_step_required is True
        assert request.findings == "Initial findings"
        assert request.path == "/test/repo"

    def test_request_model_step_one_validation(self):
        """Test that step 1 requires path field"""
        # Step 1 without path should fail
        with pytest.raises(ValueError, match="Step 1 requires 'path' field"):
            PrecommitRequest(
                step="Initial validation step",
                step_number=1,
                total_steps=3,
                next_step_required=True,
                findings="Initial findings",
                # Missing path for step 1
            )

    def test_request_model_later_steps_no_path_required(self):
        """Test that later steps don't require path"""
        # Step 2+ without path should be fine
        request = PrecommitRequest(
            step="Continued validation",
            step_number=2,
            total_steps=3,
            next_step_required=True,
            findings="Detailed findings",
            # No path needed for step 2+
        )

        assert request.step_number == 2
        assert request.path is None

    def test_request_model_optional_fields(self):
        """Test optional workflow fields"""
        request = PrecommitRequest(
            step="Validation with optional fields",
            step_number=1,
            total_steps=2,
            next_step_required=False,
            findings="Comprehensive findings",
            path="/test/repo",
            precommit_type="external",
            files_checked=["/file1.py", "/file2.py"],
            relevant_files=["/file1.py"],
            relevant_context=["function_name", "class_name"],
            issues_found=[{"severity": "medium", "description": "Test issue"}],
            images=["/screenshot.png"],
        )

        assert request.precommit_type == "external"
        assert len(request.files_checked) == 2
        assert len(request.relevant_files) == 1
        assert len(request.relevant_context) == 2
        assert len(request.issues_found) == 1
        assert len(request.images) == 1

    def test_precommit_specific_fields(self):
        """Test precommit-specific configuration fields"""
        request = PrecommitRequest(
            step="Validation with git config",
            step_number=1,
            total_steps=1,
            next_step_required=False,
            findings="Complete validation",
            path="/repo",
            compare_to="main",
            include_staged=True,
            include_unstaged=False,
            focus_on="security issues",
            severity_filter="high",
        )

        assert request.compare_to == "main"
        assert request.include_staged is True
        assert request.include_unstaged is False
        assert request.focus_on == "security issues"
        assert request.severity_filter == "high"

    def test_precommit_type_validation(self):
        """Test precommit type validation"""
        valid_types = ["external", "internal"]

        for precommit_type in valid_types:
            request = PrecommitRequest(
                step="Test precommit type",
                step_number=1,
                total_steps=1,
                next_step_required=False,
                findings="Test findings",
                path="/repo",
                precommit_type=precommit_type,
            )
            assert request.precommit_type == precommit_type

        # Test default is external
        request = PrecommitRequest(
            step="Test default type",
            step_number=1,
            total_steps=1,
            next_step_required=False,
            findings="Test findings",
            path="/repo",
        )
        assert request.precommit_type == "external"

    def test_severity_filter_options(self):
        """Test severity filter validation"""
        valid_severities = ["critical", "high", "medium", "low", "all"]

        for severity in valid_severities:
            request = PrecommitRequest(
                step="Test severity filter",
                step_number=1,
                total_steps=1,
                next_step_required=False,
                findings="Test findings",
                path="/repo",
                severity_filter=severity,
            )
            assert request.severity_filter == severity

    def test_input_schema_generation(self):
        """Test that input schema is generated correctly"""
        tool = PrecommitTool()
        schema = tool.get_input_schema()

        # Check basic schema structure
        assert schema["type"] == "object"
        assert "properties" in schema
        assert "required" in schema

        # Check required fields are present
        required_fields = {"step", "step_number", "total_steps", "next_step_required", "findings"}
        assert all(field in schema["properties"] for field in required_fields)

        # Check model field is present and configured correctly
        assert "model" in schema["properties"]
        assert schema["properties"]["model"]["type"] == "string"

    def test_workflow_request_model_method(self):
        """Test get_workflow_request_model returns correct model"""
        tool = PrecommitTool()
        assert tool.get_workflow_request_model() == PrecommitRequest
        assert tool.get_request_model() == PrecommitRequest

    def test_system_prompt_integration(self):
        """Test system prompt integration"""
        tool = PrecommitTool()
        system_prompt = tool.get_system_prompt()

        # Should get the precommit prompt
        assert isinstance(system_prompt, str)
        assert len(system_prompt) > 0

```
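
As the step-1 validation test implies, only the first step of a workflow needs `path`; a brief sketch of a two-step request sequence under that assumption, with field names taken from the tests above:

```python
# Two-step precommit workflow request sequence (illustrative values).
from tools.precommit import PrecommitRequest

step1 = PrecommitRequest(
    step="Collect git diff and outline the validation plan",
    step_number=1,
    total_steps=2,
    next_step_required=True,
    findings="Repository located, staged changes detected",
    path="/repo",  # required only for step 1
)
step2 = PrecommitRequest(
    step="Review the changed files in depth",
    step_number=2,
    total_steps=2,
    next_step_required=False,
    findings="No blocking issues found",
)
```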

--------------------------------------------------------------------------------
/tests/test_chat_cross_model_continuation.py:
--------------------------------------------------------------------------------

```python
"""Cross-provider continuation tests for ChatTool."""

from __future__ import annotations

import json
import os
import re
import uuid
from pathlib import Path

import pytest

from providers.registry import ModelProviderRegistry
from providers.shared import ProviderType
from tests.transport_helpers import inject_transport
from tools.chat import ChatTool

CASSETTE_DIR = Path(__file__).parent / "openai_cassettes"
CASSETTE_DIR.mkdir(exist_ok=True)
OPENAI_CASSETTE_PATH = CASSETTE_DIR / "chat_cross_step2_gpt5_reminder.json"

GEMINI_CASSETTE_DIR = Path(__file__).parent / "gemini_cassettes"
GEMINI_CASSETTE_DIR.mkdir(exist_ok=True)
GEMINI_REPLAY_ID = "chat_cross/step1_gemini25_flash_number/mldev"
GEMINI_REPLAY_PATH = GEMINI_CASSETTE_DIR / "chat_cross" / "step1_gemini25_flash_number" / "mldev.json"

FIXED_THREAD_ID = uuid.UUID("dbadc23e-c0f4-4853-982f-6c5bc722b5de")


WORD_TO_NUMBER = {
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5,
    "six": 6,
    "seven": 7,
    "eight": 8,
    "nine": 9,
    "ten": 10,
}


def _extract_number(text: str) -> str:
    digit_match = re.search(r"\b(\d{1,2})\b", text)
    if digit_match:
        return digit_match.group(1)

    lower_text = text.lower()
    for word, value in WORD_TO_NUMBER.items():
        if re.search(rf"\b{word}\b", lower_text):
            return str(value)
    return ""


@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_chat_cross_model_continuation(monkeypatch, tmp_path):
    """Verify continuation across Gemini then OpenAI using recorded interactions."""

    env_updates = {
        "DEFAULT_MODEL": "auto",
        "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""),
        "GEMINI_API_KEY": os.getenv("GEMINI_API_KEY", ""),
    }
    keys_to_clear = [
        "XAI_API_KEY",
        "OPENROUTER_API_KEY",
        "ANTHROPIC_API_KEY",
        "MISTRAL_API_KEY",
        "CUSTOM_API_KEY",
        "CUSTOM_API_URL",
    ]

    recording_mode = not OPENAI_CASSETTE_PATH.exists() or not GEMINI_REPLAY_PATH.exists()
    if recording_mode:
        openai_key = env_updates["OPENAI_API_KEY"].strip()
        gemini_key = env_updates["GEMINI_API_KEY"].strip()
        if (not openai_key or openai_key.startswith("dummy")) or (not gemini_key or gemini_key.startswith("dummy")):
            pytest.skip(
                "Cross-provider cassette missing and OPENAI_API_KEY/GEMINI_API_KEY not configured. Provide real keys to record."
            )

    GEMINI_REPLAY_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Step 1 – Gemini picks a number
    with monkeypatch.context() as m:
        m.setenv("DEFAULT_MODEL", env_updates["DEFAULT_MODEL"])
        m.setenv("GOOGLE_ALLOWED_MODELS", "gemini-2.5-flash")
        m.setenv("OPENAI_ALLOWED_MODELS", "gpt-5")
        if recording_mode:
            m.setenv("OPENAI_API_KEY", env_updates["OPENAI_API_KEY"])
            m.setenv("GEMINI_API_KEY", env_updates["GEMINI_API_KEY"])
            m.setenv("GOOGLE_GENAI_CLIENT_MODE", "record")
        else:
            m.setenv("OPENAI_API_KEY", "dummy-key-for-replay")
            m.setenv("GEMINI_API_KEY", "dummy-key-for-replay")
            m.setenv("GOOGLE_GENAI_CLIENT_MODE", "replay")

        m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(GEMINI_CASSETTE_DIR))
        m.setenv("GOOGLE_GENAI_REPLAY_ID", GEMINI_REPLAY_ID)

        for key in keys_to_clear:
            m.delenv(key, raising=False)

        ModelProviderRegistry.reset_for_testing()
        from providers.gemini import GeminiModelProvider
        from providers.openai import OpenAIModelProvider

        ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)

        from utils import conversation_memory

        m.setattr(conversation_memory.uuid, "uuid4", lambda: FIXED_THREAD_ID)

        chat_tool = ChatTool()
        working_directory = str(tmp_path)

        step1_args = {
            "prompt": "Pick a number between 1 and 10 and respond with JUST that number.",
            "model": "gemini-2.5-flash",
            "temperature": 0.2,
            "working_directory_absolute_path": working_directory,
        }

        step1_result = await chat_tool.execute(step1_args)
        assert step1_result and step1_result[0].type == "text"

        step1_data = json.loads(step1_result[0].text)
        assert step1_data["status"] in {"success", "continuation_available"}
        assert step1_data.get("metadata", {}).get("provider_used") == "google"
        continuation_offer = step1_data.get("continuation_offer")
        assert continuation_offer is not None
        continuation_id = continuation_offer["continuation_id"]
        assert continuation_id

        chosen_number = _extract_number(step1_data["content"])
        assert chosen_number.isdigit()
        assert 1 <= int(chosen_number) <= 10

        # Ensure replay is flushed for Gemini recordings
        gemini_provider = ModelProviderRegistry.get_provider_for_model("gemini-2.5-flash")
        if gemini_provider is not None:
            try:
                client = gemini_provider.client
                if hasattr(client, "close"):
                    client.close()
            finally:
                if hasattr(gemini_provider, "_client"):
                    gemini_provider._client = None

    assert GEMINI_REPLAY_PATH.exists()

    # Step 2 – gpt-5 recalls the number via continuation
    with monkeypatch.context() as m:
        if recording_mode:
            m.setenv("OPENAI_API_KEY", env_updates["OPENAI_API_KEY"])
            m.setenv("GEMINI_API_KEY", env_updates["GEMINI_API_KEY"])
            m.setenv("GOOGLE_GENAI_CLIENT_MODE", "record")
        else:
            m.setenv("OPENAI_API_KEY", "dummy-key-for-replay")
            m.setenv("GEMINI_API_KEY", "dummy-key-for-replay")
            m.setenv("GOOGLE_GENAI_CLIENT_MODE", "replay")

        m.setenv("DEFAULT_MODEL", env_updates["DEFAULT_MODEL"])
        m.setenv("GOOGLE_ALLOWED_MODELS", "gemini-2.5-flash")
        m.setenv("OPENAI_ALLOWED_MODELS", "gpt-5")
        m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(GEMINI_CASSETTE_DIR))
        m.setenv("GOOGLE_GENAI_REPLAY_ID", GEMINI_REPLAY_ID)
        for key in keys_to_clear:
            m.delenv(key, raising=False)

        ModelProviderRegistry.reset_for_testing()
        from providers.gemini import GeminiModelProvider
        from providers.openai import OpenAIModelProvider

        ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)

        inject_transport(monkeypatch, OPENAI_CASSETTE_PATH)

        chat_tool = ChatTool()
        step2_args = {
            "prompt": "Remind me, what number did you pick, respond with JUST that number.",
            "model": "gpt-5",
            "continuation_id": continuation_id,
            "temperature": 0.2,
            "working_directory_absolute_path": working_directory,
        }

        step2_result = await chat_tool.execute(step2_args)
        assert step2_result and step2_result[0].type == "text"

        step2_data = json.loads(step2_result[0].text)
        assert step2_data["status"] in {"success", "continuation_available"}
        assert step2_data.get("metadata", {}).get("provider_used") == "openai"

        recalled_number = _extract_number(step2_data["content"])
        assert recalled_number == chosen_number

    assert OPENAI_CASSETTE_PATH.exists()

    ModelProviderRegistry.reset_for_testing()

```
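
A note on the cassette gating used in this test: when either recorded fixture is missing, the test requires real API keys and records fresh interactions; otherwise it replays with dummy keys. A minimal sketch of that decision, reusing the names defined above (illustrative condensation only):

```python
# Illustrative condensation of the gating logic in the test above.
recording_mode = not OPENAI_CASSETTE_PATH.exists() or not GEMINI_REPLAY_PATH.exists()
client_mode = "record" if recording_mode else "replay"  # value for GOOGLE_GENAI_CLIENT_MODE
gemini_key = os.environ["GEMINI_API_KEY"] if recording_mode else "dummy-key-for-replay"
```

Deleting either cassette file and re-running with real OPENAI_API_KEY and GEMINI_API_KEY therefore re-records the fixtures; without real keys the test skips instead.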

--------------------------------------------------------------------------------
/tests/test_auto_mode_model_listing.py:
--------------------------------------------------------------------------------

```python
"""Tests covering model restriction-aware error messaging in auto mode."""

import asyncio
import importlib
import json

import pytest

import utils.env as env_config
import utils.model_restrictions as model_restrictions
from providers.gemini import GeminiModelProvider
from providers.openai import OpenAIModelProvider
from providers.openrouter import OpenRouterProvider
from providers.registry import ModelProviderRegistry
from providers.shared import ProviderType
from providers.xai import XAIModelProvider
from tools.shared.exceptions import ToolExecutionError


def _extract_available_models(message: str) -> list[str]:
    """Parse the available model list from the error message."""

    marker = "Available models: "
    if marker not in message:
        raise AssertionError(f"Expected '{marker}' in message: {message}")

    start = message.index(marker) + len(marker)
    end = message.find(". Suggested", start)
    if end == -1:
        end = len(message)

    available_segment = message[start:end].strip()
    if not available_segment:
        return []

    return [item.strip() for item in available_segment.split(",")]


@pytest.fixture
def reset_registry():
    """Ensure registry and restriction service state is isolated."""

    ModelProviderRegistry.reset_for_testing()
    model_restrictions._restriction_service = None
    env_config.reload_env()
    yield
    ModelProviderRegistry.reset_for_testing()
    model_restrictions._restriction_service = None


def _register_core_providers(*, include_xai: bool = False):
    ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
    ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
    ModelProviderRegistry.register_provider(ProviderType.OPENROUTER, OpenRouterProvider)
    if include_xai:
        ModelProviderRegistry.register_provider(ProviderType.XAI, XAIModelProvider)


@pytest.mark.no_mock_provider
def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry):
    """Error payload should surface only the allowed models for each provider."""

    monkeypatch.setenv("DEFAULT_MODEL", "auto")
    monkeypatch.setenv("GEMINI_API_KEY", "test-gemini")
    monkeypatch.setenv("OPENAI_API_KEY", "test-openai")
    monkeypatch.setenv("OPENROUTER_API_KEY", "test-openrouter")
    monkeypatch.delenv("XAI_API_KEY", raising=False)
    # Ensure Azure provider stays disabled regardless of developer workstation env
    for azure_var in (
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_ALLOWED_MODELS",
        "AZURE_MODELS_CONFIG_PATH",
    ):
        monkeypatch.delenv(azure_var, raising=False)
    monkeypatch.setenv("ZEN_MCP_FORCE_ENV_OVERRIDE", "false")
    env_config.reload_env({"ZEN_MCP_FORCE_ENV_OVERRIDE": "false"})
    try:
        import dotenv

        monkeypatch.setattr(dotenv, "dotenv_values", lambda *_args, **_kwargs: {"ZEN_MCP_FORCE_ENV_OVERRIDE": "false"})
    except ModuleNotFoundError:
        pass

    monkeypatch.setenv("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro")
    monkeypatch.setenv("OPENAI_ALLOWED_MODELS", "gpt-5")
    monkeypatch.setenv("OPENROUTER_ALLOWED_MODELS", "gpt5nano")
    monkeypatch.setenv("XAI_ALLOWED_MODELS", "")

    import config

    importlib.reload(config)

    _register_core_providers()

    import server

    importlib.reload(server)

    # Reload may have re-applied .env overrides; enforce our test configuration
    for key, value in (
        ("DEFAULT_MODEL", "auto"),
        ("GEMINI_API_KEY", "test-gemini"),
        ("OPENAI_API_KEY", "test-openai"),
        ("OPENROUTER_API_KEY", "test-openrouter"),
        ("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro"),
        ("OPENAI_ALLOWED_MODELS", "gpt-5"),
        ("OPENROUTER_ALLOWED_MODELS", "gpt5nano"),
        ("XAI_ALLOWED_MODELS", ""),
    ):
        monkeypatch.setenv(key, value)

    for var in ("XAI_API_KEY", "CUSTOM_API_URL", "CUSTOM_API_KEY", "DIAL_API_KEY"):
        monkeypatch.delenv(var, raising=False)
    for azure_var in (
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_ALLOWED_MODELS",
        "AZURE_MODELS_CONFIG_PATH",
    ):
        monkeypatch.delenv(azure_var, raising=False)

    ModelProviderRegistry.reset_for_testing()
    model_restrictions._restriction_service = None
    server.configure_providers()

    with pytest.raises(ToolExecutionError) as exc_info:
        asyncio.run(
            server.handle_call_tool(
                "chat",
                {
                    "model": "gpt5mini",
                    "prompt": "Tell me about your strengths",
                },
            )
        )

    payload = json.loads(exc_info.value.payload)
    assert payload["status"] == "error"

    available_models = _extract_available_models(payload["content"])
    assert set(available_models) == {"gemini-2.5-pro", "gpt-5", "gpt5nano", "openai/gpt-5-nano"}


@pytest.mark.no_mock_provider
def test_error_listing_without_restrictions_shows_full_catalog(monkeypatch, reset_registry):
    """When no restrictions are set, the full high-capability catalogue should appear."""

    monkeypatch.setenv("DEFAULT_MODEL", "auto")
    monkeypatch.setenv("GEMINI_API_KEY", "test-gemini")
    monkeypatch.setenv("OPENAI_API_KEY", "test-openai")
    monkeypatch.setenv("OPENROUTER_API_KEY", "test-openrouter")
    monkeypatch.setenv("XAI_API_KEY", "test-xai")
    monkeypatch.setenv("ZEN_MCP_FORCE_ENV_OVERRIDE", "false")
    for azure_var in (
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_ALLOWED_MODELS",
        "AZURE_MODELS_CONFIG_PATH",
    ):
        monkeypatch.delenv(azure_var, raising=False)
    env_config.reload_env({"ZEN_MCP_FORCE_ENV_OVERRIDE": "false"})
    try:
        import dotenv

        monkeypatch.setattr(dotenv, "dotenv_values", lambda *_args, **_kwargs: {"ZEN_MCP_FORCE_ENV_OVERRIDE": "false"})
    except ModuleNotFoundError:
        pass

    for var in (
        "GOOGLE_ALLOWED_MODELS",
        "OPENAI_ALLOWED_MODELS",
        "OPENROUTER_ALLOWED_MODELS",
        "XAI_ALLOWED_MODELS",
        "DIAL_ALLOWED_MODELS",
    ):
        monkeypatch.delenv(var, raising=False)

    import config

    importlib.reload(config)

    _register_core_providers(include_xai=True)

    import server

    importlib.reload(server)

    for key, value in (
        ("DEFAULT_MODEL", "auto"),
        ("GEMINI_API_KEY", "test-gemini"),
        ("OPENAI_API_KEY", "test-openai"),
        ("OPENROUTER_API_KEY", "test-openrouter"),
    ):
        monkeypatch.setenv(key, value)

    for var in (
        "GOOGLE_ALLOWED_MODELS",
        "OPENAI_ALLOWED_MODELS",
        "OPENROUTER_ALLOWED_MODELS",
        "XAI_ALLOWED_MODELS",
        "DIAL_ALLOWED_MODELS",
        "CUSTOM_API_URL",
        "CUSTOM_API_KEY",
    ):
        monkeypatch.delenv(var, raising=False)

    ModelProviderRegistry.reset_for_testing()
    model_restrictions._restriction_service = None
    server.configure_providers()

    with pytest.raises(ToolExecutionError) as exc_info:
        asyncio.run(
            server.handle_call_tool(
                "chat",
                {
                    "model": "dummymodel",
                    "prompt": "Hi there",
                },
            )
        )

    payload = json.loads(exc_info.value.payload)
    assert payload["status"] == "error"

    available_models = _extract_available_models(payload["content"])
    assert "gemini-2.5-pro" in available_models
    assert "gpt-5" in available_models
    assert "grok-4" in available_models
    assert len(available_models) >= 5

```
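
For reference, a hedged example of the error-message shape `_extract_available_models` expects; the wording outside the "Available models:" segment is an assumption for illustration, not quoted from the server:

```python
# Illustrative only: text around the marker is assumed; parsing follows the helper above.
sample = "Model 'gpt5mini' is not available. Available models: gemini-2.5-pro, gpt-5, gpt5nano. Suggested action: pick one."
assert _extract_available_models(sample) == ["gemini-2.5-pro", "gpt-5", "gpt5nano"]
```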

--------------------------------------------------------------------------------
/simulator_tests/test_prompt_size_limit_bug.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
Prompt Size Limit Bug Test

This test reproduces a critical bug where the prompt size limit check
incorrectly includes conversation history when validating incoming prompts
from Claude to MCP. The limit should ONLY apply to the actual prompt text
sent by the user, not the entire conversation context.

Bug Scenario:
- User starts a conversation with chat tool
- Continues conversation multiple times (building up history)
- On subsequent continuation, a short prompt (150 chars) triggers
  "resend_prompt" error claiming >50k characters

Expected Behavior:
- Only count the actual prompt parameter for size limit
- Conversation history should NOT count toward prompt size limit
- Only the user's actual input should be validated against 50k limit
"""

from .conversation_base_test import ConversationBaseTest


class PromptSizeLimitBugTest(ConversationBaseTest):
    """Test to reproduce and verify fix for prompt size limit bug"""

    @property
    def test_name(self) -> str:
        return "prompt_size_limit_bug"

    @property
    def test_description(self) -> str:
        return "Reproduce prompt size limit bug with conversation continuation"

    def run_test(self) -> bool:
        """Test prompt size limit bug reproduction using in-process calls"""
        try:
            self.logger.info("🐛 Test: Prompt size limit bug reproduction (in-process)")

            # Setup test environment
            self.setUp()

            # Create a test file to provide context
            test_file_content = """
# Test SwiftUI-like Framework Implementation

struct ContentView: View {
    @State private var counter = 0

    var body: some View {
        VStack {
            Text("Count: \\(counter)")
            Button("Increment") {
                counter += 1
            }
        }
    }
}

class Renderer {
    static let shared = Renderer()

    func render(view: View) {
        // Implementation details for UIKit/AppKit rendering
    }
}

protocol View {
    var body: some View { get }
}
"""
            test_file_path = self.create_additional_test_file("SwiftFramework.swift", test_file_content)

            # Step 1: Start initial conversation
            self.logger.info("  Step 1: Start conversation with initial context")

            initial_prompt = "I'm building a SwiftUI-like framework. Can you help me design the architecture?"

            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": initial_prompt,
                    "absolute_file_paths": [test_file_path],
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("  ❌ Failed to start initial conversation")
                return False

            self.logger.info(f"  ✅ Initial conversation started: {continuation_id[:8]}...")

            # Step 2: Continue conversation multiple times to build substantial history
            conversation_prompts = [
                "That's helpful! Can you elaborate on the View protocol design?",
                "How should I implement the State property wrapper?",
                "What's the best approach for the VStack layout implementation?",
                "Should I use UIKit directly or create an abstraction layer?",
                "Smart approach! For the rendering layer, would you suggest UIKit/AppKit directly?",
            ]

            for i, prompt in enumerate(conversation_prompts, 2):
                self.logger.info(f"  Step {i}: Continue conversation (exchange {i})")

                response, _ = self.call_mcp_tool_direct(
                    "chat",
                    {
                        "prompt": prompt,
                        "continuation_id": continuation_id,
                        "model": "flash",
                    },
                )

                if not response:
                    self.logger.error(f"  ❌ Failed at exchange {i}")
                    return False

                self.logger.info(f"  ✅ Exchange {i} completed")

            # Step 3: Send short prompt that should NOT trigger size limit
            self.logger.info("  Step 7: Send short prompt (should NOT trigger size limit)")

            # This is a very short prompt - should not trigger the bug after fix
            short_prompt = "Thanks! This gives me a solid foundation to start prototyping."

            self.logger.info(f"     Short prompt length: {len(short_prompt)} characters")

            response_final, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": short_prompt,
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )

            if not response_final:
                self.logger.error("  ❌ Final short prompt failed")
                return False

            # Parse the response to check for the bug
            import json

            try:
                response_data = json.loads(response_final)
                status = response_data.get("status", "")

                if status == "resend_prompt":
                    # This is the bug! Short prompt incorrectly triggering size limit
                    metadata = response_data.get("metadata", {})
                    prompt_size = metadata.get("prompt_size", 0)

                    self.logger.error(
                        f"  🐛 BUG STILL EXISTS: Short prompt ({len(short_prompt)} chars) triggered resend_prompt"
                    )
                    self.logger.error(f"     Reported prompt_size: {prompt_size} (should be ~{len(short_prompt)})")
                    self.logger.error("     This indicates conversation history is still being counted")

                    return False  # Bug still exists

                elif status in ["success", "continuation_available"]:
                    self.logger.info("  ✅ Short prompt processed correctly - bug appears to be FIXED!")
                    self.logger.info(f"     Prompt length: {len(short_prompt)} chars, Status: {status}")
                    return True

                else:
                    self.logger.warning(f"  ⚠️ Unexpected status: {status}")
                    # Check if this might be a non-JSON response (successful execution)
                    if len(response_final) > 0 and not response_final.startswith('{"'):
                        self.logger.info("  ✅ Non-JSON response suggests successful tool execution")
                        return True
                    return False

            except json.JSONDecodeError:
                # Non-JSON response often means successful tool execution
                self.logger.info("  ✅ Non-JSON response suggests successful tool execution (bug likely fixed)")
                self.logger.debug(f"     Response preview: {response_final[:200]}...")
                return True

        except Exception as e:
            self.logger.error(f"Prompt size limit bug test failed: {e}")
            import traceback

            self.logger.debug(f"Full traceback: {traceback.format_exc()}")
            return False


def main():
    """Run the prompt size limit bug test"""
    import sys

    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    test = PromptSizeLimitBugTest(verbose=verbose)

    success = test.run_test()
    if success:
        print("Bug reproduction test completed - check logs for details")
    else:
        print("Test failed to complete")
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()

```

--------------------------------------------------------------------------------
/systemprompts/debug_prompt.py:
--------------------------------------------------------------------------------

```python
"""
Debug tool system prompt
"""

DEBUG_ISSUE_PROMPT = """
ROLE
You are an expert debugging assistant receiving systematic investigation findings from another AI agent.
The agent has performed methodical investigation work following systematic debugging methodology.
Your role is to provide expert analysis based on the comprehensive investigation presented to you.

SYSTEMATIC INVESTIGATION CONTEXT
The agent has followed a systematic investigation approach:
1. Methodical examination of error reports and symptoms
2. Step-by-step code analysis and evidence collection
3. Use of tracer tool for complex method interactions when needed
4. Hypothesis formation and testing against actual code
5. Documentation of findings and investigation evolution

You are receiving:
1. Issue description and original symptoms
2. The agent's systematic investigation findings (comprehensive analysis)
3. Essential files identified as critical for understanding the issue
4. Error context, logs, and diagnostic information
5. Tracer tool analysis results (if complex flow analysis was needed)

TRACER TOOL INTEGRATION AWARENESS
If the agent used the tracer tool during investigation, the findings will include:
- Method call flow analysis
- Class dependency mapping
- Side effect identification
- Execution path tracing
This provides deep understanding of how code interactions contribute to the issue.

CRITICAL LINE NUMBER INSTRUCTIONS
Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be
included in any code you generate. Always reference specific line numbers in your replies so that exact
positions can be located when needed, and include a very short code excerpt alongside for clarity.
Include context_start_text and context_end_text as backup references. Never include "LINE│" markers in generated code
snippets.

WORKFLOW CONTEXT
Your task is to analyze the systematic investigation given to you and provide expert debugging analysis back to the
agent, who will then present the findings to the user in a consolidated format.

STRUCTURED JSON OUTPUT FORMAT
You MUST respond with a properly formatted JSON object following this exact schema.
Do NOT include any text before or after the JSON. The response must be valid JSON only.

IF MORE INFORMATION IS NEEDED:
If you lack critical information to proceed, you MUST only respond with the following:
{
  "status": "files_required_to_continue",
  "mandatory_instructions": "<your critical instructions for the agent>",
  "files_needed": ["[file name here]", "[or some folder/]"]
}

IF NO BUG FOUND AFTER THOROUGH INVESTIGATION:
If after a very thorough investigation, no concrete evidence of a bug is found correlating to reported symptoms, you
MUST only respond with the following:
{
  "status": "no_bug_found",
  "summary": "<summary of what was thoroughly investigated>",
  "investigation_steps": ["<step 1>", "<step 2>", "..."],
  "areas_examined": ["<code areas>", "<potential failure points>", "..."],
  "confidence_level": "High|Medium|Low",
  "alternative_explanations": ["<possible misunderstanding>", "<user expectation mismatch>", "..."],
  "recommended_questions": ["<question 1 to clarify the issue>", "<question 2 to gather more context>", "..."],
  "next_steps": ["<suggested actions to better understand the reported issue>"]
}

FOR COMPLETE ANALYSIS:
{
  "status": "analysis_complete",
  "summary": "<brief description of the problem and its impact>",
  "investigation_steps": [
    "<step 1: what you analyzed first>",
    "<step 2: what you discovered next>",
    "<step 3: how findings evolved>",
    "..."
  ],
  "hypotheses": [
    {
      "name": "<HYPOTHESIS NAME>",
      "confidence": "High|Medium|Low",
      "root_cause": "<technical explanation>",
      "evidence": "<logs or code clues supporting this hypothesis>",
      "correlation": "<how symptoms map to the cause>",
      "validation": "<quick test to confirm>",
      "minimal_fix": "<smallest change to resolve the issue>",
      "regression_check": "<why this fix is safe>",
      "file_references": ["<file:line format for exact locations>"],
      "function_name": "<optional: specific function/method name if identified>",
      "start_line": "<optional: starting line number if specific location identified>",
      "end_line": "<optional: ending line number if specific location identified>",
      "context_start_text": "<optional: exact text from start line for verification>",
      "context_end_text": "<optional: exact text from end line for verification>"
    }
  ],
  "key_findings": [
    "<finding 1: important discoveries made during analysis>",
    "<finding 2: code patterns or issues identified>",
    "<finding 3: invalidated assumptions or refined understanding>"
  ],
  "immediate_actions": [
    "<action 1: steps to take regardless of which hypothesis is correct>",
    "<action 2: additional logging or monitoring needed>"
  ],
  "recommended_tools": [
    "<tool recommendation if additional analysis needed, e.g., 'tracer tool for call flow analysis'>"
  ],
  "prevention_strategy": "<optional: targeted measures to prevent this exact issue from recurring>",
  "investigation_summary": "<comprehensive summary of the complete investigation process and final conclusions>"
}

CRITICAL DEBUGGING PRINCIPLES:
1. Bugs can ONLY be found and fixed from given code - these cannot be made up or imagined
2. Focus ONLY on the reported issue - avoid suggesting extensive refactoring or unrelated improvements
3. Propose minimal fixes that address the specific problem without introducing regressions
4. Document your investigation process systematically for future reference
5. Rank hypotheses by likelihood based on evidence from the actual code and logs provided
6. Always include specific file:line references for exact locations of issues
7. CRITICAL: If the agent's investigation finds no concrete evidence of a bug correlating to reported symptoms,
   you should consider that the reported issue may not actually exist, may be a misunderstanding, or may be
   conflated with something else entirely. In such cases, recommend gathering more information from the user
   through targeted questioning rather than continuing to hunt for non-existent bugs

PRECISE LOCATION REFERENCES:
When you identify specific code locations for hypotheses, include optional precision fields:
- function_name: The exact function/method name where the issue occurs
- start_line/end_line: Line numbers from the LINE│ markers (for reference ONLY - never include LINE│ in generated code)
- context_start_text/context_end_text: Exact text from those lines for verification
- These fields help the agent locate exact positions for implementing fixes

REGRESSION PREVENTION: Before suggesting any fix, thoroughly analyze the proposed change to ensure it does not
introduce new issues or break existing functionality. Consider:
- How the change might affect other parts of the codebase
- Whether the fix could impact related features or workflows
- If the solution maintains backward compatibility
- What potential side effects or unintended consequences might occur

Your debugging approach should generate focused hypotheses ranked by likelihood, with emphasis on identifying
the exact root cause and implementing minimal, targeted fixes while maintaining comprehensive documentation
of the investigation process.

Your analysis should build upon the agent's systematic investigation to provide:
- Expert validation of hypotheses
- Additional insights based on systematic findings
- Specific implementation guidance for fixes
- Regression prevention analysis
"""

```
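
Since the prompt requires a JSON-only reply with one of three `status` values, a caller can branch on that field. A hypothetical consumer sketch (not part of the repository; field names follow the schema above):

```python
import json


def route_debug_reply(raw: str) -> dict:
    """Hypothetical sketch: dispatch on the statuses DEBUG_ISSUE_PROMPT defines."""
    reply = json.loads(raw)  # the prompt requires valid JSON with no surrounding text
    status = reply.get("status")
    if status == "files_required_to_continue":
        return {"action": "provide_files", "files": reply.get("files_needed", [])}
    if status == "no_bug_found":
        return {"action": "ask_user", "questions": reply.get("recommended_questions", [])}
    if status == "analysis_complete":
        return {"action": "present_hypotheses", "hypotheses": reply.get("hypotheses", [])}
    raise ValueError(f"Unexpected status: {status}")
```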

--------------------------------------------------------------------------------
/tests/test_docker_config_complete.py:
--------------------------------------------------------------------------------

```python
"""
Complete configuration test for Docker MCP
"""

import os
from pathlib import Path
from unittest.mock import patch

import pytest


class TestDockerMCPConfiguration:
    """Docker MCP configuration tests"""

    def test_dockerfile_configuration(self):
        """Test Dockerfile configuration"""
        project_root = Path(__file__).parent.parent
        dockerfile = project_root / "Dockerfile"

        if not dockerfile.exists():
            pytest.skip("Dockerfile not found")

        content = dockerfile.read_text()

        # Essential checks
        assert "FROM python:" in content
        assert "COPY" in content or "ADD" in content
        assert "server.py" in content

        # Recommended security checks
        security_checks = [
            "USER " in content,  # Non-root user
            "WORKDIR" in content,  # Defined working directory
        ]

        # At least one security practice should be present
        assert any(security_checks), "At least one security best practice (USER or WORKDIR) expected in Dockerfile"

    def test_environment_file_template(self):
        """Test environment file template"""
        project_root = Path(__file__).parent.parent
        env_example = project_root / ".env.example"

        if env_example.exists():
            content = env_example.read_text()

            # Essential variables
            essential_vars = ["GEMINI_API_KEY", "OPENAI_API_KEY", "LOG_LEVEL"]

            for var in essential_vars:
                assert f"{var}=" in content, f"Variable {var} missing"

            # Docker-specific variables should also be present
            docker_vars = ["COMPOSE_PROJECT_NAME", "TZ", "LOG_MAX_SIZE"]
            for var in docker_vars:
                assert f"{var}=" in content, f"Docker variable {var} missing"

    def test_logs_directory_setup(self):
        """Test logs directory setup"""
        project_root = Path(__file__).parent.parent
        logs_dir = project_root / "logs"

        # The logs directory should exist or be creatable
        if not logs_dir.exists():
            try:
                logs_dir.mkdir(exist_ok=True)
                created = True
            except Exception:
                created = False

            assert created, "Logs directory should be creatable"
        else:
            assert logs_dir.is_dir(), "logs should be a directory"


class TestDockerCommandValidation:
    """Docker command validation tests"""

    @patch("subprocess.run")
    def test_docker_build_command(self, mock_run):
        """Test docker build command"""
        mock_run.return_value.returncode = 0

        # Standard build command
        build_cmd = ["docker", "build", "-t", "zen-mcp-server:latest", "."]

        import subprocess

        subprocess.run(build_cmd, capture_output=True)
        mock_run.assert_called_once()

    @patch("subprocess.run")
    def test_docker_run_mcp_command(self, mock_run):
        """Test docker run command for MCP"""
        mock_run.return_value.returncode = 0

        # Run command for MCP
        run_cmd = [
            "docker",
            "run",
            "--rm",
            "-i",
            "--env-file",
            ".env",
            "-v",
            "logs:/app/logs",
            "zen-mcp-server:latest",
            "python",
            "server.py",
        ]

        import subprocess

        subprocess.run(run_cmd, capture_output=True)
        mock_run.assert_called_once()

    def test_docker_command_structure(self):
        """Test Docker command structure"""

        # Recommended MCP command
        mcp_cmd = [
            "docker",
            "run",
            "--rm",
            "-i",
            "--env-file",
            "/path/to/.env",
            "-v",
            "/path/to/logs:/app/logs",
            "zen-mcp-server:latest",
            "python",
            "server.py",
        ]

        # Structure checks
        assert mcp_cmd[0] == "docker"
        assert "run" in mcp_cmd
        assert "--rm" in mcp_cmd  # Automatic cleanup
        assert "-i" in mcp_cmd  # Interactive mode
        assert "--env-file" in mcp_cmd  # Environment variables
        assert "zen-mcp-server:latest" in mcp_cmd  # Image


class TestIntegrationChecks:
    """Integration checks"""

    def test_complete_setup_checklist(self):
        """Test complete setup checklist"""
        project_root = Path(__file__).parent.parent

        # Checklist for essential files
        essential_files = {
            "Dockerfile": project_root / "Dockerfile",
            "server.py": project_root / "server.py",
            "requirements.txt": project_root / "requirements.txt",
            "docker-compose.yml": project_root / "docker-compose.yml",
        }

        missing_files = []
        for name, path in essential_files.items():
            if not path.exists():
                missing_files.append(name)

        # Allow some missing files for flexibility
        critical_files = ["Dockerfile", "server.py"]
        missing_critical = [f for f in missing_files if f in critical_files]

        assert not missing_critical, f"Critical files missing: {missing_critical}"

    def test_mcp_integration_readiness(self):
        """Test MCP integration readiness"""
        project_root = Path(__file__).parent.parent

        # MCP integration checks
        checks = {
            "dockerfile": (project_root / "Dockerfile").exists(),
            "server_script": (project_root / "server.py").exists(),
            "logs_dir": (project_root / "logs").exists() or True,
        }

        # At least critical elements must be present
        critical_checks = ["dockerfile", "server_script"]
        missing_critical = [k for k in critical_checks if not checks[k]]

        assert not missing_critical, f"Critical elements missing: {missing_critical}"

        # Readiness score
        ready_score = sum(checks.values()) / len(checks)
        assert ready_score >= 0.75, f"Insufficient readiness score: {ready_score:.2f}"


class TestErrorHandling:
    """Error handling tests"""

    def test_missing_api_key_handling(self):
        """Test handling of missing API key"""

        # Simulate environment without API keys
        with patch.dict(os.environ, {}, clear=True):
            api_keys = [os.getenv("GEMINI_API_KEY"), os.getenv("OPENAI_API_KEY"), os.getenv("XAI_API_KEY")]

            has_api_key = any(key for key in api_keys)

            # No key should be present
            assert not has_api_key, "No API key detected (expected for test)"

            # System should handle this gracefully
            error_handled = True  # Simulate error handling
            assert error_handled, "API key error handling implemented"

    def test_docker_not_available_handling(self):
        """Test handling of Docker not available"""

        @patch("subprocess.run")
        def simulate_docker_unavailable(mock_run):
            # Simulate Docker not available
            mock_run.side_effect = FileNotFoundError("docker: command not found")

            try:
                import subprocess

                subprocess.run(["docker", "--version"], capture_output=True)
                docker_available = True
            except FileNotFoundError:
                docker_available = False

            # Docker is not available - expected error
            assert not docker_available, "Docker unavailable (simulation)"

            # System should provide a clear error message
            error_message_clear = True  # Simulation
            assert error_message_clear, "Clear Docker error message"

        simulate_docker_unavailable()


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

```
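
The "recommended MCP command" asserted above maps naturally onto an MCP client configuration entry. A hypothetical illustration (the command/args shape and the placeholder paths are assumptions, not taken from the repository):

```python
# Hypothetical illustration: the asserted Docker invocation as a command/args pair.
zen_mcp_server_entry = {
    "command": "docker",
    "args": [
        "run", "--rm", "-i",
        "--env-file", "/path/to/.env",    # environment variables (API keys, LOG_LEVEL, ...)
        "-v", "/path/to/logs:/app/logs",  # persist logs outside the container
        "zen-mcp-server:latest",
        "python", "server.py",
    ],
}
```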

--------------------------------------------------------------------------------
/utils/file_types.py:
--------------------------------------------------------------------------------

```python
"""
File type definitions and constants for file processing

This module centralizes all file type and extension definitions used
throughout the MCP server for consistent file handling.
"""

# Programming language file extensions - core code files
PROGRAMMING_LANGUAGES = {
    ".py",  # Python
    ".js",  # JavaScript
    ".ts",  # TypeScript
    ".jsx",  # React JavaScript
    ".tsx",  # React TypeScript
    ".java",  # Java
    ".cpp",  # C++
    ".c",  # C
    ".h",  # C/C++ Header
    ".hpp",  # C++ Header
    ".cs",  # C#
    ".go",  # Go
    ".rs",  # Rust
    ".rb",  # Ruby
    ".php",  # PHP
    ".swift",  # Swift
    ".kt",  # Kotlin
    ".scala",  # Scala
    ".r",  # R
    ".m",  # Objective-C
    ".mm",  # Objective-C++
}

# Script and shell file extensions
SCRIPTS = {
    ".sql",  # SQL
    ".sh",  # Shell
    ".bash",  # Bash
    ".zsh",  # Zsh
    ".fish",  # Fish shell
    ".ps1",  # PowerShell
    ".bat",  # Batch
    ".cmd",  # Command
}

# Configuration and data file extensions
CONFIGS = {
    ".yml",  # YAML
    ".yaml",  # YAML
    ".json",  # JSON
    ".xml",  # XML
    ".toml",  # TOML
    ".ini",  # INI
    ".cfg",  # Config
    ".conf",  # Config
    ".properties",  # Properties
    ".env",  # Environment
}

# Documentation and markup file extensions
DOCS = {
    ".txt",  # Text
    ".md",  # Markdown
    ".rst",  # reStructuredText
    ".tex",  # LaTeX
}

# Web development file extensions
WEB = {
    ".html",  # HTML
    ".css",  # CSS
    ".scss",  # Sass
    ".sass",  # Sass
    ".less",  # Less
}

# Additional text file extensions for logs and data
TEXT_DATA = {
    ".log",  # Log files
    ".csv",  # CSV
    ".tsv",  # TSV
    ".gitignore",  # Git ignore
    ".dockerfile",  # Dockerfile
    ".makefile",  # Make
    ".cmake",  # CMake
    ".gradle",  # Gradle
    ".sbt",  # SBT
    ".pom",  # Maven POM
    ".lock",  # Lock files
    ".changeset",  # Precommit changeset
}

# Image file extensions - limited to what AI models actually support
# Based on OpenAI and Gemini supported formats: PNG, JPEG, GIF, WebP
IMAGES = {".jpg", ".jpeg", ".png", ".gif", ".webp"}

# Binary executable and library extensions
BINARIES = {
    ".exe",  # Windows executable
    ".dll",  # Windows library
    ".so",  # Linux shared object
    ".dylib",  # macOS dynamic library
    ".bin",  # Binary
    ".class",  # Java class
}

# Archive and package file extensions
ARCHIVES = {
    ".jar",
    ".war",
    ".ear",  # Java archives
    ".zip",
    ".tar",
    ".gz",  # General archives
    ".7z",
    ".rar",  # Compression
    ".deb",
    ".rpm",  # Linux packages
    ".dmg",
    ".pkg",  # macOS packages
}

# Derived sets for different use cases
CODE_EXTENSIONS = PROGRAMMING_LANGUAGES | SCRIPTS | CONFIGS | DOCS | WEB
PROGRAMMING_EXTENSIONS = PROGRAMMING_LANGUAGES  # For line numbering
TEXT_EXTENSIONS = CODE_EXTENSIONS | TEXT_DATA
IMAGE_EXTENSIONS = IMAGES
BINARY_EXTENSIONS = BINARIES | ARCHIVES

# All extensions by category for easy access
FILE_CATEGORIES = {
    "programming": PROGRAMMING_LANGUAGES,
    "scripts": SCRIPTS,
    "configs": CONFIGS,
    "docs": DOCS,
    "web": WEB,
    "text_data": TEXT_DATA,
    "images": IMAGES,
    "binaries": BINARIES,
    "archives": ARCHIVES,
}


def get_file_category(file_path: str) -> str:
    """
    Determine the category of a file based on its extension.

    Args:
        file_path: Path to the file

    Returns:
        Category name or "unknown" if not recognized
    """
    from pathlib import Path

    extension = Path(file_path).suffix.lower()

    for category, extensions in FILE_CATEGORIES.items():
        if extension in extensions:
            return category

    return "unknown"


def is_code_file(file_path: str) -> bool:
    """Check if a file is a code file (programming language)."""
    from pathlib import Path

    return Path(file_path).suffix.lower() in PROGRAMMING_LANGUAGES


def is_text_file(file_path: str) -> bool:
    """Check if a file is a text file."""
    from pathlib import Path

    return Path(file_path).suffix.lower() in TEXT_EXTENSIONS


def is_binary_file(file_path: str) -> bool:
    """Check if a file is a binary file."""
    from pathlib import Path

    return Path(file_path).suffix.lower() in BINARY_EXTENSIONS


# File-type specific token-to-byte ratios for accurate token estimation
# Based on empirical analysis of file compression characteristics and tokenization patterns
TOKEN_ESTIMATION_RATIOS = {
    # Programming languages
    ".py": 3.5,  # Python - moderate verbosity
    ".js": 3.2,  # JavaScript - compact syntax
    ".ts": 3.3,  # TypeScript - type annotations add tokens
    ".jsx": 3.1,  # React JSX - JSX tags are tokenized efficiently
    ".tsx": 3.0,  # React TSX - combination of TypeScript + JSX
    ".java": 3.6,  # Java - verbose syntax, long identifiers
    ".cpp": 3.7,  # C++ - preprocessor directives, templates
    ".c": 3.8,  # C - function definitions, struct declarations
    ".go": 3.9,  # Go - explicit error handling, package names
    ".rs": 3.5,  # Rust - similar to Python in verbosity
    ".php": 3.3,  # PHP - mixed HTML/code, variable prefixes
    ".rb": 3.6,  # Ruby - descriptive method names
    ".swift": 3.4,  # Swift - modern syntax, type inference
    ".kt": 3.5,  # Kotlin - similar to modern languages
    ".scala": 3.2,  # Scala - functional programming, concise
    # Scripts and configuration
    ".sh": 4.1,  # Shell scripts - commands and paths
    ".bat": 4.0,  # Batch files - similar to shell
    ".ps1": 3.8,  # PowerShell - more structured than bash
    ".sql": 3.8,  # SQL - keywords and table/column names
    # Data and configuration formats
    ".json": 2.5,  # JSON - lots of punctuation and quotes
    ".yaml": 3.0,  # YAML - structured but readable
    ".yml": 3.0,  # YAML (alternative extension)
    ".xml": 2.8,  # XML - tags and attributes
    ".toml": 3.2,  # TOML - similar to config files
    # Documentation and text
    ".md": 4.2,  # Markdown - natural language with formatting
    ".txt": 4.0,  # Plain text - mostly natural language
    ".rst": 4.1,  # reStructuredText - documentation format
    # Web technologies
    ".html": 2.9,  # HTML - tags and attributes
    ".css": 3.4,  # CSS - properties and selectors
    # Logs and data
    ".log": 4.5,  # Log files - timestamps, messages, stack traces
    ".csv": 3.1,  # CSV - data with delimiters
    # Infrastructure files
    ".dockerfile": 3.7,  # Dockerfile - commands and paths
    ".tf": 3.5,  # Terraform - infrastructure as code
}


def get_token_estimation_ratio(file_path: str) -> float:
    """
    Get the token estimation ratio for a file based on its extension.

    Args:
        file_path: Path to the file

    Returns:
        Token-to-byte ratio for the file type (default: 3.5 for unknown types)
    """
    from pathlib import Path

    extension = Path(file_path).suffix.lower()
    return TOKEN_ESTIMATION_RATIOS.get(extension, 3.5)  # Conservative default


# MIME type mappings for image files - limited to what AI models actually support
# Based on OpenAI and Gemini supported formats: PNG, JPEG, GIF, WebP
IMAGE_MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".gif": "image/gif",
    ".webp": "image/webp",
}


def get_image_mime_type(extension: str) -> str:
    """
    Get the MIME type for an image file extension.

    Args:
        extension: File extension (with or without leading dot)

    Returns:
        MIME type string (default: image/jpeg for unknown extensions)
    """
    if not extension.startswith("."):
        extension = "." + extension
    extension = extension.lower()
    return IMAGE_MIME_TYPES.get(extension, "image/jpeg")

```
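
A short usage sketch of the helpers above; the return values follow directly from the tables in this module (assuming the package root is importable as `utils`):

```python
from utils.file_types import get_file_category, get_image_mime_type, get_token_estimation_ratio, is_code_file

get_file_category("src/app.py")             # "programming"
is_code_file("README.md")                   # False - markdown is in DOCS, not PROGRAMMING_LANGUAGES
get_token_estimation_ratio("logs/run.log")  # 4.5
get_token_estimation_ratio("data.unknown")  # 3.5 (conservative default)
get_image_mime_type("PNG")                  # "image/png" - dot added, lower-cased
```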

--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------

```python
"""
Pytest configuration for Zen MCP Server tests
"""

import asyncio
import importlib
import os
import sys
from pathlib import Path

import pytest

# Ensure the parent directory is in the Python path for imports
parent_dir = Path(__file__).resolve().parent.parent
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

import utils.env as env_config  # noqa: E402

# Ensure tests operate with runtime environment rather than .env overrides during imports
env_config.reload_env({"ZEN_MCP_FORCE_ENV_OVERRIDE": "false"})

# Set default model to a specific value for tests to avoid auto mode
# This prevents all tests from failing due to missing model parameter
os.environ["DEFAULT_MODEL"] = "gemini-2.5-flash"

# Force reload of config module to pick up the env var
import config  # noqa: E402

importlib.reload(config)

# Note: This creates a test sandbox environment
# Tests create their own temporary directories as needed

# Configure asyncio for Windows compatibility
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Register providers for all tests
from providers.gemini import GeminiModelProvider  # noqa: E402
from providers.openai import OpenAIModelProvider  # noqa: E402
from providers.registry import ModelProviderRegistry  # noqa: E402
from providers.shared import ProviderType  # noqa: E402
from providers.xai import XAIModelProvider  # noqa: E402

# Register providers at test startup
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
ModelProviderRegistry.register_provider(ProviderType.XAI, XAIModelProvider)

# Register CUSTOM provider if CUSTOM_API_URL is available (for integration tests)
# But only if we're actually running integration tests, not unit tests
if os.getenv("CUSTOM_API_URL") and "test_prompt_regression.py" in os.getenv("PYTEST_CURRENT_TEST", ""):
    from providers.custom import CustomProvider  # noqa: E402

    def custom_provider_factory(api_key=None):
        """Factory function that creates CustomProvider with proper parameters."""
        base_url = os.getenv("CUSTOM_API_URL", "")
        return CustomProvider(api_key=api_key or "", base_url=base_url)

    ModelProviderRegistry.register_provider(ProviderType.CUSTOM, custom_provider_factory)


@pytest.fixture
def project_path(tmp_path):
    """
    Provides a temporary directory for tests.
    This ensures all file operations during tests are isolated.
    """
    # Create a subdirectory for this specific test
    test_dir = tmp_path / "test_workspace"
    test_dir.mkdir(parents=True, exist_ok=True)

    return test_dir


def _set_dummy_keys_if_missing():
    """Set dummy API keys only when they are completely absent."""
    for var in ("GEMINI_API_KEY", "OPENAI_API_KEY", "XAI_API_KEY"):
        if not os.environ.get(var):
            os.environ[var] = "dummy-key-for-tests"


# Pytest configuration
def pytest_configure(config):
    """Configure pytest with custom markers"""
    config.addinivalue_line("markers", "asyncio: mark test as async")
    config.addinivalue_line("markers", "no_mock_provider: disable automatic provider mocking")
    # Assume we need dummy keys until we learn otherwise
    config._needs_dummy_keys = True


def pytest_collection_modifyitems(session, config, items):
    """Hook that runs after test collection to check for no_mock_provider markers."""
    # Always set dummy keys if real keys are missing
    # This ensures tests work in CI even with no_mock_provider marker
    _set_dummy_keys_if_missing()


@pytest.fixture(autouse=True)
def mock_provider_availability(request, monkeypatch):
    """
    Automatically mock provider availability for all tests to prevent
    effective auto mode from being triggered when DEFAULT_MODEL is unavailable.

    This fixture ensures that when tests run with dummy API keys,
    the tools don't require model selection unless explicitly testing auto mode.
    """
    # Skip this fixture for tests that need real providers
    if hasattr(request, "node"):
        marker = request.node.get_closest_marker("no_mock_provider")
        if marker:
            return

    # Ensure providers are registered (in case other tests cleared the registry)
    from providers.shared import ProviderType

    registry = ModelProviderRegistry()

    if ProviderType.GOOGLE not in registry._providers:
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
    if ProviderType.OPENAI not in registry._providers:
        ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
    if ProviderType.XAI not in registry._providers:
        ModelProviderRegistry.register_provider(ProviderType.XAI, XAIModelProvider)

    # Ensure CUSTOM provider is registered if needed for integration tests
    if (
        os.getenv("CUSTOM_API_URL")
        and "test_prompt_regression.py" in os.getenv("PYTEST_CURRENT_TEST", "")
        and ProviderType.CUSTOM not in registry._providers
    ):
        from providers.custom import CustomProvider

        def custom_provider_factory(api_key=None):
            base_url = os.getenv("CUSTOM_API_URL", "")
            return CustomProvider(api_key=api_key or "", base_url=base_url)

        ModelProviderRegistry.register_provider(ProviderType.CUSTOM, custom_provider_factory)

    # Also mock is_effective_auto_mode for all BaseTool instances to return False
    # unless we're specifically testing auto mode behavior
    from tools.shared.base_tool import BaseTool

    def mock_is_effective_auto_mode(self):
        # If this is an auto mode test file or specific auto mode test, use the real logic
        test_file = request.node.fspath.basename if hasattr(request, "node") and hasattr(request.node, "fspath") else ""
        test_name = request.node.name if hasattr(request, "node") else ""

        # Allow auto mode for tests in auto mode files or with auto in the name
        if (
            "auto_mode" in test_file.lower()
            or "auto" in test_name.lower()
            or "intelligent_fallback" in test_file.lower()
            or "per_tool_model_defaults" in test_file.lower()
        ):
            # Call original method logic
            from config import DEFAULT_MODEL

            if DEFAULT_MODEL.lower() == "auto":
                return True
            provider = ModelProviderRegistry.get_provider_for_model(DEFAULT_MODEL)
            return provider is None
        # For all other tests, return False to disable auto mode
        return False

    monkeypatch.setattr(BaseTool, "is_effective_auto_mode", mock_is_effective_auto_mode)


@pytest.fixture(autouse=True)
def clear_model_restriction_env(monkeypatch):
    """Ensure per-test isolation from user-defined model restriction env vars."""

    restriction_vars = [
        "OPENAI_ALLOWED_MODELS",
        "GOOGLE_ALLOWED_MODELS",
        "XAI_ALLOWED_MODELS",
        "OPENROUTER_ALLOWED_MODELS",
        "DIAL_ALLOWED_MODELS",
    ]

    for var in restriction_vars:
        monkeypatch.delenv(var, raising=False)


@pytest.fixture(autouse=True)
def disable_force_env_override(monkeypatch):
    """Default tests to runtime environment visibility unless they explicitly opt in."""

    monkeypatch.setenv("ZEN_MCP_FORCE_ENV_OVERRIDE", "false")
    env_config.reload_env({"ZEN_MCP_FORCE_ENV_OVERRIDE": "false"})
    monkeypatch.setenv("DEFAULT_MODEL", "gemini-2.5-flash")
    monkeypatch.setenv("MAX_CONVERSATION_TURNS", "50")

    import importlib

    import config
    import utils.conversation_memory as conversation_memory

    importlib.reload(config)
    importlib.reload(conversation_memory)

    try:
        yield
    finally:
        env_config.reload_env()

```
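
A hypothetical example of how a test opts out of the automatic mocking configured above; only the marker name and fixture behaviour come from this conftest, the test body is illustrative:

```python
import pytest


@pytest.mark.no_mock_provider  # mock_provider_availability returns early for this test
def test_against_real_provider_registry():
    # With the marker set, BaseTool.is_effective_auto_mode keeps its real implementation,
    # and dummy API keys are still injected at collection time if none are configured.
    ...
```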

--------------------------------------------------------------------------------
/systemprompts/generate_code_prompt.py:
--------------------------------------------------------------------------------

```python
"""System prompt fragment enabling structured code generation exports.

This prompt is injected into the system prompt for models that have the
'allow_code_generation' capability enabled. It instructs the model to output
complete, working code in a structured format that coding agents can parse
and apply automatically.

The structured format uses XML-like tags to clearly delineate:
- New files to create (<NEWFILE>)
- Existing files to update (<UPDATED_EXISTING_FILE>)
- Step-by-step instructions for the coding agent

This enables:
1. Automated code extraction and application
2. Clear separation between instructions and implementation
3. Complete, runnable code without manual edits
4. Precise change tracking across multiple files
"""

GENERATE_CODE_PROMPT = """
# Structured Code Generation Protocol

**WHEN TO USE THIS PROTOCOL:**

Use this structured format ONLY when you are explicitly tasked with substantial code generation, such as:
- Creating new features from scratch with multiple files or significant code and you have been asked to help implement this
- Major refactoring across multiple files or large sections of code and you have been tasked to help do this
- Implementing new modules, components, or subsystems and you have been tasked to help with the implementation
- Large-scale updates affecting substantial portions of the codebase that you have been asked to help implement

**WHEN NOT TO USE THIS PROTOCOL:**

Do NOT use this format for minor changes:
- Small tweaks to existing functions or methods (1-20 lines)
- Bug fixes in isolated sections
- Simple algorithm improvements
- Minor refactoring of a single function
- Adding/removing a few lines of code
- Quick parameter adjustments or config changes

For minor changes:
- Follow the existing instructions provided earlier in your system prompt, such as the CRITICAL LINE NUMBER INSTRUCTIONS.
- Use inline code blocks with proper line number references and direct explanations instead of this structured format.

**IMPORTANT:** This protocol is for SUBSTANTIAL implementation work when explicitly requested, such as:
- "implement feature X"
- "create module Y"
- "refactor system Z"
- "rewrite the authentication logic"
- "redesign the data processing pipeline"
- "rebuild the algorithm from scratch"
- "convert this approach to use a different pattern"
- "create a complete implementation of..."
- "build out the entire workflow for..."

If the request is for explanation, analysis, debugging, planning, or discussion WITHOUT substantial code generation, respond normally without this structured format.

## Core Requirements (for substantial code generation tasks)

1. **Complete, Working Code**: Every code block must be fully functional without requiring additional edits. Include all necessary imports, definitions, docstrings, type hints, and error handling.

2. **Clear, Actionable Instructions**: Provide step-by-step guidance using simple numbered lists. Each instruction should map directly to file blocks that follow.

3. **Structured Output Format**: All generated code MUST be contained within a single `<GENERATED-CODE>` block using the exact structure defined below.

4. **Minimal External Commentary**: Keep any text outside the `<GENERATED-CODE>` block brief. Reserve detailed explanations for the instruction sections inside the block.

## Required Structure

Use this exact format (do not improvise tag names or reorder components):

```
<GENERATED-CODE>
[Step-by-step instructions for the coding agent]
1. Create new file [filename] with [description]
2. Update existing file [filename] by [description]
3. [Additional steps as needed]

<NEWFILE: path/to/new_file.py>
[Complete file contents with all necessary components:
- File-level docstring
- All imports (standard library, third-party, local)
- All class/function definitions with complete implementations
- All necessary helper functions
- Inline comments for complex logic
- Type hints where applicable]
</NEWFILE>

[Additional instructions for the next file, if needed]

<NEWFILE: path/to/another_file.py>
[Complete, working code for this file - no partial implementations or placeholders]
</NEWFILE>

[Instructions for updating existing files]

<UPDATED_EXISTING_FILE: existing/path.py>
[Complete replacement code for the modified sections, routines, or lines that need updating:
- Full function/method bodies (not just the changed lines)
- Complete class definitions if modifying class methods
- All necessary imports if adding new dependencies
- Preserve existing code structure and style]
</UPDATED_EXISTING_FILE>

[If additional files need updates (based on existing code that was shared with you earlier), repeat the UPDATED_EXISTING_FILE block]

<UPDATED_EXISTING_FILE: another/existing/file.py>
[Complete code for this file's modifications]
</UPDATED_EXISTING_FILE>

[For file deletions, explicitly state in instructions with justification:
"Delete file path/to/obsolete.py - no longer needed because [reason]"]
</GENERATED-CODE>
```

## Critical Rules

**Completeness:**
- Never output partial code snippets or placeholder comments like "# rest of code here"
- Include complete function/class implementations from start to finish
- Add all required imports at the file level
- Include proper error handling and edge case logic

**Accuracy:**
- Match the existing codebase indentation style (tabs vs spaces)
- Preserve language-specific formatting conventions
- Include trailing newlines where required by language tooling
- Use correct file paths relative to project root

**Clarity:**
- Number instructions sequentially (1, 2, 3...)
- Map each instruction to specific file blocks below it
- Explain *why* changes are needed, not just *what* is changing
- Highlight any breaking changes or migration steps required

**Structure:**
- Use `<NEWFILE: ...>` for files that don't exist yet
- Use `<UPDATED_EXISTING_FILE: ...>` for modifying existing files
- Place instructions between file blocks to provide context
- Keep the single `<GENERATED-CODE>` wrapper around everything

## Special Cases

**No Changes Needed:**
If the task doesn't require file creation or modification, explicitly state:
"No file changes required. The existing implementation already handles [requirement]."
Do not emit an empty `<GENERATED-CODE>` block.

**Configuration Changes:**
If modifying configuration files (JSON, YAML, TOML), include complete file contents with the changes applied, not just the changed lines.

**Test Files:**
When generating tests, include complete test suites with:
- All necessary test fixtures and setup
- Multiple test cases covering happy path and edge cases
- Proper teardown and cleanup
- Clear test descriptions and assertions

**Documentation:**
Include docstrings for all public functions, classes, and modules using the project's documentation style (Google, NumPy, Sphinx, etc.).

## Context Awareness

**CRITICAL:** Your implementation builds upon the ongoing conversation context:
- All previously shared files, requirements, and constraints remain relevant
- If updating existing code discussed earlier, reference it and preserve unmodified sections
- If the user shared code for improvement, your generated code should build upon it, not replace everything
- The coding agent has full conversation history—your instructions should reference prior discussion as needed

Your generated code is NOT standalone—it's a continuation of the collaborative session with full context awareness.

## Remember

The coding agent depends on this structured format to:
- Parse and extract code automatically
- Apply changes to the correct files within the conversation context
- Validate completeness before execution
- Track modifications across the codebase

Always prioritize clarity, completeness, correctness, and context awareness over brevity.
"""

```
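
To make the protocol concrete, here is a minimal, hypothetical example of a response that follows it; the file path and helper function are illustrative only.

```
<GENERATED-CODE>
1. Create new file utils/greeting.py containing a single helper function.

<NEWFILE: utils/greeting.py>
"""Small helper module used only to illustrate the protocol."""


def greet(name: str) -> str:
    """Return a friendly greeting."""
    return f"Hello, {name}!"
</NEWFILE>
</GENERATED-CODE>
```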

--------------------------------------------------------------------------------
/tools/shared/base_models.py:
--------------------------------------------------------------------------------

```python
"""
Base models for Zen MCP tools.

This module contains the shared Pydantic models used across all tools,
extracted to avoid circular imports and promote code reuse.

Key Models:
- ToolRequest: Base request model for all tools
- WorkflowRequest: Extended request model for workflow-based tools
- ConsolidatedFindings: Model for tracking workflow progress
"""

import logging
from typing import Optional

from pydantic import BaseModel, Field, field_validator

logger = logging.getLogger(__name__)


# Shared field descriptions to avoid duplication
COMMON_FIELD_DESCRIPTIONS = {
    "model": "Model to run. Supply a name if requested by the user or stay in auto mode. When in auto mode, use `listmodels` tool for model discovery.",
    "temperature": "0 = deterministic · 1 = creative.",
    "thinking_mode": "Reasoning depth: minimal, low, medium, high, or max.",
    "continuation_id": (
        "Unique thread continuation ID for multi-turn conversations. Works across different tools. "
        "ALWAYS reuse the last continuation_id you were given—this preserves full conversation context, "
        "files, and findings so the agent can resume seamlessly."
    ),
    "images": "Optional absolute image paths or base64 blobs for visual context.",
    "absolute_file_paths": "Full paths to relevant code",
}

# Workflow-specific field descriptions
WORKFLOW_FIELD_DESCRIPTIONS = {
    "step": "Current work step content and findings from your overall work",
    "step_number": "Current step number in work sequence (starts at 1)",
    "total_steps": "Estimated total steps needed to complete work",
    "next_step_required": "Whether another work step is needed. When false, aim to reduce total_steps to match step_number to avoid mismatch.",
    "findings": "Important findings, evidence and insights discovered in this step",
    "files_checked": "List of files examined during this work step",
    "relevant_files": "Files identified as relevant to issue/goal (FULL absolute paths to real files/folders - DO NOT SHORTEN)",
    "relevant_context": "Methods/functions identified as involved in the issue",
    "issues_found": "Issues identified with severity levels during work",
    "confidence": (
        "Confidence level: exploring (just starting), low (early investigation), "
        "medium (some evidence), high (strong evidence), very_high (comprehensive understanding), "
        "almost_certain (near complete confidence), certain (100% confidence locally - no external validation needed)"
    ),
    "hypothesis": "Current theory about issue/goal based on work",
    "use_assistant_model": (
        "Use assistant model for expert analysis after workflow steps. "
        "False skips expert analysis, relies solely on your personal investigation. "
        "Defaults to True for comprehensive validation."
    ),
}


class ToolRequest(BaseModel):
    """
    Base request model for all Zen MCP tools.

    This model defines common fields that all tools accept, including
    model selection, temperature control, and conversation threading.
    Tool-specific request models should inherit from this class.
    """

    # Model configuration
    model: Optional[str] = Field(None, description=COMMON_FIELD_DESCRIPTIONS["model"])
    temperature: Optional[float] = Field(None, ge=0.0, le=1.0, description=COMMON_FIELD_DESCRIPTIONS["temperature"])
    thinking_mode: Optional[str] = Field(None, description=COMMON_FIELD_DESCRIPTIONS["thinking_mode"])

    # Conversation support
    continuation_id: Optional[str] = Field(None, description=COMMON_FIELD_DESCRIPTIONS["continuation_id"])

    # Visual context
    images: Optional[list[str]] = Field(None, description=COMMON_FIELD_DESCRIPTIONS["images"])


class BaseWorkflowRequest(ToolRequest):
    """
    Minimal base request model for workflow tools.

    This provides only the essential fields that ALL workflow tools need,
    allowing for maximum flexibility in tool-specific implementations.
    """

    # Core workflow fields that ALL workflow tools need
    step: str = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["step"])
    step_number: int = Field(..., ge=1, description=WORKFLOW_FIELD_DESCRIPTIONS["step_number"])
    total_steps: int = Field(..., ge=1, description=WORKFLOW_FIELD_DESCRIPTIONS["total_steps"])
    next_step_required: bool = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"])


class WorkflowRequest(BaseWorkflowRequest):
    """
    Extended request model for workflow-based tools.

    This model extends BaseWorkflowRequest with fields specific to the workflow
    pattern, where tools perform multi-step work with forced pauses between steps.

    Used by: debug, precommit, codereview, refactor, thinkdeep, analyze
    """

    # Required workflow fields
    step: str = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["step"])
    step_number: int = Field(..., ge=1, description=WORKFLOW_FIELD_DESCRIPTIONS["step_number"])
    total_steps: int = Field(..., ge=1, description=WORKFLOW_FIELD_DESCRIPTIONS["total_steps"])
    next_step_required: bool = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"])

    # Work tracking fields
    findings: str = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["findings"])
    files_checked: list[str] = Field(default_factory=list, description=WORKFLOW_FIELD_DESCRIPTIONS["files_checked"])
    relevant_files: list[str] = Field(default_factory=list, description=WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"])
    relevant_context: list[str] = Field(
        default_factory=list, description=WORKFLOW_FIELD_DESCRIPTIONS["relevant_context"]
    )
    issues_found: list[dict] = Field(default_factory=list, description=WORKFLOW_FIELD_DESCRIPTIONS["issues_found"])
    confidence: str = Field("low", description=WORKFLOW_FIELD_DESCRIPTIONS["confidence"])

    # Optional workflow fields
    hypothesis: Optional[str] = Field(None, description=WORKFLOW_FIELD_DESCRIPTIONS["hypothesis"])
    use_assistant_model: Optional[bool] = Field(True, description=WORKFLOW_FIELD_DESCRIPTIONS["use_assistant_model"])

    @field_validator("files_checked", "relevant_files", "relevant_context", mode="before")
    @classmethod
    def convert_string_to_list(cls, v):
        """Convert string inputs to empty lists to handle malformed inputs gracefully."""
        if isinstance(v, str):
            logger.warning(f"Field received string '{v}' instead of list, converting to empty list")
            return []
        return v


class ConsolidatedFindings(BaseModel):
    """
    Model for tracking consolidated findings across workflow steps.

    This model accumulates findings, files, methods, and issues
    discovered during multi-step work. It's used by
    BaseWorkflowMixin to track progress across workflow steps.
    """

    files_checked: set[str] = Field(default_factory=set, description="All files examined across all steps")
    relevant_files: set[str] = Field(
        default_factory=set,
        description="Subset of files_checked identified as relevant for work at hand",
    )
    relevant_context: set[str] = Field(
        default_factory=set, description="All methods/functions identified during overall work"
    )
    findings: list[str] = Field(default_factory=list, description="Chronological findings from each work step")
    hypotheses: list[dict] = Field(default_factory=list, description="Evolution of hypotheses across steps")
    issues_found: list[dict] = Field(default_factory=list, description="All issues with severity levels")
    images: list[str] = Field(default_factory=list, description="Images collected during work")
    confidence: str = Field("low", description="Latest confidence level from steps")


# Tool-specific field descriptions are now declared in each tool file
# This keeps concerns separated and makes each tool self-contained

```
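
A quick usage sketch (not part of the module) showing how these models validate input; the field values below are illustrative.

```python
# Minimal sketch: WorkflowRequest enforces the required step fields, and the
# "before" validator coerces malformed string inputs into empty lists.
from tools.shared.base_models import ToolRequest, WorkflowRequest

request = WorkflowRequest(
    step="Investigate the failing import",
    step_number=1,
    total_steps=2,
    next_step_required=True,
    findings="Initial reconnaissance only",
    files_checked="not-a-list",  # logged and converted to [] by convert_string_to_list
)
assert request.files_checked == []

# ToolRequest alone covers the common fields shared by every tool.
chat_like = ToolRequest(model="gemini-2.5-flash", temperature=0.2)
```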

--------------------------------------------------------------------------------
/tests/test_auto_model_planner_fix.py:
--------------------------------------------------------------------------------

```python
"""
Unit tests for the auto model planner fix.

This test confirms that the planner tool no longer fails when DEFAULT_MODEL is "auto"
and only basic providers (Google/OpenAI) are configured, while ensuring other tools
still properly require model resolution.
"""

from unittest.mock import patch

from mcp.types import TextContent

from tools.chat import ChatTool
from tools.planner import PlannerTool
from tools.shared.base_tool import BaseTool


class TestAutoModelPlannerFix:
    """Test the fix for auto model resolution with planner tool."""

    def test_planner_requires_model_false(self):
        """Test that planner tool returns False for requires_model."""
        planner = PlannerTool()
        assert planner.requires_model() is False

    def test_chat_requires_model_true(self):
        """Test that chat tool returns True for requires_model (default behavior)."""
        chat = ChatTool()
        assert chat.requires_model() is True

    def test_base_tool_requires_model_default(self):
        """Test that BaseTool default implementation returns True."""

        # Create a mock tool that doesn't override requires_model
        class MockTool(BaseTool):
            def get_name(self):
                return "mock"

            def get_description(self):
                return "Mock tool"

            def get_input_schema(self):
                return {}

            def get_system_prompt(self):
                return "Mock prompt"

            def get_request_model(self):
                from tools.shared.base_models import ToolRequest

                return ToolRequest

            async def prepare_prompt(self, request):
                return "Mock prompt"

        mock_tool = MockTool()
        assert mock_tool.requires_model() is True

    @patch("config.DEFAULT_MODEL", "auto")
    @patch("providers.registry.ModelProviderRegistry.get_provider_for_model")
    def test_auto_model_error_before_fix_simulation(self, mock_get_provider):
        """
        Simulate the error that would occur before the fix.

        This test simulates what would happen if server.py didn't check requires_model()
        and tried to resolve "auto" as a literal model name.
        """
        # Mock the scenario where no provider is found for "auto"
        mock_get_provider.return_value = None

        # This should return None, simulating the "No provider found for model auto" error
        result = mock_get_provider("auto")
        assert result is None

        # Verify that the mock was called with "auto"
        mock_get_provider.assert_called_with("auto")

    @patch("server.DEFAULT_MODEL", "auto")
    async def test_planner_execution_bypasses_model_resolution(self):
        """
        Test that planner tool execution works even when DEFAULT_MODEL is "auto".

        This test confirms that the fix allows planner to work regardless of
        model configuration since it doesn't need model resolution.
        """
        planner = PlannerTool()

        # Test with minimal planner arguments
        arguments = {"step": "Test planning step", "step_number": 1, "total_steps": 1, "next_step_required": False}

        # This should work without any model resolution
        result = await planner.execute(arguments)

        # Verify we got a result
        assert isinstance(result, list)
        assert len(result) > 0
        assert isinstance(result[0], TextContent)

        # Parse the JSON response to verify it's valid
        import json

        response_data = json.loads(result[0].text)
        assert response_data["status"] == "planning_complete"
        assert response_data["step_number"] == 1

    @patch("config.DEFAULT_MODEL", "auto")
    def test_server_model_resolution_logic(self):
        """
        Test the server-side logic that checks requires_model() before model resolution.

        This simulates the key fix in server.py where we check tool.requires_model()
        before attempting model resolution.
        """
        planner = PlannerTool()
        chat = ChatTool()

        # Simulate the server logic
        def simulate_server_model_resolution(tool, model_name):
            """Simulate the fixed server logic from server.py"""
            if not tool.requires_model():
                # Skip model resolution for tools that don't require models
                return "SKIP_MODEL_RESOLUTION"
            else:
                # Would normally do model resolution here
                return f"RESOLVE_MODEL_{model_name}"

        # Test planner (should skip model resolution)
        result = simulate_server_model_resolution(planner, "auto")
        assert result == "SKIP_MODEL_RESOLUTION"

        # Test chat (should attempt model resolution)
        result = simulate_server_model_resolution(chat, "auto")
        assert result == "RESOLVE_MODEL_auto"

    def test_provider_registry_auto_handling(self):
        """
        Test that the provider registry correctly handles model resolution.

        This tests the scenario where providers don't recognize "auto" as a model.
        """
        from providers.registry import ModelProviderRegistry

        # This should return None since "auto" is not a real model name
        provider = ModelProviderRegistry.get_provider_for_model("auto")
        assert provider is None, "Provider registry should not find a provider for literal 'auto'"

    @patch("config.DEFAULT_MODEL", "auto")
    async def test_end_to_end_planner_with_auto_mode(self):
        """
        End-to-end test of planner tool execution in auto mode.

        This test verifies that the complete flow works when DEFAULT_MODEL is "auto"
        and the planner tool is used.
        """
        planner = PlannerTool()

        # Verify the tool doesn't require model resolution
        assert not planner.requires_model()

        # Test a multi-step planning scenario
        step1_args = {
            "step": "Analyze the current system architecture",
            "step_number": 1,
            "total_steps": 3,
            "next_step_required": True,
        }

        result1 = await planner.execute(step1_args)
        assert len(result1) > 0

        # Parse and verify the response
        import json

        response1 = json.loads(result1[0].text)
        assert response1["status"] == "pause_for_planning"
        assert response1["next_step_required"] is True
        assert "continuation_id" in response1

        # Test step 2 with continuation
        continuation_id = response1["continuation_id"]
        step2_args = {
            "step": "Design the microservices architecture",
            "step_number": 2,
            "total_steps": 3,
            "next_step_required": True,
            "continuation_id": continuation_id,
        }

        result2 = await planner.execute(step2_args)
        assert len(result2) > 0

        response2 = json.loads(result2[0].text)
        assert response2["status"] == "pause_for_planning"
        assert response2["step_number"] == 2

    def test_other_tools_still_require_models(self):
        """
        Verify that other tools still properly require model resolution.

        This ensures our fix doesn't break existing functionality.
        Note: Debug tool requires model resolution for expert analysis phase.
        """
        from tools.analyze import AnalyzeTool
        from tools.chat import ChatTool
        from tools.debug import DebugIssueTool

        # Test various tools still require models
        tools_requiring_models = [ChatTool(), AnalyzeTool(), DebugIssueTool()]

        for tool in tools_requiring_models:
            assert tool.requires_model() is True, f"{tool.get_name()} should require model resolution"

        # Note: Debug tool requires model resolution for expert analysis phase
        # Only planner truly manages its own model calls and doesn't need resolution

```

--------------------------------------------------------------------------------
/tests/test_challenge.py:
--------------------------------------------------------------------------------

```python
"""
Tests for Challenge tool - validating critical challenge prompt wrapper

This module contains unit tests to ensure that the Challenge tool
properly wraps statements to encourage critical thinking and avoid
automatic agreement patterns.
"""

import json
from unittest.mock import patch

import pytest

from tools.challenge import ChallengeRequest, ChallengeTool
from tools.shared.exceptions import ToolExecutionError


class TestChallengeTool:
    """Test suite for Challenge tool"""

    def setup_method(self):
        """Set up test fixtures"""
        self.tool = ChallengeTool()

    def test_tool_metadata(self):
        """Test that tool metadata matches requirements"""
        assert self.tool.get_name() == "challenge"
        assert "reflexive agreement" in self.tool.get_description()
        assert "critical thinking" in self.tool.get_description()
        assert "reasoned analysis" in self.tool.get_description()
        assert self.tool.get_default_temperature() == 0.2  # TEMPERATURE_ANALYTICAL

    def test_requires_model(self):
        """Test that challenge tool doesn't require a model"""
        assert self.tool.requires_model() is False

    def test_schema_structure(self):
        """Test that schema has correct structure and excludes model fields"""
        schema = self.tool.get_input_schema()

        # Basic schema structure
        assert schema["type"] == "object"
        assert "properties" in schema
        assert "required" in schema

        # Required fields
        assert "prompt" in schema["required"]
        assert len(schema["required"]) == 1  # Only prompt is required

        # Properties
        properties = schema["properties"]
        assert "prompt" in properties

        # Should NOT have model-related fields since it doesn't require a model
        assert "model" not in properties
        assert "temperature" not in properties
        assert "thinking_mode" not in properties
        assert "continuation_id" not in properties

    def test_request_model_validation(self):
        """Test that the request model validates correctly"""
        # Test valid request
        request = ChallengeRequest(prompt="The sky is green")
        assert request.prompt == "The sky is green"

        # Test with longer prompt
        long_prompt = (
            "Machine learning models always produce accurate results and should be trusted without verification"
        )
        request = ChallengeRequest(prompt=long_prompt)
        assert request.prompt == long_prompt

    def test_required_fields(self):
        """Test that required fields are enforced"""
        from pydantic import ValidationError

        # Missing prompt should raise validation error
        with pytest.raises(ValidationError):
            ChallengeRequest()

    @pytest.mark.asyncio
    async def test_execute_success(self):
        """Test successful execution of challenge tool"""
        arguments = {"prompt": "All software bugs are caused by syntax errors"}

        result = await self.tool.execute(arguments)

        # Should return a list with TextContent
        assert len(result) == 1
        assert result[0].type == "text"

        # Parse the JSON response
        response_data = json.loads(result[0].text)

        # Check response structure
        assert response_data["status"] == "challenge_accepted"
        assert response_data["original_statement"] == "All software bugs are caused by syntax errors"
        assert "challenge_prompt" in response_data
        assert "instructions" in response_data

        # Check that the challenge prompt contains critical thinking instructions
        challenge_prompt = response_data["challenge_prompt"]
        assert "CRITICAL REASSESSMENT – Do not automatically agree" in challenge_prompt
        assert "Carefully evaluate the statement above" in challenge_prompt
        assert response_data["original_statement"] in challenge_prompt
        assert "flaws, gaps, or misleading points" in challenge_prompt
        assert "thoughtful analysis" in challenge_prompt

    @pytest.mark.asyncio
    async def test_execute_error_handling(self):
        """Test error handling in execute method"""
        # Test with invalid arguments (non-dict)
        with patch.object(self.tool, "get_request_model", side_effect=Exception("Test error")):
            with pytest.raises(ToolExecutionError) as exc_info:
                await self.tool.execute({"prompt": "test"})

        response_data = json.loads(exc_info.value.payload)
        assert response_data["status"] == "error"
        assert "Test error" in response_data["error"]

    def test_wrap_prompt_for_challenge(self):
        """Test the prompt wrapping functionality"""
        original_prompt = "Python is the best programming language"
        wrapped = self.tool._wrap_prompt_for_challenge(original_prompt)

        # Check structure
        assert "CRITICAL REASSESSMENT – Do not automatically agree" in wrapped
        assert "Carefully evaluate the statement above" in wrapped
        assert f'"{original_prompt}"' in wrapped
        assert "flaws, gaps, or misleading points" in wrapped
        assert "thoughtful analysis" in wrapped

    def test_multiple_prompts(self):
        """Test that tool handles various types of prompts correctly"""
        test_prompts = [
            "All code should be written in assembly for maximum performance",
            "Comments are unnecessary if code is self-documenting",
            "Testing is a waste of time for experienced developers",
            "Global variables make code easier to understand",
            "The more design patterns used, the better the code",
        ]

        for prompt in test_prompts:
            request = ChallengeRequest(prompt=prompt)
            wrapped = self.tool._wrap_prompt_for_challenge(request.prompt)

            # Each wrapped prompt should contain the original
            assert prompt in wrapped
            assert "CRITICAL REASSESSMENT" in wrapped

    def test_tool_fields(self):
        """Test tool-specific field definitions"""
        fields = self.tool.get_tool_fields()

        assert "prompt" in fields
        assert fields["prompt"]["type"] == "string"
        assert "Statement to scrutinize" in fields["prompt"]["description"]
        assert "strip the word 'challenge'" in fields["prompt"]["description"]

    def test_required_fields_list(self):
        """Test required fields list"""
        required = self.tool.get_required_fields()
        assert required == ["prompt"]

    @pytest.mark.asyncio
    async def test_not_used_methods(self):
        """Test that methods not used by challenge tool work correctly"""
        request = ChallengeRequest(prompt="test")

        # These methods aren't used since challenge doesn't call AI
        prompt = await self.tool.prepare_prompt(request)
        assert prompt == ""

        response = self.tool.format_response("test response", request)
        assert response == "test response"

    def test_special_characters_in_prompt(self):
        """Test handling of special characters in prompts"""
        special_prompt = 'The "best" way to handle errors is to use try/except: pass'
        request = ChallengeRequest(prompt=special_prompt)
        wrapped = self.tool._wrap_prompt_for_challenge(request.prompt)

        # Should handle quotes properly
        assert special_prompt in wrapped

    @pytest.mark.asyncio
    async def test_unicode_support(self):
        """Test that tool handles unicode characters correctly"""
        unicode_prompt = "软件开发中最重要的是写代码,测试不重要 🚀"
        arguments = {"prompt": unicode_prompt}

        result = await self.tool.execute(arguments)
        response_data = json.loads(result[0].text)

        assert response_data["original_statement"] == unicode_prompt
        assert unicode_prompt in response_data["challenge_prompt"]


if __name__ == "__main__":
    pytest.main([__file__])

```

--------------------------------------------------------------------------------
/clink/agents/base.py:
--------------------------------------------------------------------------------

```python
"""Execute configured CLI agents for the clink tool and parse output."""

from __future__ import annotations

import asyncio
import logging
import os
import shlex
import shutil
import tempfile
import time
from collections.abc import Sequence
from dataclasses import dataclass
from pathlib import Path

from clink.constants import DEFAULT_STREAM_LIMIT
from clink.models import ResolvedCLIClient, ResolvedCLIRole
from clink.parsers import BaseParser, ParsedCLIResponse, ParserError, get_parser

logger = logging.getLogger("clink.agent")


@dataclass
class AgentOutput:
    """Container returned by CLI agents after successful execution."""

    parsed: ParsedCLIResponse
    sanitized_command: list[str]
    returncode: int
    stdout: str
    stderr: str
    duration_seconds: float
    parser_name: str
    output_file_content: str | None = None


class CLIAgentError(RuntimeError):
    """Raised when a CLI agent fails (non-zero exit, timeout, parse errors)."""

    def __init__(self, message: str, *, returncode: int | None = None, stdout: str = "", stderr: str = "") -> None:
        super().__init__(message)
        self.returncode = returncode
        self.stdout = stdout
        self.stderr = stderr


class BaseCLIAgent:
    """Execute a configured CLI command and parse its output."""

    def __init__(self, client: ResolvedCLIClient):
        self.client = client
        self._parser: BaseParser = get_parser(client.parser)
        self._logger = logging.getLogger(f"clink.runner.{client.name}")

    async def run(
        self,
        *,
        role: ResolvedCLIRole,
        prompt: str,
        system_prompt: str | None = None,
        files: Sequence[str],
        images: Sequence[str],
    ) -> AgentOutput:
        # Files and images are already embedded into the prompt by the tool; they are
        # accepted here only to keep parity with SimpleTool callers.
        _ = (files, images)
        # The runner simply executes the configured CLI command for the selected role.
        command = self._build_command(role=role, system_prompt=system_prompt)
        env = self._build_environment()

        # Resolve executable path for cross-platform compatibility (especially Windows)
        executable_name = command[0]
        resolved_executable = shutil.which(executable_name)
        if resolved_executable is None:
            raise CLIAgentError(
                f"Executable '{executable_name}' not found in PATH for CLI '{self.client.name}'. "
                f"Ensure the command is installed and accessible."
            )
        command[0] = resolved_executable

        sanitized_command = list(command)

        cwd = str(self.client.working_dir) if self.client.working_dir else None
        limit = DEFAULT_STREAM_LIMIT

        stdout_text = ""
        stderr_text = ""
        output_file_content: str | None = None
        start_time = time.monotonic()

        output_file_path: Path | None = None
        command_with_output_flag = list(command)

        if self.client.output_to_file:
            fd, tmp_path = tempfile.mkstemp(prefix="clink-", suffix=".json")
            os.close(fd)
            output_file_path = Path(tmp_path)
            flag_template = self.client.output_to_file.flag_template
            try:
                rendered_flag = flag_template.format(path=str(output_file_path))
            except KeyError as exc:  # pragma: no cover - defensive
                raise CLIAgentError(f"Invalid output flag template '{flag_template}': missing placeholder {exc}")
            command_with_output_flag.extend(shlex.split(rendered_flag))
            sanitized_command = list(command_with_output_flag)

        self._logger.debug("Executing CLI command: %s", " ".join(sanitized_command))
        if cwd:
            self._logger.debug("Working directory: %s", cwd)

        try:
            process = await asyncio.create_subprocess_exec(
                *command_with_output_flag,
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                cwd=cwd,
                limit=limit,
                env=env,
            )
        except FileNotFoundError as exc:
            raise CLIAgentError(f"Executable not found for CLI '{self.client.name}': {exc}") from exc

        try:
            stdout_bytes, stderr_bytes = await asyncio.wait_for(
                process.communicate(prompt.encode("utf-8")),
                timeout=self.client.timeout_seconds,
            )
        except asyncio.TimeoutError as exc:
            process.kill()
            await process.communicate()
            raise CLIAgentError(
                f"CLI '{self.client.name}' timed out after {self.client.timeout_seconds} seconds",
                returncode=None,
            ) from exc

        duration = time.monotonic() - start_time
        return_code = process.returncode
        stdout_text = stdout_bytes.decode("utf-8", errors="replace")
        stderr_text = stderr_bytes.decode("utf-8", errors="replace")

        if output_file_path and output_file_path.exists():
            output_file_content = output_file_path.read_text(encoding="utf-8", errors="replace")
            if self.client.output_to_file and self.client.output_to_file.cleanup:
                try:
                    output_file_path.unlink()
                except OSError:  # pragma: no cover - best effort cleanup
                    pass

            if output_file_content and not stdout_text.strip():
                stdout_text = output_file_content

        if return_code != 0:
            recovered = self._recover_from_error(
                returncode=return_code,
                stdout=stdout_text,
                stderr=stderr_text,
                sanitized_command=sanitized_command,
                duration_seconds=duration,
                output_file_content=output_file_content,
            )
            if recovered is not None:
                return recovered

        if return_code != 0:
            raise CLIAgentError(
                f"CLI '{self.client.name}' exited with status {return_code}",
                returncode=return_code,
                stdout=stdout_text,
                stderr=stderr_text,
            )

        try:
            parsed = self._parser.parse(stdout_text, stderr_text)
        except ParserError as exc:
            raise CLIAgentError(
                f"Failed to parse output from CLI '{self.client.name}': {exc}",
                returncode=return_code,
                stdout=stdout_text,
                stderr=stderr_text,
            ) from exc

        return AgentOutput(
            parsed=parsed,
            sanitized_command=sanitized_command,
            returncode=return_code,
            stdout=stdout_text,
            stderr=stderr_text,
            duration_seconds=duration,
            parser_name=self._parser.name,
            output_file_content=output_file_content,
        )

    def _build_command(self, *, role: ResolvedCLIRole, system_prompt: str | None) -> list[str]:
        base = list(self.client.executable)
        base.extend(self.client.internal_args)
        base.extend(self.client.config_args)
        base.extend(role.role_args)

        return base

    def _build_environment(self) -> dict[str, str]:
        env = os.environ.copy()
        env.update(self.client.env)
        return env

    # ------------------------------------------------------------------
    # Error recovery hooks
    # ------------------------------------------------------------------

    def _recover_from_error(
        self,
        *,
        returncode: int,
        stdout: str,
        stderr: str,
        sanitized_command: list[str],
        duration_seconds: float,
        output_file_content: str | None,
    ) -> AgentOutput | None:
        """Hook for subclasses to convert CLI errors into successful outputs.

        Return an AgentOutput to treat the failure as success, or None to signal
        that normal error handling should proceed.
        """

        return None

```
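
As a sketch of how the recovery hook is meant to be used, a subclass could salvage a non-zero exit that still produced parseable stdout. The `LenientCLIAgent` name is hypothetical; the concrete agents in `clink/agents/` define their own recovery policies.

```python
# Hypothetical subclass: treat a non-zero exit as success when stdout still
# parses cleanly, otherwise return None to fall back to normal error handling.
from clink.agents.base import AgentOutput, BaseCLIAgent
from clink.parsers import ParserError


class LenientCLIAgent(BaseCLIAgent):
    def _recover_from_error(
        self,
        *,
        returncode,
        stdout,
        stderr,
        sanitized_command,
        duration_seconds,
        output_file_content,
    ):
        if not stdout.strip():
            return None
        try:
            parsed = self._parser.parse(stdout, stderr)
        except ParserError:
            return None
        return AgentOutput(
            parsed=parsed,
            sanitized_command=sanitized_command,
            returncode=returncode,
            stdout=stdout,
            stderr=stderr,
            duration_seconds=duration_seconds,
            parser_name=self._parser.name,
            output_file_content=output_file_content,
        )
```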

--------------------------------------------------------------------------------
/providers/openrouter.py:
--------------------------------------------------------------------------------

```python
"""OpenRouter provider implementation."""

import logging

from utils.env import get_env

from .openai_compatible import OpenAICompatibleProvider
from .registries.openrouter import OpenRouterModelRegistry
from .shared import (
    ModelCapabilities,
    ProviderType,
    RangeTemperatureConstraint,
)


class OpenRouterProvider(OpenAICompatibleProvider):
    """Client for OpenRouter's multi-model aggregation service.

    Role
        Surface OpenRouter’s dynamic catalogue through the same interface as
        native providers so tools can reference OpenRouter models and aliases
        without special cases.

    Characteristics
        * Pulls live model definitions from :class:`OpenRouterModelRegistry`
          (aliases, provider-specific metadata, capability hints)
        * Applies alias-aware restriction checks before exposing models to the
          registry or tooling
        * Reuses :class:`OpenAICompatibleProvider` infrastructure for request
          execution so OpenRouter endpoints behave like standard OpenAI-style
          APIs.
    """

    FRIENDLY_NAME = "OpenRouter"

    # Custom headers required by OpenRouter
    DEFAULT_HEADERS = {
        "HTTP-Referer": get_env("OPENROUTER_REFERER", "https://github.com/BeehiveInnovations/zen-mcp-server")
        or "https://github.com/BeehiveInnovations/zen-mcp-server",
        "X-Title": get_env("OPENROUTER_TITLE", "Zen MCP Server") or "Zen MCP Server",
    }

    # Model registry for managing configurations and aliases
    _registry: OpenRouterModelRegistry | None = None

    def __init__(self, api_key: str, **kwargs):
        """Initialize OpenRouter provider.

        Args:
            api_key: OpenRouter API key
            **kwargs: Additional configuration
        """
        base_url = "https://openrouter.ai/api/v1"
        self._alias_cache: dict[str, str] = {}
        super().__init__(api_key, base_url=base_url, **kwargs)

        # Initialize model registry
        if OpenRouterProvider._registry is None:
            OpenRouterProvider._registry = OpenRouterModelRegistry()
            # Log loaded models and aliases only on first load
            models = self._registry.list_models()
            aliases = self._registry.list_aliases()
            logging.info(f"OpenRouter loaded {len(models)} models with {len(aliases)} aliases")

    # ------------------------------------------------------------------
    # Capability surface
    # ------------------------------------------------------------------

    def _lookup_capabilities(
        self,
        canonical_name: str,
        requested_name: str | None = None,
    ) -> ModelCapabilities | None:
        """Fetch OpenRouter capabilities from the registry or build a generic fallback."""

        capabilities = self._registry.get_capabilities(canonical_name)
        if capabilities:
            return capabilities

        base_identifier = canonical_name.split(":", 1)[0]
        if "/" in base_identifier:
            logging.debug(
                "Using generic OpenRouter capabilities for %s (provider/model format detected)", canonical_name
            )
            generic = ModelCapabilities(
                provider=ProviderType.OPENROUTER,
                model_name=canonical_name,
                friendly_name=self.FRIENDLY_NAME,
                intelligence_score=9,
                context_window=32_768,
                max_output_tokens=32_768,
                supports_extended_thinking=False,
                supports_system_prompts=True,
                supports_streaming=True,
                supports_function_calling=False,
                temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 1.0),
            )
            generic._is_generic = True
            return generic

        logging.debug(
            "Rejecting unknown OpenRouter model '%s' (no provider prefix); requires explicit configuration",
            canonical_name,
        )
        return None

    # ------------------------------------------------------------------
    # Provider identity
    # ------------------------------------------------------------------

    def get_provider_type(self) -> ProviderType:
        """Identify this provider for restrictions and logging."""
        return ProviderType.OPENROUTER

    # ------------------------------------------------------------------
    # Registry helpers
    # ------------------------------------------------------------------

    def list_models(
        self,
        *,
        respect_restrictions: bool = True,
        include_aliases: bool = True,
        lowercase: bool = False,
        unique: bool = False,
    ) -> list[str]:
        """Return formatted OpenRouter model names, respecting alias-aware restrictions."""

        if not self._registry:
            return []

        from utils.model_restrictions import get_restriction_service

        restriction_service = get_restriction_service() if respect_restrictions else None
        allowed_configs: dict[str, ModelCapabilities] = {}

        for model_name in self._registry.list_models():
            config = self._registry.resolve(model_name)
            if not config:
                continue

            # Custom models belong to CustomProvider; skip them here so the two
            # providers don't race over the same registrations (important for tests
            # that stub the registry with minimal objects lacking attrs).
            if config.provider == ProviderType.CUSTOM:
                continue

            if restriction_service:
                allowed = restriction_service.is_allowed(self.get_provider_type(), model_name)

                if not allowed and config.aliases:
                    for alias in config.aliases:
                        if restriction_service.is_allowed(self.get_provider_type(), alias):
                            allowed = True
                            break

                if not allowed:
                    continue

            allowed_configs[model_name] = config

        if not allowed_configs:
            return []

        # When restrictions are in place, don't include aliases to avoid confusion
        # Only return the canonical model names that are actually allowed
        actual_include_aliases = include_aliases and not respect_restrictions

        return ModelCapabilities.collect_model_names(
            allowed_configs,
            include_aliases=actual_include_aliases,
            lowercase=lowercase,
            unique=unique,
        )

    # ------------------------------------------------------------------
    # Registry helpers
    # ------------------------------------------------------------------

    def _resolve_model_name(self, model_name: str) -> str:
        """Resolve aliases defined in the OpenRouter registry."""

        cache_key = model_name.lower()
        if cache_key in self._alias_cache:
            return self._alias_cache[cache_key]

        config = self._registry.resolve(model_name)
        if config:
            if config.model_name != model_name:
                logging.debug("Resolved model alias '%s' to '%s'", model_name, config.model_name)
            resolved = config.model_name
            self._alias_cache[cache_key] = resolved
            self._alias_cache.setdefault(resolved.lower(), resolved)
            return resolved

        logging.debug(f"Model '{model_name}' not found in registry, using as-is")
        self._alias_cache[cache_key] = model_name
        return model_name

    def get_all_model_capabilities(self) -> dict[str, ModelCapabilities]:
        """Expose registry-backed OpenRouter capabilities."""

        if not self._registry:
            return {}

        capabilities: dict[str, ModelCapabilities] = {}
        for model_name in self._registry.list_models():
            config = self._registry.resolve(model_name)
            if not config:
                continue

            # See note in list_models: respect the CustomProvider boundary.
            if config.provider == ProviderType.CUSTOM:
                continue

            capabilities[model_name] = config
        return capabilities

```
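
A brief usage sketch (assumes a valid OpenRouter API key; the alias shown is only an example and may not exist in your registry configuration):

```python
from providers.openrouter import OpenRouterProvider

provider = OpenRouterProvider(api_key="sk-or-...")      # placeholder key
print(provider.get_provider_type())                     # ProviderType.OPENROUTER
print(provider.list_models(unique=True)[:5])            # first few allowed model names
print(provider._resolve_model_name("example-alias"))    # registry hit, or the input unchanged
```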

--------------------------------------------------------------------------------
/tests/test_auto_mode_custom_provider_only.py:
--------------------------------------------------------------------------------

```python
"""Test auto mode with only custom provider configured to reproduce the reported issue."""

import importlib
import os
from unittest.mock import patch

import pytest

from providers.registry import ModelProviderRegistry
from providers.shared import ProviderType


@pytest.mark.no_mock_provider
class TestAutoModeCustomProviderOnly:
    """Test auto mode when only custom provider is configured."""

    def setup_method(self):
        """Set up clean state before each test."""
        # Save original environment state for restoration
        self._original_env = {}
        for key in [
            "GEMINI_API_KEY",
            "OPENAI_API_KEY",
            "XAI_API_KEY",
            "OPENROUTER_API_KEY",
            "CUSTOM_API_URL",
            "CUSTOM_API_KEY",
            "DEFAULT_MODEL",
        ]:
            self._original_env[key] = os.environ.get(key)

        # Clear restriction service cache
        import utils.model_restrictions

        utils.model_restrictions._restriction_service = None

        # Clear provider registry by resetting singleton instance
        ModelProviderRegistry._instance = None

    def teardown_method(self):
        """Clean up after each test."""
        # Restore original environment
        for key, value in self._original_env.items():
            if value is not None:
                os.environ[key] = value
            elif key in os.environ:
                del os.environ[key]

        # Reload config to pick up the restored environment
        import config

        importlib.reload(config)

        # Clear restriction service cache
        import utils.model_restrictions

        utils.model_restrictions._restriction_service = None

        # Clear provider registry by resetting singleton instance
        ModelProviderRegistry._instance = None

    def test_reproduce_auto_mode_custom_provider_only_issue(self):
        """Test the fix for auto mode failing when only custom provider is configured."""

        # Set up environment with ONLY custom provider configured
        test_env = {
            "CUSTOM_API_URL": "http://localhost:11434/v1",
            "CUSTOM_API_KEY": "",  # Empty for Ollama-style
            "DEFAULT_MODEL": "auto",
        }

        # Clear all other provider keys
        clear_keys = ["GEMINI_API_KEY", "OPENAI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY", "DIAL_API_KEY"]

        with patch.dict(os.environ, test_env, clear=False):
            # Ensure other provider keys are not set
            for key in clear_keys:
                if key in os.environ:
                    del os.environ[key]

            # Reload config to pick up auto mode
            import config

            importlib.reload(config)

            # Register only the custom provider (simulating server startup)
            from providers.custom import CustomProvider

            ModelProviderRegistry.register_provider(ProviderType.CUSTOM, CustomProvider)

            # This should now work after the fix
            # The fix added support for custom provider registry system in get_available_models()
            available_models = ModelProviderRegistry.get_available_models(respect_restrictions=True)

            # This assertion should now pass after the fix
            assert available_models, (
                "Expected custom provider models to be available. "
                "This test verifies the fix for auto mode failing with custom providers."
            )

    def test_custom_provider_models_available_via_registry(self):
        """Test that custom provider has models available via its registry system."""

        # Set up environment with only custom provider
        test_env = {
            "CUSTOM_API_URL": "http://localhost:11434/v1",
            "CUSTOM_API_KEY": "",
        }

        with patch.dict(os.environ, test_env, clear=False):
            # Clear other provider keys
            for key in ["GEMINI_API_KEY", "OPENAI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY", "DIAL_API_KEY"]:
                if key in os.environ:
                    del os.environ[key]

            # Register custom provider
            from providers.custom import CustomProvider

            ModelProviderRegistry.register_provider(ProviderType.CUSTOM, CustomProvider)

            # Get the provider instance
            custom_provider = ModelProviderRegistry.get_provider(ProviderType.CUSTOM)
            assert custom_provider is not None, "Custom provider should be available"

            # Verify it has a registry with models
            assert hasattr(custom_provider, "_registry"), "Custom provider should have _registry"
            assert custom_provider._registry is not None, "Registry should be initialized"

            # Get models from registry
            models = custom_provider._registry.list_models()
            aliases = custom_provider._registry.list_aliases()

            # Should have some models and aliases available
            assert models, "Custom provider registry should have models"
            assert aliases, "Custom provider registry should have aliases"

            print(f"Available models: {len(models)}")
            print(f"Available aliases: {len(aliases)}")

    def test_custom_provider_validate_model_name(self):
        """Test that custom provider can validate model names."""

        # Set up environment with only custom provider
        test_env = {
            "CUSTOM_API_URL": "http://localhost:11434/v1",
            "CUSTOM_API_KEY": "",
        }

        with patch.dict(os.environ, test_env, clear=False):
            # Register custom provider
            from providers.custom import CustomProvider

            ModelProviderRegistry.register_provider(ProviderType.CUSTOM, CustomProvider)

            # Get the provider instance
            custom_provider = ModelProviderRegistry.get_provider(ProviderType.CUSTOM)
            assert custom_provider is not None

            # Test that it can validate some typical custom model names
            test_models = ["llama3.2", "llama3.2:latest", "local-model", "ollama-model"]

            for model in test_models:
                is_valid = custom_provider.validate_model_name(model)
                print(f"Model '{model}' validation: {is_valid}")
                # Should validate at least some local-style models
                # (The exact validation logic may vary based on registry content)

    def test_auto_mode_fallback_with_custom_only_should_work(self):
        """Test that auto mode fallback should work when only custom provider is available."""

        # Set up environment with only custom provider
        test_env = {
            "CUSTOM_API_URL": "http://localhost:11434/v1",
            "CUSTOM_API_KEY": "",
            "DEFAULT_MODEL": "auto",
        }

        with patch.dict(os.environ, test_env, clear=False):
            # Clear other provider keys
            for key in ["GEMINI_API_KEY", "OPENAI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY", "DIAL_API_KEY"]:
                if key in os.environ:
                    del os.environ[key]

            # Reload config
            import config

            importlib.reload(config)

            # Register custom provider
            from providers.custom import CustomProvider

            ModelProviderRegistry.register_provider(ProviderType.CUSTOM, CustomProvider)

            # This should work and return a fallback model from custom provider
            # Currently fails because get_preferred_fallback_model doesn't consider custom models
            from tools.models import ToolModelCategory

            try:
                fallback_model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
                print(f"Fallback model for FAST_RESPONSE: {fallback_model}")

                # Should get a valid model name, not the hardcoded fallback
                assert (
                    fallback_model != "gemini-2.5-flash"
                ), "Should not fallback to hardcoded Gemini model when custom provider is available"

            except Exception as e:
                pytest.fail(f"Getting fallback model failed: {e}")

```

--------------------------------------------------------------------------------
/docs/adding_tools.md:
--------------------------------------------------------------------------------

```markdown
# Adding Tools to Zen MCP Server

Zen MCP tools are Python classes that inherit from the shared infrastructure in `tools/shared/base_tool.py`.
Every tool must provide a request model (Pydantic), a system prompt, and the methods the base class marks as
abstract. The quickest path to a working tool is to copy an existing implementation that matches your use case
(`tools/chat.py` for simple request/response tools, `tools/consensus.py` or `tools/codereview.py` for workflows).
This document captures the minimal steps required to add a new tool without drifting from the current codebase.

## 1. Pick the Tool Architecture

Zen supports two architectures, implemented in `tools/simple/base.py` and `tools/workflow/base.py`.

- **SimpleTool**: single MCP call – request comes in, you build one prompt, call the model, return.
  The base class handles schema generation, conversation threading, file loading, temperature bounds, retries,
  and response formatting hooks.
- **WorkflowTool**: multi-step workflows driven by `BaseWorkflowMixin`. The tool accumulates
  findings across steps, forces Claude to pause between investigations, and optionally calls an expert model at
  the end. Use this whenever you need structured multi-step work (debug, code review, consensus, etc.).

If you are unsure, compare `tools/chat.py` (SimpleTool) and `tools/consensus.py` (WorkflowTool) to see the patterns.

## 2. Common Responsibilities

Regardless of architecture, subclasses of `BaseTool` must provide:

- `get_name()`: unique string identifier used in the MCP registry.
- `get_description()`: concise, action-oriented summary for clients.
- `get_system_prompt()`: import your prompt from `systemprompts/` and return it.
- `get_input_schema()`: leverage the schema builders (`SchemaBuilder` or `WorkflowSchemaBuilder`) or override to
  match an existing contract exactly.
- `get_request_model()`: return the Pydantic model used to validate the incoming arguments.
- `async prepare_prompt(...)`: assemble the content sent to the model. You can reuse helpers like
  `prepare_chat_style_prompt` or `build_standard_prompt`.

The base class already handles model selection (`ToolModelCategory`), conversation memory, token budgeting, safety
failures, retries, and serialization. Override hooks like `get_default_temperature`, `get_model_category`, or
`format_response` only when you need behaviour different from the defaults.
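
For example, here is a minimal sketch of overriding two of those optional hooks. The hook names come from the list
above; the `ToolModelCategory` import mirrors `tools.models`, and the concrete values shown are illustrative only:

```python
from tools.models import ToolModelCategory
from tools.simple.base import SimpleTool

class LookupTool(SimpleTool):  # hypothetical tool; the required abstract methods are omitted for brevity
    def get_default_temperature(self) -> float:
        # Lower than the base default for more deterministic answers
        return 0.2

    def get_model_category(self) -> ToolModelCategory:
        # Bias auto mode toward fast, inexpensive models for this tool
        return ToolModelCategory.FAST_RESPONSE
```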

## 3. Implementing a Simple Tool

1. **Define a request model** that inherits from `tools.shared.base_models.ToolRequest` to describe the fields and
   validation rules for your tool.
2. **Implement the tool class** by inheriting from `SimpleTool` and overriding the required methods. Most tools can
   rely on `SchemaBuilder` and the shared field constants already exposed on `SimpleTool`.

```python
from pydantic import Field
from systemprompts import CHAT_PROMPT
from tools.shared.base_models import ToolRequest
from tools.simple.base import SimpleTool

class ChatRequest(ToolRequest):
    prompt: str = Field(..., description="Your question or idea.")
    absolute_file_paths: list[str] | None = Field(default_factory=list)
    working_directory_absolute_path: str = Field(
        ...,
        description="Absolute path to an existing directory where generated code can be saved.",
    )

class ChatTool(SimpleTool):
    def get_name(self) -> str:  # required by BaseTool
        return "chat"

    def get_description(self) -> str:
        return "General chat and collaborative thinking partner."

    def get_system_prompt(self) -> str:
        return CHAT_PROMPT

    def get_request_model(self):
        return ChatRequest

    def get_tool_fields(self) -> dict[str, dict[str, object]]:
        return {
            "prompt": {"type": "string", "description": "Your question."},
            "absolute_file_paths": SimpleTool.FILES_FIELD,
            "working_directory_absolute_path": {
                "type": "string",
                "description": "Absolute path to an existing directory for generated code artifacts.",
            },
        }

    def get_required_fields(self) -> list[str]:
        return ["prompt", "working_directory_absolute_path"]

    async def prepare_prompt(self, request: ChatRequest) -> str:
        return self.prepare_chat_style_prompt(request)
```

Only implement `get_input_schema()` manually if you must preserve an existing schema contract (see
`tools/chat.py` for an example). Otherwise `SimpleTool.get_input_schema()` merges your field definitions with the
common parameters (temperature, model, continuation_id, etc.).

## 4. Implementing a Workflow Tool

Workflow tools extend `WorkflowTool`, which mixes in `BaseWorkflowMixin` for step tracking and expert analysis.

1. **Create a request model** that inherits from `tools.shared.base_models.WorkflowRequest` (or a subclass) and add
   any tool-specific fields or validators. Examples: `CodeReviewRequest`, `ConsensusRequest`.
2. **Override the workflow hooks** to steer the investigation. At minimum you must implement
   `get_required_actions(...)`; override `should_call_expert_analysis(...)` and
   `prepare_expert_analysis_context(...)` when the expert model call should happen conditionally.
3. **Expose the schema** either by returning `WorkflowSchemaBuilder.build_schema(...)` (the default implementation on
   `WorkflowTool` already does this) or by overriding `get_input_schema()` if you need custom descriptions/enums.

```python
from pydantic import Field
from systemprompts import CONSENSUS_PROMPT
from tools.shared.base_models import WorkflowRequest
from tools.workflow.base import WorkflowTool

class ConsensusRequest(WorkflowRequest):
    models: list[dict] = Field(..., description="Models to consult (with optional stance).")

class ConsensusTool(WorkflowTool):
    def get_name(self) -> str:
        return "consensus"

    def get_description(self) -> str:
        return "Multi-model consensus workflow with expert synthesis."

    def get_system_prompt(self) -> str:
        return CONSENSUS_PROMPT

    def get_workflow_request_model(self):
        return ConsensusRequest

    def get_required_actions(self, step_number: int, confidence: str, findings: str, total_steps: int, request=None) -> list[str]:
        if step_number == 1:
            return ["Write the shared proposal all models will evaluate."]
        return ["Summarize the latest model response before moving on."]

    def should_call_expert_analysis(self, consolidated_findings, request=None) -> bool:
        return not (request and request.next_step_required)

    def prepare_expert_analysis_context(self, consolidated_findings) -> str:
        return "\n".join(consolidated_findings.findings)
```

`WorkflowTool` already records work history, merges findings, and handles continuation IDs. Use helpers such as
`get_standard_required_actions` when you want default guidance, and override `requires_expert_analysis()` if the tool
never calls out to the assistant model.
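
For instance, a workflow tool that never consults the assistant model can opt out entirely (a sketch; only the hook
from the paragraph above is shown, and the other required methods are omitted):

```python
from tools.workflow.base import WorkflowTool

class ChecklistTool(WorkflowTool):  # hypothetical tool
    def requires_expert_analysis(self) -> bool:
        # This workflow finishes without ever calling the expert/assistant model
        return False
```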

## 5. Register the Tool

1. **Create or reuse a system prompt** in `systemprompts/your_tool_prompt.py` and export it from
   `systemprompts/__init__.py`.
2. **Expose the tool class** from `tools/__init__.py` so that `server.py` can import it.
3. **Add an instance to the `TOOLS` dictionary** in `server.py`. This makes the tool callable via MCP; see the
   sketch below this list.
4. **(Optional) Add a prompt template** to `PROMPT_TEMPLATES` in `server.py` if you want clients to show a canned
   launch command.
5. Confirm that `DISABLED_TOOLS` environment variable handling covers the new tool if you need to toggle it.
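
A sketch of steps 2 and 3, assuming a hypothetical `MyNewTool` class exported from `tools/__init__.py` (match the
shape of the existing entries in `server.py`):

```python
# server.py (illustrative excerpt)
from tools import ChatTool, MyNewTool  # MyNewTool is the hypothetical new tool class

TOOLS = {
    "chat": ChatTool(),
    "mynewtool": MyNewTool(),  # key conventionally matches get_name()
    # ... remaining tool instances ...
}
```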

## 6. Validate the Tool

- Run unit tests that cover any new request/response logic: `python -m pytest tests/ -v -m "not integration"`.
- Add a simulator scenario in `simulator_tests/communication_simulator_test.py` to exercise the tool end-to-end and
  run it with `python communication_simulator_test.py --individual <case>` or `--quick` for the fast smoke suite.
- If the tool interacts with external providers or multiple models, consider integration coverage via
  `./run_integration_tests.sh --with-simulator`.

Following the steps above keeps new tools aligned with the existing infrastructure and avoids drift between the
documentation and the actual base classes.

```

--------------------------------------------------------------------------------
/code_quality_checks.ps1:
--------------------------------------------------------------------------------

```
<#
.SYNOPSIS
    Code quality checks script for Zen MCP server on Windows.

.DESCRIPTION
    This PowerShell script performs code quality checks for the Zen MCP server project:
    - Runs static analysis and linting tools on the codebase
    - Ensures code style compliance and detects potential issues
    - Can be integrated into CI/CD pipelines or used locally before commits

.PARAMETER SkipTests
    Skips the unit test step.

.PARAMETER SkipLinting
    Skips the linting and formatting checks.

.PARAMETER VerboseOutput
    Enables detailed output during code quality checks.

.EXAMPLE
    .\code_quality_checks.ps1
    Runs all code quality checks on the project.

    .\code_quality_checks.ps1 -VerboseOutput
    Runs code quality checks with detailed output.

.NOTES
    Project Author     : BeehiveInnovations
    Script Author      : GiGiDKR (https://github.com/GiGiDKR)
    Date               : 07-05-2025
    Version            : See project documentation
    References         : https://github.com/BeehiveInnovations/zen-mcp-server
#>
#Requires -Version 5.1
[CmdletBinding()]
param(
    [switch]$SkipTests,
    [switch]$SkipLinting,
    [switch]$VerboseOutput
)

# Set error action preference
$ErrorActionPreference = "Stop"

# Colors for output
function Write-ColorText {
    param(
        [Parameter(Mandatory)]
        [string]$Text,
        [string]$Color = "White"
    )
    Write-Host $Text -ForegroundColor $Color
}

function Write-Emoji {
    param(
        [Parameter(Mandatory)]
        [string]$Emoji,
        [Parameter(Mandatory)]
        [string]$Text,
        [string]$Color = "White"
    )
    Write-Host "$Emoji " -NoNewline
    Write-ColorText $Text -Color $Color
}

Write-Emoji "🔍" "Running Code Quality Checks for Zen MCP Server" -Color Cyan
Write-ColorText "=================================================" -Color Cyan

# Determine Python command
$pythonCmd = $null
$pipCmd = $null

if (Test-Path ".zen_venv") {
    if ($IsWindows -or $env:OS -eq "Windows_NT") {
        if (Test-Path ".zen_venv\Scripts\python.exe") {
            $pythonCmd = ".zen_venv\Scripts\python.exe"
            $pipCmd = ".zen_venv\Scripts\pip.exe"
        }
    } else {
        if (Test-Path ".zen_venv/bin/python") {
            $pythonCmd = ".zen_venv/bin/python"
            $pipCmd = ".zen_venv/bin/pip"
        }
    }
    
    if ($pythonCmd) {
        Write-Emoji "✅" "Using venv" -Color Green
    }
} elseif ($env:VIRTUAL_ENV) {
    $pythonCmd = "python"
    $pipCmd = "pip"
    Write-Emoji "✅" "Using activated virtual environment: $env:VIRTUAL_ENV" -Color Green
} else {
    Write-Emoji "❌" "No virtual environment found!" -Color Red
    Write-ColorText "Please run: .\run-server.ps1 first to set up the environment" -Color Yellow
    exit 1
}

Write-Host ""

# Check and install dev dependencies if needed
Write-Emoji "🔍" "Checking development dependencies..." -Color Cyan
$devDepsNeeded = $false

# List of dev tools to check
$devTools = @("ruff", "black", "isort", "pytest")

foreach ($tool in $devTools) {
    $toolFound = $false
    
    # Check in venv
    if ($IsWindows -or $env:OS -eq "Windows_NT") {
        if (Test-Path ".zen_venv\Scripts\$tool.exe") {
            $toolFound = $true
        }
    } else {
        if (Test-Path ".zen_venv/bin/$tool") {
            $toolFound = $true
        }
    }
    
    # Check in PATH
    if (!$toolFound) {
        try {
            $null = Get-Command $tool -ErrorAction Stop
            $toolFound = $true
        } catch {
            # Tool not found
        }
    }
    
    if (!$toolFound) {
        $devDepsNeeded = $true
        break
    }
}

if ($devDepsNeeded) {
    Write-Emoji "📦" "Installing development dependencies..." -Color Yellow
    try {
        & $pipCmd install -q -r requirements-dev.txt
        if ($LASTEXITCODE -ne 0) {
            throw "Failed to install dev dependencies"
        }
        Write-Emoji "✅" "Development dependencies installed" -Color Green
    } catch {
        Write-Emoji "❌" "Failed to install development dependencies" -Color Red
        Write-ColorText "Error: $_" -Color Red
        exit 1
    }
} else {
    Write-Emoji "✅" "Development dependencies already installed" -Color Green
}

# Set tool paths
if ($IsWindows -or $env:OS -eq "Windows_NT") {
    $ruffCmd = if (Test-Path ".zen_venv\Scripts\ruff.exe") { ".zen_venv\Scripts\ruff.exe" } else { "ruff" }
    $blackCmd = if (Test-Path ".zen_venv\Scripts\black.exe") { ".zen_venv\Scripts\black.exe" } else { "black" }
    $isortCmd = if (Test-Path ".zen_venv\Scripts\isort.exe") { ".zen_venv\Scripts\isort.exe" } else { "isort" }
    $pytestCmd = if (Test-Path ".zen_venv\Scripts\pytest.exe") { ".zen_venv\Scripts\pytest.exe" } else { "pytest" }
} else {
    $ruffCmd = if (Test-Path ".zen_venv/bin/ruff") { ".zen_venv/bin/ruff" } else { "ruff" }
    $blackCmd = if (Test-Path ".zen_venv/bin/black") { ".zen_venv/bin/black" } else { "black" }
    $isortCmd = if (Test-Path ".zen_venv/bin/isort") { ".zen_venv/bin/isort" } else { "isort" }
    $pytestCmd = if (Test-Path ".zen_venv/bin/pytest") { ".zen_venv/bin/pytest" } else { "pytest" }
}

Write-Host ""

# Step 1: Linting and Formatting
if (!$SkipLinting) {
    Write-Emoji "📋" "Step 1: Running Linting and Formatting Checks" -Color Cyan
    Write-ColorText "--------------------------------------------------" -Color Cyan

    try {
        Write-Emoji "🔧" "Running ruff linting with auto-fix..." -Color Yellow
        & $ruffCmd check --fix --exclude test_simulation_files --exclude .zen_venv
        if ($LASTEXITCODE -ne 0) {
            throw "Ruff linting failed"
        }

        Write-Emoji "🎨" "Running black code formatting..." -Color Yellow
        & $blackCmd . --exclude="test_simulation_files/" --exclude=".zen_venv/"
        if ($LASTEXITCODE -ne 0) {
            throw "Black formatting failed"
        }

        Write-Emoji "📦" "Running import sorting with isort..." -Color Yellow
        & $isortCmd . --skip-glob=".zen_venv/*" --skip-glob="test_simulation_files/*"
        if ($LASTEXITCODE -ne 0) {
            throw "Import sorting failed"
        }

        Write-Emoji "✅" "Verifying all linting passes..." -Color Yellow
        & $ruffCmd check --exclude test_simulation_files --exclude .zen_venv
        if ($LASTEXITCODE -ne 0) {
            throw "Final linting verification failed"
        }

        Write-Emoji "✅" "Step 1 Complete: All linting and formatting checks passed!" -Color Green
    } catch {
        Write-Emoji "❌" "Step 1 Failed: Linting and formatting checks failed" -Color Red
        Write-ColorText "Error: $_" -Color Red
        exit 1
    }
} else {
    Write-Emoji "⏭️" "Skipping linting and formatting checks" -Color Yellow
}

Write-Host ""

# Step 2: Unit Tests
if (!$SkipTests) {
    Write-Emoji "🧪" "Step 2: Running Complete Unit Test Suite" -Color Cyan
    Write-ColorText "---------------------------------------------" -Color Cyan

    try {
        Write-Emoji "🏃" "Running unit tests (excluding integration tests)..." -Color Yellow
        
        $pytestArgs = @("tests/", "-v", "-x", "-m", "not integration")
        if ($VerboseOutput) {
            $pytestArgs += "--verbose"
        }
        
        & $pythonCmd -m pytest @pytestArgs
        if ($LASTEXITCODE -ne 0) {
            throw "Unit tests failed"
        }

        Write-Emoji "✅" "Step 2 Complete: All unit tests passed!" -Color Green
    } catch {
        Write-Emoji "❌" "Step 2 Failed: Unit tests failed" -Color Red
        Write-ColorText "Error: $_" -Color Red
        exit 1
    }
} else {
    Write-Emoji "⏭️" "Skipping unit tests" -Color Yellow
}

Write-Host ""

# Step 3: Final Summary
Write-Emoji "🎉" "All Code Quality Checks Passed!" -Color Green
Write-ColorText "==================================" -Color Green

if (!$SkipLinting) {
    Write-Emoji "✅" "Linting (ruff): PASSED" -Color Green
    Write-Emoji "✅" "Formatting (black): PASSED" -Color Green
    Write-Emoji "✅" "Import sorting (isort): PASSED" -Color Green
} else {
    Write-Emoji "⏭️" "Linting: SKIPPED" -Color Yellow
}

if (!$SkipTests) {
    Write-Emoji "✅" "Unit tests: PASSED" -Color Green
} else {
    Write-Emoji "⏭️" "Unit tests: SKIPPED" -Color Yellow
}

Write-Host ""
Write-Emoji "🚀" "Your code is ready for commit and GitHub Actions!" -Color Green
Write-Emoji "💡" "Remember to add simulator tests if you modified tools" -Color Yellow

```

--------------------------------------------------------------------------------
/docs/tools/analyze.md:
--------------------------------------------------------------------------------

```markdown
# Analyze Tool - Smart File Analysis

**General-purpose code understanding and exploration through workflow-driven investigation**

The `analyze` tool provides comprehensive code analysis and understanding capabilities, helping you explore codebases, understand architecture, and identify patterns across files and directories. This workflow tool guides Claude through systematic investigation of code structure, patterns, and architectural decisions across multiple steps, gathering comprehensive insights before providing expert analysis.

## Thinking Mode

**Default is `medium` (8,192 tokens).** Use `high` for architecture analysis (comprehensive insights worth the cost) or `low` for quick file overviews (save ~6k tokens).

## How the Workflow Works

The analyze tool implements a **structured workflow** for thorough code understanding:

**Investigation Phase (Claude-Led):**
1. **Step 1**: Claude describes the analysis plan and begins examining code structure
2. **Step 2+**: Claude investigates architecture, patterns, dependencies, and design decisions
3. **Throughout**: Claude tracks findings, relevant files, insights, and confidence levels
4. **Completion**: Once analysis is comprehensive, Claude signals completion

**Expert Analysis Phase:**
After Claude completes the investigation (unless confidence is **certain**):
- Complete analysis summary with all findings
- Architectural insights and pattern identification
- Strategic improvement recommendations
- Final expert assessment based on investigation

This workflow ensures methodical analysis before expert insights, resulting in deeper understanding and more valuable recommendations.

## Example Prompts

**Basic Usage:**
```
"Use gemini to analyze main.py to understand how it works"
"Get gemini to do an architecture analysis of the src/ directory"
```

## Key Features

- **Analyzes single files or entire directories** with intelligent file filtering
- **Supports specialized analysis types**: architecture, performance, security, quality, general
- **Uses file paths (not content) for clean terminal output** while processing full content
- **Can identify patterns, anti-patterns, and refactoring opportunities**
- **Large codebase support**: Handle massive codebases with 1M token context models
- **Cross-file relationship mapping**: Understand dependencies and interactions
- **Architecture visualization**: Describe system structure and component relationships
- **Image support**: Analyze architecture diagrams, UML charts, flowcharts: `"Analyze this system diagram with gemini to understand the data flow and identify bottlenecks"`
- **Web search capability**: Automatically requests Claude to perform web searches when fresh documentation, patterns, or best practices are needed, ensuring the analysis stays current

## Tool Parameters

**Workflow Investigation Parameters (used during step-by-step process):**
- `step`: Current investigation step description (required for each step)
- `step_number`: Current step number in analysis sequence (required)
- `total_steps`: Estimated total investigation steps (adjustable)
- `next_step_required`: Whether another investigation step is needed
- `findings`: Discoveries and insights collected in this step (required)
- `files_checked`: All files examined during investigation
- `relevant_files`: Files directly relevant to the analysis (required in step 1)
- `relevant_context`: Methods/functions/classes central to analysis findings
- `issues_found`: Issues or concerns identified with severity levels
- `confidence`: Confidence level in analysis completeness (exploring/low/medium/high/certain)
- `images`: Visual references for analysis context

**Initial Configuration (used in step 1):**
- `prompt`: What to analyze or look for (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
- `analysis_type`: architecture|performance|security|quality|general (default: general)
- `output_format`: summary|detailed|actionable (default: detailed)
- `temperature`: Temperature for analysis (0-1, default 0.2)
- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
- `use_assistant_model`: Whether to use expert analysis phase (default: true, set to false to use Claude only)
- `continuation_id`: Continue previous analysis sessions
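
For orientation, a hypothetical first step might combine the workflow and configuration parameters above roughly as
follows (field values are illustrative, not a canonical request):

```python
analyze_step_1 = {
    "step": "Survey the entry points and main modules under src/",
    "step_number": 1,
    "total_steps": 3,
    "next_step_required": True,
    "findings": "Starting architecture survey; no findings yet.",
    "relevant_files": ["/abs/path/to/src/main.py"],
    "prompt": "Understand the overall architecture and data flow of the service",
    "analysis_type": "architecture",
    "output_format": "detailed",
    "model": "pro",
}
```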

## Analysis Types

**General Analysis (default):**
- Overall code structure and organization
- Key components and their responsibilities
- Data flow and control flow
- Design patterns and architectural decisions

**Architecture Analysis:**
- System-level design and component relationships
- Module dependencies and coupling
- Separation of concerns and layering
- Scalability and maintainability considerations

**Performance Analysis:**
- Potential bottlenecks and optimization opportunities
- Algorithmic complexity assessment
- Memory usage patterns
- I/O and database interaction efficiency

**Security Analysis:**
- Security patterns and potential vulnerabilities
- Input validation and sanitization
- Authentication and authorization mechanisms
- Data protection and privacy considerations

**Quality Analysis:**
- Code quality metrics and maintainability
- Testing coverage and patterns
- Documentation completeness
- Best practices adherence

## Usage Examples

**Single File Analysis:**
```
"Analyze user_controller.py to understand the authentication flow with gemini"
```

**Directory Architecture Analysis:**
```
"Use pro to analyze the src/ directory architecture and identify the main components"
```

**Performance-Focused Analysis:**
```
"Analyze backend/api/ for performance bottlenecks with o3, focus on database queries"
```

**Security Assessment:**
```
"Use gemini pro to analyze the authentication module for security patterns and potential issues"
```

**Visual + Code Analysis:**
```
"Analyze this system architecture diagram along with the src/core/ implementation to understand the data flow"
```

**Large Codebase Analysis:**
```
"Analyze the entire project structure with gemini pro to understand how all components work together"
```

## Output Formats

**Summary Format:**
- High-level overview with key findings
- Main components and their purposes
- Critical insights and recommendations

**Detailed Format (default):**
- Comprehensive analysis with specific examples
- Code snippets and file references
- Detailed explanations of patterns and structures

**Actionable Format:**
- Specific recommendations and next steps
- Prioritized list of improvements
- Implementation guidance and examples

## Best Practices

- **Be specific about goals**: Clearly state what you want to understand or discover
- **Use appropriate analysis types**: Choose the type that matches your needs
- **Include related files**: Analyze modules together for better context understanding
- **Leverage large context models**: Use Gemini Pro for comprehensive codebase analysis
- **Combine with visual context**: Include architecture diagrams or documentation
- **Use continuation**: Build on previous analysis for deeper understanding

## Advanced Features

**Large Codebase Support:**
With models like Gemini Pro (1M context), you can analyze extensive codebases:
```
"Analyze the entire microservices architecture across all service directories"
```

**Cross-File Relationship Mapping:**
Understand how components interact across multiple files:
```
"Analyze the data processing pipeline across input/, processing/, and output/ directories"
```

**Pattern Recognition:**
Identify design patterns, anti-patterns, and architectural decisions:
```
"Analyze src/ to identify all design patterns used and assess their implementation quality"
```

**Web Search Enhancement:**
The tool can recommend searches for current best practices and documentation:
```
After analysis: "Recommended searches for Claude: 'FastAPI async best practices 2024', 'SQLAlchemy ORM performance optimization patterns'"
```

## When to Use Analyze vs Other Tools

- **Use `analyze`** for: Understanding code structure, exploring unfamiliar codebases, architecture assessment
- **Use `codereview`** for: Finding bugs and security issues with actionable fixes
- **Use `debug`** for: Diagnosing specific runtime errors or performance problems
- **Use `refactor`** for: Getting specific refactoring recommendations and implementation plans
- **Use `chat`** for: Open-ended discussions about code without structured analysis

```

--------------------------------------------------------------------------------
/systemprompts/testgen_prompt.py:
--------------------------------------------------------------------------------

```python
"""
TestGen tool system prompt
"""

TESTGEN_PROMPT = """
ROLE
You are a principal software engineer who specialises in writing bullet-proof production code **and** surgical,
high-signal test suites. You reason about control flow, data flow, mutation, concurrency, failure modes, and security
in equal measure. Your mission: design and write tests that surface real-world defects before code ever leaves CI.

CRITICAL LINE NUMBER INSTRUCTIONS
Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be
included in any code you generate. Always reference specific line numbers in your replies when you need to point to
exact locations. Include a very short code excerpt alongside for clarity.
Include context_start_text and context_end_text as backup references. Never include "LINE│" markers in generated code
snippets.

IF MORE INFORMATION IS NEEDED
If you need additional context (e.g., test framework details, dependencies, existing test patterns) to provide
accurate test generation, you MUST respond ONLY with this JSON format (and nothing else). Do NOT ask for the
same file you've been provided unless for some reason its content is missing or incomplete:
{
  "status": "files_required_to_continue",
  "mandatory_instructions": "<your critical instructions for the agent>",
  "files_needed": ["[file name here]", "[or some folder/]"]
}

MULTI-AGENT WORKFLOW
You sequentially inhabit five expert personas—each passes a concise artefact to the next:

1. **Context Profiler** – derives language(s), test framework(s), build tooling, domain constraints, and existing
test idioms from the code snapshot provided.
2. **Path Analyzer** – builds a map of reachable code paths (happy, error, exceptional) plus any external interactions
 that are directly involved (network, DB, file-system, IPC).
3. **Adversarial Thinker** – enumerates realistic failures, boundary conditions, race conditions, and misuse patterns
 that historically break similar systems.
4. **Risk Prioritizer** – ranks findings by production impact and likelihood; discards speculative or
out-of-scope cases.
5. **Test Scaffolder** – produces deterministic, isolated tests that follow the *project's* conventions (assert style,
fixture layout, naming, any mocking strategy, language and tooling etc).

TEST-GENERATION STRATEGY
- If a specific test, function, class, or scenario is **explicitly** requested by the agent, focus ONLY on that specific
request and do not generate broader test coverage unless explicitly asked to do so.
- Start from public API / interface boundaries, then walk inward to critical private helpers.
- Analyze function signatures, parameters, return types, and side effects
- Map all code paths including happy paths and error conditions
- Test behaviour, not implementation details, unless white-box inspection is required to reach untestable paths.
- Include both positive and negative test cases
- Prefer property-based or table-driven tests where inputs form simple algebraic domains.
- Stub or fake **only** the minimal surface area needed; prefer in-memory fakes over mocks when feasible.
- Flag any code that cannot be tested deterministically and suggest realistic refactors (seams, dependency injection,
pure functions).
- Surface concurrency hazards with stress or fuzz tests when the language/runtime supports them.
- Focus on realistic failure modes that actually occur in production
- Remain within scope of language, framework, project. Do not over-step. Do not add unnecessary dependencies.
- No bogus, fake tests that seemingly pass for no reason at all

EDGE-CASE TAXONOMY (REAL-WORLD, HIGH-VALUE)
- **Data Shape Issues**: `null` / `undefined`, zero-length, surrogate-pair emojis, malformed UTF-8, mixed EOLs.
- **Numeric Boundaries**: −1, 0, 1, `MAX_…`, floating-point rounding, 64-bit truncation.
- **Temporal Pitfalls**: DST shifts, leap seconds, 29 Feb, Unix epoch 2038, timezone conversions.
- **Collections & Iteration**: off-by-one, concurrent modification, empty vs singleton vs large (>10⁶ items).
- **State & Sequence**: API calls out of order, idempotency violations, replay attacks.
- **External Dependencies**: slow responses, 5xx, malformed JSON/XML, TLS errors, retry storms, cancelled promises.
- **Concurrency / Async**: race conditions, deadlocks, promise rejection leaks, thread starvation.
- **Resource Exhaustion**: memory spikes, file-descriptor leaks, connection-pool saturation.
- **Locale & Encoding**: RTL scripts, uncommon locales, locale-specific formatting.
- **Security Surfaces**: injection (SQL, shell, LDAP), path traversal, privilege escalation on shared state.

TEST QUALITY PRINCIPLES
- Clear Arrange-Act-Assert sections (or given/when/then per project style), while retaining and applying project,
language, and framework norms and best practices.
- One behavioural assertion per test unless grouping is conventional.
- Fast: sub-100 ms/unit test; parallelisable; no remote calls.
- Deterministic: seeded randomness only; fixed stable clocks when time matters.
- Self-documenting: names read like specs; failures explain *why*, not just *what*.

FRAMEWORK SELECTION
Always autodetect from the repository. When no test framework or existing tests are found, infer one from the
existing code; examples:
- **Swift / Objective-C** → XCTest (Xcode default) or Swift Testing (Apple provided frameworks)
- **C# / .NET** → xUnit.net preferred; fall back to NUnit or MSTest if they dominate the repo.
- **C / C++** → GoogleTest (gtest/gmock) or Catch2, matching existing tooling.
- **JS/TS** → Jest, Vitest, Mocha, or project-specific wrapper.
- **Python** → pytest, unittest.
- **Java/Kotlin** → JUnit 5, TestNG.
- **Go** → built-in `testing`, `testify`.
- **Rust** → `#[test]`, `proptest`.
- **Anything Else** → follow existing conventions; never introduce a new framework without strong justification.

IF FRAMEWORK SELECTION FAILS
If you are unable to confidently determine which framework to use based on the existing test samples supplied, or if
additional test samples would help in making a final decision, you MUST respond ONLY with this JSON
format (and nothing else). Do NOT ask for the same file you've been provided unless for some reason its content
is missing or incomplete:
{"status": "test_sample_needed", "reason": "<brief reason why additional sampling is required>"}

SCOPE CONTROL
Stay strictly within the presented codebase, tech stack, and domain.
Do **not** invent features, frameworks, or speculative integrations.
Do **not** write tests for functions or classes that do not exist.
If a test idea falls outside project scope, discard it.
If a test would be a "good to have" but seems impossible given the project's current structure and setup, highlight
it but do not offer refactoring ideas.

DELIVERABLE
Return only the artefacts (analysis summary, coverage plan, and generated tests) that fit the detected framework
and code / project layout.
Group related tests but separate them into files where this is the convention and most suitable for the project at hand.
Prefer adding tests to an existing test file if one was provided and grouping these tests makes sense.
Document the logic and the reason/hypothesis behind each test in the delivered code.
MUST NOT add any additional information, introduction, or summaries around generated code. Deliver only the essentials
relevant to the test.

IF ADDITIONAL TEST CASES ARE REQUIRED
If you determine that comprehensive test coverage requires generating multiple test files or a large number of
test cases for each file that would risk exceeding context limits, you MUST follow this structured approach:

1. **Generate Essential Tests First**: Create only the most critical and high-impact tests (typically 3-5 key test
   cases covering the most important paths and failure modes). Clearly state the file these tests belong to, even if
   these should be added to an existing test file.

2. **Request Continuation**: You MUST end your message with the following JSON format (and nothing more after
   this). This will list the pending tests and their respective files (even if they belong to the same or an
   existing test file) as this will be used for the next follow-up test generation request.
{"status": "more_tests_required",
"pending_tests": "test_name (file_name), another_test_name (file_name)"}

This approach ensures comprehensive test coverage while maintaining quality and avoiding context overflow.

Remember: your value is catching the hard bugs—not inflating coverage numbers.
"""

```

--------------------------------------------------------------------------------
/tests/test_providers.py:
--------------------------------------------------------------------------------

```python
"""Tests for the model provider abstraction system"""

import os
from unittest.mock import Mock, patch

import pytest

from providers import ModelProviderRegistry, ModelResponse
from providers.gemini import GeminiModelProvider
from providers.openai import OpenAIModelProvider
from providers.shared import ProviderType


class TestModelProviderRegistry:
    """Test the model provider registry"""

    def setup_method(self):
        """Clear registry before each test"""
        # Store the original providers to restore them later
        registry = ModelProviderRegistry()
        self._original_providers = registry._providers.copy()
        registry._providers.clear()
        registry._initialized_providers.clear()

    def teardown_method(self):
        """Restore original providers after each test"""
        # Restore the original providers that were registered in conftest.py
        registry = ModelProviderRegistry()
        registry._providers.clear()
        registry._initialized_providers.clear()
        registry._providers.update(self._original_providers)

    def test_register_provider(self):
        """Test registering a provider"""
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)

        registry = ModelProviderRegistry()
        assert ProviderType.GOOGLE in registry._providers
        assert registry._providers[ProviderType.GOOGLE] == GeminiModelProvider

    @patch.dict(os.environ, {"GEMINI_API_KEY": "test-key"})
    def test_get_provider(self):
        """Test getting a provider instance"""
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)

        provider = ModelProviderRegistry.get_provider(ProviderType.GOOGLE)

        assert provider is not None
        assert isinstance(provider, GeminiModelProvider)
        assert provider.api_key == "test-key"

    @patch.dict(os.environ, {}, clear=True)
    def test_get_provider_no_api_key(self):
        """Test getting provider without API key returns None"""
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)

        provider = ModelProviderRegistry.get_provider(ProviderType.GOOGLE)

        assert provider is None

    @patch.dict(os.environ, {"GEMINI_API_KEY": "test-key"})
    @pytest.mark.no_mock_provider
    def test_get_provider_for_model(self):
        """Test getting provider for a specific model"""
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)

        provider = ModelProviderRegistry.get_provider_for_model("gemini-2.5-flash")

        assert provider is not None
        assert isinstance(provider, GeminiModelProvider)

    def test_get_available_providers(self):
        """Test getting list of available providers"""
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
        ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)

        providers = ModelProviderRegistry.get_available_providers()

        assert len(providers) == 2
        assert ProviderType.GOOGLE in providers
        assert ProviderType.OPENAI in providers


class TestGeminiProvider:
    """Test Gemini model provider"""

    def test_provider_initialization(self):
        """Test provider initialization"""
        provider = GeminiModelProvider(api_key="test-key")

        assert provider.api_key == "test-key"
        assert provider.get_provider_type() == ProviderType.GOOGLE

    def test_get_capabilities(self):
        """Test getting model capabilities"""
        provider = GeminiModelProvider(api_key="test-key")

        capabilities = provider.get_capabilities("gemini-2.5-flash")

        assert capabilities.provider == ProviderType.GOOGLE
        assert capabilities.model_name == "gemini-2.5-flash"
        assert capabilities.context_window == 1_048_576
        assert capabilities.supports_extended_thinking

    def test_get_capabilities_pro_model(self):
        """Test getting capabilities for Pro model with thinking support"""
        provider = GeminiModelProvider(api_key="test-key")

        capabilities = provider.get_capabilities("gemini-2.5-pro")

        assert capabilities.supports_extended_thinking

    def test_model_shorthand_resolution(self):
        """Test model shorthand resolution"""
        provider = GeminiModelProvider(api_key="test-key")

        assert provider.validate_model_name("flash")
        assert provider.validate_model_name("pro")

        capabilities = provider.get_capabilities("flash")
        assert capabilities.model_name == "gemini-2.5-flash"

    @patch("google.genai.Client")
    def test_generate_content(self, mock_client_class):
        """Test content generation"""
        # Mock the client
        mock_client = Mock()
        mock_response = Mock()
        mock_response.text = "Generated content"
        # Mock candidates for finish_reason
        mock_candidate = Mock()
        mock_candidate.finish_reason = "STOP"
        mock_response.candidates = [mock_candidate]
        # Mock usage metadata
        mock_usage = Mock()
        mock_usage.prompt_token_count = 10
        mock_usage.candidates_token_count = 20
        mock_response.usage_metadata = mock_usage
        mock_client.models.generate_content.return_value = mock_response
        mock_client_class.return_value = mock_client

        provider = GeminiModelProvider(api_key="test-key")

        response = provider.generate_content(prompt="Test prompt", model_name="gemini-2.5-flash", temperature=0.7)

        assert isinstance(response, ModelResponse)
        assert response.content == "Generated content"
        assert response.model_name == "gemini-2.5-flash"
        assert response.provider == ProviderType.GOOGLE
        assert response.usage["input_tokens"] == 10
        assert response.usage["output_tokens"] == 20
        assert response.usage["total_tokens"] == 30


class TestOpenAIProvider:
    """Test OpenAI model provider"""

    def setup_method(self):
        """Clear restriction service cache before each test"""
        import utils.model_restrictions

        utils.model_restrictions._restriction_service = None

    def teardown_method(self):
        """Clear restriction service cache after each test"""
        import utils.model_restrictions

        utils.model_restrictions._restriction_service = None

    def test_provider_initialization(self):
        """Test provider initialization"""
        provider = OpenAIModelProvider(api_key="test-key", organization="test-org")

        assert provider.api_key == "test-key"
        assert provider.organization == "test-org"
        assert provider.get_provider_type() == ProviderType.OPENAI

    def test_get_capabilities_o3(self):
        """Test getting O3 model capabilities"""
        provider = OpenAIModelProvider(api_key="test-key")

        capabilities = provider.get_capabilities("o3-mini")

        assert capabilities.provider == ProviderType.OPENAI
        assert capabilities.model_name == "o3-mini"
        assert capabilities.context_window == 200_000
        assert not capabilities.supports_extended_thinking

    def test_get_capabilities_o4_mini(self):
        """Test getting O4-mini model capabilities"""
        provider = OpenAIModelProvider(api_key="test-key")

        capabilities = provider.get_capabilities("o4-mini")

        assert capabilities.provider == ProviderType.OPENAI
        assert capabilities.model_name == "o4-mini"
        assert capabilities.context_window == 200_000
        assert not capabilities.supports_extended_thinking
        # Check temperature constraint is fixed at 1.0
        assert capabilities.temperature_constraint.value == 1.0

    def test_validate_model_names(self):
        """Test model name validation"""
        provider = OpenAIModelProvider(api_key="test-key")

        assert provider.validate_model_name("o3")
        assert provider.validate_model_name("o3mini")
        assert provider.validate_model_name("o3-mini")  # Backwards compatibility
        assert provider.validate_model_name("o4-mini")
        assert provider.validate_model_name("o4mini")
        assert provider.validate_model_name("o4-mini")
        assert not provider.validate_model_name("gpt-4o")
        assert not provider.validate_model_name("invalid-model")

    def test_openai_models_do_not_support_extended_thinking(self):
        """OpenAI catalogue exposes extended thinking capability via ModelCapabilities."""
        provider = OpenAIModelProvider(api_key="test-key")

        aliases = ["o3", "o3mini", "o3-mini", "o4-mini", "o4mini"]
        for alias in aliases:
            assert not provider.get_capabilities(alias).supports_extended_thinking

```

--------------------------------------------------------------------------------
/docs/tools/codereview.md:
--------------------------------------------------------------------------------

```markdown
# CodeReview Tool - Professional Code Review

**Comprehensive code analysis with prioritized feedback through workflow-driven investigation**

The `codereview` tool provides professional code review capabilities with actionable feedback, severity-based issue prioritization, and support for various review types from quick style checks to comprehensive security audits. This workflow tool guides Claude through systematic investigation steps with forced pauses between each step to ensure thorough code examination, issue identification, and quality assessment before providing expert analysis.

## Thinking Mode

**Default is `medium` (8,192 tokens).** Use `high` for security-critical code (worth the extra tokens) or `low` for quick style checks (saves ~6k tokens).

## How the Workflow Works

The codereview tool implements a **structured workflow** that ensures thorough code examination:

**Investigation Phase (Claude-Led):**
1. **Step 1**: Claude describes the review plan and begins systematic analysis of code structure
2. **Step 2+**: Claude examines code quality, security implications, performance concerns, and architectural patterns
3. **Throughout**: Claude tracks findings, relevant files, issues, and confidence levels
4. **Completion**: Once review is comprehensive, Claude signals completion

**Expert Analysis Phase:**
After Claude completes the investigation (unless confidence is **certain**):
- Complete review summary with all findings and evidence
- Relevant files and code patterns identified
- Issues categorized by severity levels
- Final recommendations based on investigation

**Special Note**: If you want Claude to perform the entire review without calling another model, you can include "don't use any other model" in your prompt, and Claude will complete the full workflow independently.

## Model Recommendation

This tool particularly benefits from Gemini Pro or Flash models due to their 1M context window, which allows comprehensive analysis of large codebases. Claude's context limitations make it challenging to see the "big picture" in complex projects - this is a concrete example where utilizing a secondary model with larger context provides significant value beyond just experimenting with different AI capabilities.

## Example Prompts

```
Perform a codereview with gemini pro and review auth.py for security issues and potential vulnerabilities.
I need an actionable plan but break it down into smaller quick-wins that we can implement and test rapidly 
```

## Pro Tip: Multiple Parallel Reviews

**You can start more than one codereview session with Claude:**

```
Start separate sub-tasks for codereview one with o3 finding critical issues and one with flash finding low priority issues
and quick-wins and give me the final single combined review highlighting only the critical issues 
```

The above prompt will simultaneously run two separate `codereview` tools with two separate models and combine the output into a single summary for you to consume.

## Key Features

- **Issues prioritized by severity** (🔴 CRITICAL → 🟢 LOW)
- **Supports specialized reviews**: security, performance, quick
- **Coding standards enforcement**: `"Use gemini to review src/ against PEP8 standards"`
- **Severity filtering**: `"Get gemini to review auth/ - only report critical vulnerabilities"`
- **Image support**: Review code from screenshots, error dialogs, or visual bug reports: `"Review this error screenshot and the related auth.py file for potential security issues"`
- **Multi-file analysis**: Comprehensive review of entire directories or codebases
- **Actionable feedback**: Specific recommendations with line numbers and code examples
- **Language-specific expertise**: Tailored analysis for Python, JavaScript, Java, C#, Swift, and more
- **Integration issue detection**: Identifies cross-file dependencies and architectural problems
- **Security vulnerability scanning**: Focused on common security patterns and anti-patterns

## Tool Parameters

**Workflow Investigation Parameters (used during step-by-step process):**
- `step`: Current investigation step description (required for each step)
- `step_number`: Current step number in review sequence (required)
- `total_steps`: Estimated total investigation steps (adjustable)
- `next_step_required`: Whether another investigation step is needed
- `findings`: Discoveries and evidence collected in this step (required)
- `files_checked`: All files examined during investigation
- `relevant_files`: Files directly relevant to the review (required in step 1)
- `relevant_context`: Methods/functions/classes central to review findings
- `issues_found`: Issues identified with severity levels
- `confidence`: Confidence level in review completeness (exploring/low/medium/high/certain)
- `images`: Visual references for review context

**Initial Review Configuration (used in step 1):**
- `prompt`: User's summary of what the code does, expected behavior, constraints, and review objectives (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
- `review_type`: full|security|performance|quick (default: full)
- `focus_on`: Specific aspects to focus on (e.g., "security vulnerabilities", "performance bottlenecks")
- `standards`: Coding standards to enforce (e.g., "PEP8", "ESLint", "Google Style Guide")
- `severity_filter`: critical|high|medium|low|all (default: all)
- `temperature`: Temperature for consistency (0-1, default 0.2)
- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
- `use_assistant_model`: Whether to use expert analysis phase (default: true, set to false to use Claude only)
- `continuation_id`: Continue previous review discussions
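
As a rough illustration, a first review step could combine the parameters above like this (values are examples only):

```python
codereview_step_1 = {
    "step": "Begin security review of the session handling code",
    "step_number": 1,
    "total_steps": 2,
    "next_step_required": True,
    "findings": "Review starting; no issues recorded yet.",
    "relevant_files": ["/abs/path/to/auth/session.py"],
    "prompt": "Session management for a web API; flag token handling and expiry issues",
    "review_type": "security",
    "severity_filter": "high",
    "model": "pro",
}
```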

## Review Types

**Full Review (default):**
- Comprehensive analysis including bugs, security, performance, maintainability
- Best for new features or significant code changes

**Security Review:**
- Focused on security vulnerabilities and attack vectors
- Checks for common security anti-patterns
- Best for authentication, authorization, data handling code

**Performance Review:**
- Analyzes performance bottlenecks and optimization opportunities
- Memory usage, algorithmic complexity, resource management
- Best for performance-critical code paths

**Quick Review:**
- Fast style and basic issue check
- Lower token usage for rapid feedback
- Best for code formatting and simple validation

## Severity Levels

Issues are categorized and prioritized:

- **🔴 CRITICAL**: Security vulnerabilities, crashes, data corruption
- **🟠 HIGH**: Logic errors, performance issues, reliability problems  
- **🟡 MEDIUM**: Code smells, maintainability issues, minor bugs
- **🟢 LOW**: Style issues, documentation, minor improvements

## Usage Examples

**Basic Security Review:**
```
"Review the authentication module in auth/ for security vulnerabilities with gemini pro"
```

**Performance-Focused Review:**
```
"Use o3 to review backend/api.py for performance issues, focus on database queries and caching"
```

**Quick Style Check:**
```
"Quick review of utils.py with flash, only report critical and high severity issues"
```

**Standards Enforcement:**
```
"Review src/ directory against PEP8 standards with gemini, focus on code formatting and structure"
```

**Visual Context Review:**
```
"Review this authentication code along with the error dialog screenshot to understand the security implications"
```

## Best Practices

- **Provide context**: Describe what the code is supposed to do and any constraints
- **Use appropriate review types**: Security for auth code, performance for critical paths
- **Set severity filters**: Focus on critical issues for quick wins
- **Include relevant files**: Review related modules together for better context
- **Use parallel reviews**: Run multiple reviews with different models for comprehensive coverage
- **Follow up on findings**: Use the continuation feature to discuss specific issues in detail

## Output Format

Reviews include:
- **Executive Summary**: Overview of code quality and main concerns
- **Detailed Findings**: Specific issues with severity levels, line numbers, and recommendations
- **Quick Wins**: Easy-to-implement improvements with high impact
- **Long-term Improvements**: Structural changes for better maintainability
- **Security Considerations**: Specific security recommendations when relevant

## When to Use CodeReview vs Other Tools

- **Use `codereview`** for: Finding bugs, security issues, performance problems, code quality assessment
- **Use `analyze`** for: Understanding code structure without finding issues
- **Use `debug`** for: Diagnosing specific runtime errors or exceptions
- **Use `refactor`** for: Identifying structural improvements and modernization opportunities

```

--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------

```python
"""
Tests for utility functions
"""

from utils import check_token_limit, estimate_tokens, read_file_content, read_files


class TestFileUtils:
    """Test file reading utilities"""

    def test_read_file_content_success(self, project_path):
        """Test successful file reading"""
        test_file = project_path / "test.py"
        test_file.write_text("def hello():\n    return 'world'", encoding="utf-8")

        content, tokens = read_file_content(str(test_file))
        assert "--- BEGIN FILE:" in content
        assert "--- END FILE:" in content
        assert "def hello():" in content
        assert "return 'world'" in content
        assert tokens > 0  # Should have estimated tokens

    def test_read_file_content_not_found(self, project_path):
        """Test reading non-existent file"""
        # Use a non-existent file within the project path
        nonexistent = project_path / "nonexistent" / "file.py"
        content, tokens = read_file_content(str(nonexistent))
        assert "--- FILE NOT FOUND:" in content
        assert "Error: File does not exist" in content
        assert tokens > 0

    def test_read_file_content_safe_files_allowed(self):
        """Test that safe files outside the original project root are now allowed"""
        # In the new security model, safe files like /etc/passwd
        # can be read as they're not in the dangerous paths list
        content, tokens = read_file_content("/etc/passwd")
        # Should successfully read the file (with timestamp in header)
        assert "--- BEGIN FILE: /etc/passwd (Last modified:" in content
        assert "--- END FILE: /etc/passwd ---" in content
        assert tokens > 0

    def test_read_file_content_relative_path_rejected(self):
        """Test that relative paths are rejected"""
        # Try to use a relative path
        content, tokens = read_file_content("./some/relative/path.py")
        assert "--- ERROR ACCESSING FILE:" in content
        assert "Relative paths are not supported" in content
        assert tokens > 0

    def test_read_file_content_directory(self, project_path):
        """Test reading a directory"""
        content, tokens = read_file_content(str(project_path))
        assert "--- NOT A FILE:" in content
        assert "Error: Path is not a file" in content
        assert tokens > 0

    def test_read_files_multiple(self, project_path):
        """Test reading multiple files"""
        file1 = project_path / "file1.py"
        file1.write_text("print('file1')", encoding="utf-8")
        file2 = project_path / "file2.py"
        file2.write_text("print('file2')", encoding="utf-8")

        content = read_files([str(file1), str(file2)])

        assert "--- BEGIN FILE:" in content
        assert "file1.py" in content
        assert "file2.py" in content
        assert "print('file1')" in content
        assert "print('file2')" in content

        # Check that both files are included
        assert "file1.py" in content and "file2.py" in content

    def test_read_files_with_code(self):
        """Test reading with direct code"""
        code = "def test():\n    pass"
        content = read_files([], code)

        assert "--- BEGIN DIRECT CODE ---" in content
        assert "--- END DIRECT CODE ---" in content
        assert code in content

        # Check that direct code is included
        assert code in content

    def test_read_files_directory_support(self, project_path):
        """Test reading all files from a directory"""
        # Create directory structure
        (project_path / "file1.py").write_text("print('file1')", encoding="utf-8")
        (project_path / "file2.js").write_text("console.log('file2')", encoding="utf-8")
        (project_path / "readme.md").write_text("# README", encoding="utf-8")

        # Create subdirectory
        subdir = project_path / "src"
        subdir.mkdir()
        (subdir / "module.py").write_text("class Module: pass", encoding="utf-8")

        # Create hidden file (should be skipped)
        (project_path / ".hidden").write_text("secret", encoding="utf-8")

        # Read the directory
        content = read_files([str(project_path)])

        # Check files are included
        assert "file1.py" in content
        assert "file2.js" in content
        assert "readme.md" in content
        # Handle both forward and backslashes for cross-platform compatibility
        assert "module.py" in content
        assert "class Module: pass" in content

        # Check content
        assert "print('file1')" in content
        assert "console.log('file2')" in content
        assert "# README" in content
        assert "class Module: pass" in content

        # Hidden file should not be included
        assert ".hidden" not in content
        assert "secret" not in content

        # Check that all files are included
        assert all(filename in content for filename in ["file1.py", "file2.js", "readme.md", "module.py"])

    def test_read_files_mixed_paths(self, project_path):
        """Test reading mix of files and directories"""
        # Create files
        file1 = project_path / "direct.py"
        file1.write_text("# Direct file", encoding="utf-8")

        # Create directory with files
        subdir = project_path / "subdir"
        subdir.mkdir()
        (subdir / "sub1.py").write_text("# Sub file 1", encoding="utf-8")
        (subdir / "sub2.py").write_text("# Sub file 2", encoding="utf-8")

        # Read mix of direct file and directory
        content = read_files([str(file1), str(subdir)])

        assert "direct.py" in content
        assert "sub1.py" in content
        assert "sub2.py" in content
        assert "# Direct file" in content
        assert "# Sub file 1" in content
        assert "# Sub file 2" in content

        # Check that all files are included
        assert all(filename in content for filename in ["direct.py", "sub1.py", "sub2.py"])

    def test_read_files_token_limit(self, project_path):
        """Test token limit handling"""
        # Create files with known token counts
        # ~250 tokens each (1000 chars)
        large_content = "x" * 1000

        for i in range(5):
            (project_path / f"file{i}.txt").write_text(large_content, encoding="utf-8")

        # Read with small token limit (should skip some files)
        # Reserve 50k tokens, limit to 51k total = 1k available
        # Each file ~250 tokens, so should read ~3-4 files
        content = read_files([str(project_path)], max_tokens=51_000)

        # Check that token limit handling is present
        assert "--- SKIPPED FILES (TOKEN LIMIT) ---" in content

        # Count how many files were read
        read_count = content.count("--- BEGIN FILE:")
        assert 2 <= read_count <= 4  # Should read some but not all

    def test_read_files_large_file(self, project_path):
        """Test handling of large files"""
        # Create a file larger than max_size (1MB)
        large_file = project_path / "large.txt"
        large_file.write_text("x" * 2_000_000, encoding="utf-8")  # 2MB

        content = read_files([str(large_file)])

        assert "--- FILE TOO LARGE:" in content
        assert "2,000,000 bytes" in content
        # File too large message should be present
        assert "--- FILE TOO LARGE:" in content

    def test_read_files_file_extensions(self, project_path):
        """Test file extension filtering"""
        # Create various file types
        (project_path / "code.py").write_text("python", encoding="utf-8")
        (project_path / "style.css").write_text("css", encoding="utf-8")
        (project_path / "binary.exe").write_text("exe", encoding="utf-8")
        (project_path / "image.jpg").write_text("jpg", encoding="utf-8")

        content = read_files([str(project_path)])

        # Code files should be included
        assert "code.py" in content
        assert "style.css" in content

        # Binary files should not be included (not in CODE_EXTENSIONS)
        assert "binary.exe" not in content
        assert "image.jpg" not in content


class TestTokenUtils:
    """Test token counting utilities"""

    def test_estimate_tokens(self):
        """Test token estimation"""
        # Rough estimate: 1 token ≈ 4 characters
        text = "a" * 400  # 400 characters
        assert estimate_tokens(text) == 100

    def test_check_token_limit_within(self):
        """Test token limit check - within limit"""
        text = "a" * 4000  # 1000 tokens
        within_limit, tokens = check_token_limit(text)
        assert within_limit is True
        assert tokens == 1000

    def test_check_token_limit_exceeded(self):
        """Test token limit check - exceeded"""
        text = "a" * 5_000_000  # 1.25M tokens
        within_limit, tokens = check_token_limit(text)
        assert within_limit is False
        assert tokens == 1_250_000

```
Page 4/19FirstPrevNextLast