This is page 16 of 19. Use http://codebase.md/beehiveinnovations/gemini-mcp-server?page={x} to view the full context.

# Directory Structure

```
├── .claude
│   ├── commands
│   │   └── fix-github-issue.md
│   └── settings.json
├── .coveragerc
├── .dockerignore
├── .env.example
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.yml
│   │   ├── config.yml
│   │   ├── documentation.yml
│   │   ├── feature_request.yml
│   │   └── tool_addition.yml
│   ├── pull_request_template.md
│   └── workflows
│       ├── docker-pr.yml
│       ├── docker-release.yml
│       ├── semantic-pr.yml
│       ├── semantic-release.yml
│       └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── AGENTS.md
├── CHANGELOG.md
├── claude_config_example.json
├── CLAUDE.md
├── clink
│   ├── __init__.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── claude.py
│   │   ├── codex.py
│   │   └── gemini.py
│   ├── constants.py
│   ├── models.py
│   ├── parsers
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── claude.py
│   │   ├── codex.py
│   │   └── gemini.py
│   └── registry.py
├── code_quality_checks.ps1
├── code_quality_checks.sh
├── communication_simulator_test.py
├── conf
│   ├── __init__.py
│   ├── azure_models.json
│   ├── cli_clients
│   │   ├── claude.json
│   │   ├── codex.json
│   │   └── gemini.json
│   ├── custom_models.json
│   ├── dial_models.json
│   ├── gemini_models.json
│   ├── openai_models.json
│   ├── openrouter_models.json
│   └── xai_models.json
├── config.py
├── docker
│   ├── README.md
│   └── scripts
│       ├── build.ps1
│       ├── build.sh
│       ├── deploy.ps1
│       ├── deploy.sh
│       └── healthcheck.py
├── docker-compose.yml
├── Dockerfile
├── docs
│   ├── adding_providers.md
│   ├── adding_tools.md
│   ├── advanced-usage.md
│   ├── ai_banter.md
│   ├── ai-collaboration.md
│   ├── azure_openai.md
│   ├── configuration.md
│   ├── context-revival.md
│   ├── contributions.md
│   ├── custom_models.md
│   ├── docker-deployment.md
│   ├── gemini-setup.md
│   ├── getting-started.md
│   ├── index.md
│   ├── locale-configuration.md
│   ├── logging.md
│   ├── model_ranking.md
│   ├── testing.md
│   ├── tools
│   │   ├── analyze.md
│   │   ├── apilookup.md
│   │   ├── challenge.md
│   │   ├── chat.md
│   │   ├── clink.md
│   │   ├── codereview.md
│   │   ├── consensus.md
│   │   ├── debug.md
│   │   ├── docgen.md
│   │   ├── listmodels.md
│   │   ├── planner.md
│   │   ├── precommit.md
│   │   ├── refactor.md
│   │   ├── secaudit.md
│   │   ├── testgen.md
│   │   ├── thinkdeep.md
│   │   ├── tracer.md
│   │   └── version.md
│   ├── troubleshooting.md
│   ├── vcr-testing.md
│   └── wsl-setup.md
├── examples
│   ├── claude_config_macos.json
│   └── claude_config_wsl.json
├── LICENSE
├── providers
│   ├── __init__.py
│   ├── azure_openai.py
│   ├── base.py
│   ├── custom.py
│   ├── dial.py
│   ├── gemini.py
│   ├── openai_compatible.py
│   ├── openai.py
│   ├── openrouter.py
│   ├── registries
│   │   ├── __init__.py
│   │   ├── azure.py
│   │   ├── base.py
│   │   ├── custom.py
│   │   ├── dial.py
│   │   ├── gemini.py
│   │   ├── openai.py
│   │   ├── openrouter.py
│   │   └── xai.py
│   ├── registry_provider_mixin.py
│   ├── registry.py
│   ├── shared
│   │   ├── __init__.py
│   │   ├── model_capabilities.py
│   │   ├── model_response.py
│   │   ├── provider_type.py
│   │   └── temperature.py
│   └── xai.py
├── pyproject.toml
├── pytest.ini
├── README.md
├── requirements-dev.txt
├── requirements.txt
├── run_integration_tests.ps1
├── run_integration_tests.sh
├── run-server.ps1
├── run-server.sh
├── scripts
│   └── sync_version.py
├── server.py
├── simulator_tests
│   ├── __init__.py
│   ├── base_test.py
│   ├── conversation_base_test.py
│   ├── log_utils.py
│   ├── test_analyze_validation.py
│   ├── test_basic_conversation.py
│   ├── test_chat_simple_validation.py
│   ├── test_codereview_validation.py
│   ├── test_consensus_conversation.py
│   ├── test_consensus_three_models.py
│   ├── test_consensus_workflow_accurate.py
│   ├── test_content_validation.py
│   ├── test_conversation_chain_validation.py
│   ├── test_cross_tool_comprehensive.py
│   ├── test_cross_tool_continuation.py
│   ├── test_debug_certain_confidence.py
│   ├── test_debug_validation.py
│   ├── test_line_number_validation.py
│   ├── test_logs_validation.py
│   ├── test_model_thinking_config.py
│   ├── test_o3_model_selection.py
│   ├── test_o3_pro_expensive.py
│   ├── test_ollama_custom_url.py
│   ├── test_openrouter_fallback.py
│   ├── test_openrouter_models.py
│   ├── test_per_tool_deduplication.py
│   ├── test_planner_continuation_history.py
│   ├── test_planner_validation_old.py
│   ├── test_planner_validation.py
│   ├── test_precommitworkflow_validation.py
│   ├── test_prompt_size_limit_bug.py
│   ├── test_refactor_validation.py
│   ├── test_secaudit_validation.py
│   ├── test_testgen_validation.py
│   ├── test_thinkdeep_validation.py
│   ├── test_token_allocation_validation.py
│   ├── test_vision_capability.py
│   └── test_xai_models.py
├── systemprompts
│   ├── __init__.py
│   ├── analyze_prompt.py
│   ├── chat_prompt.py
│   ├── clink
│   │   ├── codex_codereviewer.txt
│   │   ├── default_codereviewer.txt
│   │   ├── default_planner.txt
│   │   └── default.txt
│   ├── codereview_prompt.py
│   ├── consensus_prompt.py
│   ├── debug_prompt.py
│   ├── docgen_prompt.py
│   ├── generate_code_prompt.py
│   ├── planner_prompt.py
│   ├── precommit_prompt.py
│   ├── refactor_prompt.py
│   ├── secaudit_prompt.py
│   ├── testgen_prompt.py
│   ├── thinkdeep_prompt.py
│   └── tracer_prompt.py
├── tests
│   ├── __init__.py
│   ├── CASSETTE_MAINTENANCE.md
│   ├── conftest.py
│   ├── gemini_cassettes
│   │   ├── chat_codegen
│   │   │   └── gemini25_pro_calculator
│   │   │       └── mldev.json
│   │   ├── chat_cross
│   │   │   └── step1_gemini25_flash_number
│   │   │       └── mldev.json
│   │   └── consensus
│   │       └── step2_gemini25_flash_against
│   │           └── mldev.json
│   ├── http_transport_recorder.py
│   ├── mock_helpers.py
│   ├── openai_cassettes
│   │   ├── chat_cross_step2_gpt5_reminder.json
│   │   ├── chat_gpt5_continuation.json
│   │   ├── chat_gpt5_moon_distance.json
│   │   ├── consensus_step1_gpt5_for.json
│   │   └── o3_pro_basic_math.json
│   ├── pii_sanitizer.py
│   ├── sanitize_cassettes.py
│   ├── test_alias_target_restrictions.py
│   ├── test_auto_mode_comprehensive.py
│   ├── test_auto_mode_custom_provider_only.py
│   ├── test_auto_mode_model_listing.py
│   ├── test_auto_mode_provider_selection.py
│   ├── test_auto_mode.py
│   ├── test_auto_model_planner_fix.py
│   ├── test_azure_openai_provider.py
│   ├── test_buggy_behavior_prevention.py
│   ├── test_cassette_semantic_matching.py
│   ├── test_challenge.py
│   ├── test_chat_codegen_integration.py
│   ├── test_chat_cross_model_continuation.py
│   ├── test_chat_openai_integration.py
│   ├── test_chat_simple.py
│   ├── test_clink_claude_agent.py
│   ├── test_clink_claude_parser.py
│   ├── test_clink_codex_agent.py
│   ├── test_clink_gemini_agent.py
│   ├── test_clink_gemini_parser.py
│   ├── test_clink_integration.py
│   ├── test_clink_parsers.py
│   ├── test_clink_tool.py
│   ├── test_collaboration.py
│   ├── test_config.py
│   ├── test_consensus_integration.py
│   ├── test_consensus_schema.py
│   ├── test_consensus.py
│   ├── test_conversation_continuation_integration.py
│   ├── test_conversation_field_mapping.py
│   ├── test_conversation_file_features.py
│   ├── test_conversation_memory.py
│   ├── test_conversation_missing_files.py
│   ├── test_custom_openai_temperature_fix.py
│   ├── test_custom_provider.py
│   ├── test_debug.py
│   ├── test_deploy_scripts.py
│   ├── test_dial_provider.py
│   ├── test_directory_expansion_tracking.py
│   ├── test_disabled_tools.py
│   ├── test_docker_claude_desktop_integration.py
│   ├── test_docker_config_complete.py
│   ├── test_docker_healthcheck.py
│   ├── test_docker_implementation.py
│   ├── test_docker_mcp_validation.py
│   ├── test_docker_security.py
│   ├── test_docker_volume_persistence.py
│   ├── test_file_protection.py
│   ├── test_gemini_token_usage.py
│   ├── test_image_support_integration.py
│   ├── test_image_validation.py
│   ├── test_integration_utf8.py
│   ├── test_intelligent_fallback.py
│   ├── test_issue_245_simple.py
│   ├── test_large_prompt_handling.py
│   ├── test_line_numbers_integration.py
│   ├── test_listmodels_restrictions.py
│   ├── test_listmodels.py
│   ├── test_mcp_error_handling.py
│   ├── test_model_enumeration.py
│   ├── test_model_metadata_continuation.py
│   ├── test_model_resolution_bug.py
│   ├── test_model_restrictions.py
│   ├── test_o3_pro_output_text_fix.py
│   ├── test_o3_temperature_fix_simple.py
│   ├── test_openai_compatible_token_usage.py
│   ├── test_openai_provider.py
│   ├── test_openrouter_provider.py
│   ├── test_openrouter_registry.py
│   ├── test_parse_model_option.py
│   ├── test_per_tool_model_defaults.py
│   ├── test_pii_sanitizer.py
│   ├── test_pip_detection_fix.py
│   ├── test_planner.py
│   ├── test_precommit_workflow.py
│   ├── test_prompt_regression.py
│   ├── test_prompt_size_limit_bug_fix.py
│   ├── test_provider_retry_logic.py
│   ├── test_provider_routing_bugs.py
│   ├── test_provider_utf8.py
│   ├── test_providers.py
│   ├── test_rate_limit_patterns.py
│   ├── test_refactor.py
│   ├── test_secaudit.py
│   ├── test_server.py
│   ├── test_supported_models_aliases.py
│   ├── test_thinking_modes.py
│   ├── test_tools.py
│   ├── test_tracer.py
│   ├── test_utf8_localization.py
│   ├── test_utils.py
│   ├── test_uvx_resource_packaging.py
│   ├── test_uvx_support.py
│   ├── test_workflow_file_embedding.py
│   ├── test_workflow_metadata.py
│   ├── test_workflow_prompt_size_validation_simple.py
│   ├── test_workflow_utf8.py
│   ├── test_xai_provider.py
│   ├── transport_helpers.py
│   └── triangle.png
├── tools
│   ├── __init__.py
│   ├── analyze.py
│   ├── apilookup.py
│   ├── challenge.py
│   ├── chat.py
│   ├── clink.py
│   ├── codereview.py
│   ├── consensus.py
│   ├── debug.py
│   ├── docgen.py
│   ├── listmodels.py
│   ├── models.py
│   ├── planner.py
│   ├── precommit.py
│   ├── refactor.py
│   ├── secaudit.py
│   ├── shared
│   │   ├── __init__.py
│   │   ├── base_models.py
│   │   ├── base_tool.py
│   │   ├── exceptions.py
│   │   └── schema_builders.py
│   ├── simple
│   │   ├── __init__.py
│   │   └── base.py
│   ├── testgen.py
│   ├── thinkdeep.py
│   ├── tracer.py
│   ├── version.py
│   └── workflow
│       ├── __init__.py
│       ├── base.py
│       ├── schema_builders.py
│       └── workflow_mixin.py
├── utils
│   ├── __init__.py
│   ├── client_info.py
│   ├── conversation_memory.py
│   ├── env.py
│   ├── file_types.py
│   ├── file_utils.py
│   ├── image_utils.py
│   ├── model_context.py
│   ├── model_restrictions.py
│   ├── security_config.py
│   ├── storage_backend.py
│   └── token_utils.py
└── zen-mcp-server
```

# Files

--------------------------------------------------------------------------------
/tools/simple/base.py:
--------------------------------------------------------------------------------

```python
"""
Base class for simple MCP tools.

Simple tools follow a straightforward pattern:
1. Receive request
2. Prepare prompt (with absolute file paths, context, etc.)
3. Call AI model
4. Format and return response

They use the shared SchemaBuilder for consistent schema generation
and inherit all the conversation, file processing, and model handling
capabilities from BaseTool.
"""

from abc import abstractmethod
from typing import Any, Optional

from tools.shared.base_models import ToolRequest
from tools.shared.base_tool import BaseTool
from tools.shared.exceptions import ToolExecutionError
from tools.shared.schema_builders import SchemaBuilder


class SimpleTool(BaseTool):
    """
    Base class for simple (non-workflow) tools.

    Simple tools are request/response tools that don't require multi-step workflows.
    They benefit from:
    - Automatic schema generation using SchemaBuilder
    - Inherited conversation handling and file processing
    - Standardized model integration
    - Consistent error handling and response formatting

    To create a simple tool:
    1. Inherit from SimpleTool
    2. Implement get_tool_fields() to define tool-specific fields
    3. Implement prepare_prompt() for prompt preparation
    4. Optionally override format_response() for custom formatting
    5. Optionally override get_required_fields() for custom requirements

    Example:
        class ChatTool(SimpleTool):
            def get_name(self) -> str:
                return "chat"

            def get_tool_fields(self) -> dict[str, dict[str, Any]]:
                return {
                    "prompt": {
                        "type": "string",
                        "description": "Your question or idea...",
                    },
                    "absolute_file_paths": SimpleTool.FILES_FIELD,
                }

            def get_required_fields(self) -> list[str]:
                return ["prompt"]
    """

    # Common field definitions that simple tools can reuse
    FILES_FIELD = SchemaBuilder.SIMPLE_FIELD_SCHEMAS["absolute_file_paths"]
    IMAGES_FIELD = SchemaBuilder.COMMON_FIELD_SCHEMAS["images"]

    @abstractmethod
    def get_tool_fields(self) -> dict[str, dict[str, Any]]:
        """
        Return tool-specific field definitions.

        This method should return a dictionary mapping field names to their
        JSON schema definitions. Common fields (model, temperature, etc.)
        are added automatically by the base class.

        Returns:
            Dict mapping field names to JSON schema objects

        Example:
            return {
                "prompt": {
                    "type": "string",
                    "description": "The user's question or request",
                },
                "absolute_file_paths": SimpleTool.FILES_FIELD,  # Reuse common field
                "max_tokens": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "Maximum tokens for response",
                }
            }
        """
        pass

    def get_required_fields(self) -> list[str]:
        """
        Return list of required field names.

        Override this to specify which fields are required for your tool.
        The model field is automatically added if in auto mode.

        Returns:
            List of required field names
        """
        return []

    def get_annotations(self) -> Optional[dict[str, Any]]:
        """
        Return tool annotations. Simple tools are read-only by default.

        All simple tools perform operations without modifying the environment.
        They may call external AI models for analysis or conversation, but they
        don't write files or make system changes.

        Override this method if your simple tool needs different annotations.

        Returns:
            Dictionary with readOnlyHint set to True
        """
        return {"readOnlyHint": True}

    def format_response(self, response: str, request, model_info: Optional[dict] = None) -> str:
        """
        Format the AI response before returning to the client.

        This is a hook method that subclasses can override to customize
        response formatting. The default implementation returns the response as-is.

        Args:
            response: The raw response from the AI model
            request: The validated request object
            model_info: Optional model information dictionary

        Returns:
            Formatted response string
        """
        return response

    def get_input_schema(self) -> dict[str, Any]:
        """
        Generate the complete input schema using SchemaBuilder.

        This method automatically combines:
        - Tool-specific fields from get_tool_fields()
        - Common fields (temperature, thinking_mode, etc.)
        - Model field with proper auto-mode handling
        - Required fields from get_required_fields()

        Tools can override this method for custom schema generation while
        still benefiting from SimpleTool's convenience methods.

        Returns:
            Complete JSON schema for the tool
        """
        required_fields = list(self.get_required_fields())
        return SchemaBuilder.build_schema(
            tool_specific_fields=self.get_tool_fields(),
            required_fields=required_fields,
            model_field_schema=self.get_model_field_schema(),
            auto_mode=self.is_effective_auto_mode(),
        )

    def get_request_model(self):
        """
        Return the request model class.

        Simple tools use the base ToolRequest by default.
        Override this if your tool needs a custom request model.
        """
        return ToolRequest

    # Hook methods for safe attribute access without hasattr/getattr

    def get_request_model_name(self, request) -> Optional[str]:
        """Get model name from request. Override for custom model name handling."""
        try:
            return request.model
        except AttributeError:
            return None

    def get_request_images(self, request) -> list:
        """Get images from request. Override for custom image handling."""
        try:
            return request.images if request.images is not None else []
        except AttributeError:
            return []

    def get_request_continuation_id(self, request) -> Optional[str]:
        """Get continuation_id from request. Override for custom continuation handling."""
        try:
            return request.continuation_id
        except AttributeError:
            return None

    def get_request_prompt(self, request) -> str:
        """Get prompt from request. Override for custom prompt handling."""
        try:
            return request.prompt
        except AttributeError:
            return ""

    def get_request_temperature(self, request) -> Optional[float]:
        """Get temperature from request. Override for custom temperature handling."""
        try:
            return request.temperature
        except AttributeError:
            return None

    def get_validated_temperature(self, request, model_context: Any) -> tuple[float, list[str]]:
        """
        Get temperature from request and validate it against model constraints.

        This is a convenience method that combines temperature extraction and validation
        for simple tools. It ensures temperature is within valid range for the model.

        Args:
            request: The request object containing temperature
            model_context: Model context object containing model info

        Returns:
            Tuple of (validated_temperature, warning_messages)
        """
        temperature = self.get_request_temperature(request)
        if temperature is None:
            temperature = self.get_default_temperature()
        return self.validate_and_correct_temperature(temperature, model_context)

    def get_request_thinking_mode(self, request) -> Optional[str]:
        """Get thinking_mode from request. Override for custom thinking mode handling."""
        try:
            return request.thinking_mode
        except AttributeError:
            return None

    def get_request_files(self, request) -> list:
        """Get absolute file paths from request. Override for custom file handling."""
        try:
            files = request.absolute_file_paths
        except AttributeError:
            files = None
        if files is None:
            return []
        return files

    def get_request_as_dict(self, request) -> dict:
        """Convert request to dictionary. Override for custom serialization."""
        try:
            # Try Pydantic v2 method first
            return request.model_dump()
        except AttributeError:
            try:
                # Fall back to Pydantic v1 method
                return request.dict()
            except AttributeError:
                # Last resort - convert to dict manually
                return {"prompt": self.get_request_prompt(request)}

    def set_request_files(self, request, files: list) -> None:
        """Set absolute file paths on request. Override for custom file setting."""
        try:
            request.absolute_file_paths = files
        except AttributeError:
            pass

    def get_actually_processed_files(self) -> list:
        """Get actually processed files. Override for custom file tracking."""
        try:
            return self._actually_processed_files
        except AttributeError:
            return []

    async def execute(self, arguments: dict[str, Any]) -> list:
        """
        Execute the simple tool using the comprehensive flow from old base.py.

        This method replicates the proven execution pattern while using SimpleTool hooks.
        """
        import logging

        from mcp.types import TextContent

        from tools.models import ToolOutput

        logger = logging.getLogger(f"tools.{self.get_name()}")

        try:
            # Store arguments for access by helper methods
            self._current_arguments = arguments

            logger.info(f"🔧 {self.get_name()} tool called with arguments: {list(arguments.keys())}")

            # Validate request using the tool's Pydantic model
            request_model = self.get_request_model()
            request = request_model(**arguments)
            logger.debug(f"Request validation successful for {self.get_name()}")

            # Validate file paths for security
            # This prevents path traversal attacks and ensures proper access control
            path_error = self._validate_file_paths(request)
            if path_error:
                error_output = ToolOutput(
                    status="error",
                    content=path_error,
                    content_type="text",
                )
                logger.error("Path validation failed for %s: %s", self.get_name(), path_error)
                raise ToolExecutionError(error_output.model_dump_json())

            # Handle model resolution like old base.py
            model_name = self.get_request_model_name(request)
            if not model_name:
                from config import DEFAULT_MODEL

                model_name = DEFAULT_MODEL

            # Store the current model name for later use
            self._current_model_name = model_name

            # Handle model context from arguments (for in-process testing)
            if "_model_context" in arguments:
                self._model_context = arguments["_model_context"]
                logger.debug(f"{self.get_name()}: Using model context from arguments")
            else:
                # Create model context if not provided
                from utils.model_context import ModelContext

                self._model_context = ModelContext(model_name)
                logger.debug(f"{self.get_name()}: Created model context for {model_name}")

            # Get images if present
            images = self.get_request_images(request)
            continuation_id = self.get_request_continuation_id(request)

            # Handle conversation history and prompt preparation
            if continuation_id:
                # Check if conversation history is already embedded
                field_value = self.get_request_prompt(request)
                if "=== CONVERSATION HISTORY ===" in field_value:
                    # Use pre-embedded history
                    prompt = field_value
                    logger.debug(f"{self.get_name()}: Using pre-embedded conversation history")
                else:
                    # No embedded history - reconstruct it (for in-process calls)
                    logger.debug(f"{self.get_name()}: No embedded history found, reconstructing conversation")

                    # Get thread context
                    from utils.conversation_memory import add_turn, build_conversation_history, get_thread

                    thread_context = get_thread(continuation_id)

                    if thread_context:
                        # Add user's new input to conversation
                        user_prompt = self.get_request_prompt(request)
                        user_files = self.get_request_files(request)
                        if user_prompt:
                            add_turn(continuation_id, "user", user_prompt, files=user_files)

                            # Get updated thread context after adding the turn
                            thread_context = get_thread(continuation_id)
                            logger.debug(
                                f"{self.get_name()}: Retrieved updated thread with {len(thread_context.turns)} turns"
                            )

                        # Build conversation history with updated thread context
                        conversation_history, conversation_tokens = build_conversation_history(
                            thread_context, self._model_context
                        )

                        # Get the base prompt from the tool
                        base_prompt = await self.prepare_prompt(request)

                        # Combine with conversation history
                        if conversation_history:
                            prompt = f"{conversation_history}\n\n=== NEW USER INPUT ===\n{base_prompt}"
                        else:
                            prompt = base_prompt
                    else:
                        # Thread not found, prepare normally
                        logger.warning(f"Thread {continuation_id} not found, preparing prompt normally")
                        prompt = await self.prepare_prompt(request)
            else:
                # New conversation, prepare prompt normally
                prompt = await self.prepare_prompt(request)

                # Add follow-up instructions for new conversations
                from server import get_follow_up_instructions

                follow_up_instructions = get_follow_up_instructions(0)
                prompt = f"{prompt}\n\n{follow_up_instructions}"
                logger.debug(f"Added follow-up instructions for new {self.get_name()} conversation")

            # Validate images if any were provided
            if images:
                image_validation_error = self._validate_image_limits(
                    images, model_context=self._model_context, continuation_id=continuation_id
                )
                if image_validation_error:
                    error_output = ToolOutput(
                        status=image_validation_error.get("status", "error"),
                        content=image_validation_error.get("content"),
                        content_type=image_validation_error.get("content_type", "text"),
                        metadata=image_validation_error.get("metadata"),
                    )
                    payload = error_output.model_dump_json()
                    logger.error("Image validation failed for %s: %s", self.get_name(), payload)
                    raise ToolExecutionError(payload)

            # Get and validate temperature against model constraints
            temperature, temp_warnings = self.get_validated_temperature(request, self._model_context)

            # Log any temperature corrections
            for warning in temp_warnings:
                logger.warning(warning)

            # Get thinking mode with defaults
            thinking_mode = self.get_request_thinking_mode(request)
            if thinking_mode is None:
                thinking_mode = self.get_default_thinking_mode()

            # Get the provider from model context (clean OOP - no re-fetching)
            provider = self._model_context.provider
            capabilities = self._model_context.capabilities

            # Get system prompt for this tool
            base_system_prompt = self.get_system_prompt()
            capability_augmented_prompt = self._augment_system_prompt_with_capabilities(
                base_system_prompt, capabilities
            )
            language_instruction = self.get_language_instruction()
            system_prompt = language_instruction + capability_augmented_prompt

            # Generate AI response using the provider
            logger.info(f"Sending request to {provider.get_provider_type().value} API for {self.get_name()}")
            logger.info(
                f"Using model: {self._model_context.model_name} via {provider.get_provider_type().value} provider"
            )

            # Estimate tokens for logging
            from utils.token_utils import estimate_tokens

            estimated_tokens = estimate_tokens(prompt)
            logger.debug(f"Prompt length: {len(prompt)} characters (~{estimated_tokens:,} tokens)")

            # Resolve model capabilities for feature gating
            supports_thinking = capabilities.supports_extended_thinking

            # Generate content with provider abstraction
            model_response = provider.generate_content(
                prompt=prompt,
                model_name=self._current_model_name,
                system_prompt=system_prompt,
                temperature=temperature,
                thinking_mode=thinking_mode if supports_thinking else None,
                images=images if images else None,
            )

            logger.info(f"Received response from {provider.get_provider_type().value} API for {self.get_name()}")

            # Process the model's response
            if model_response.content:
                raw_text = model_response.content

                # Create model info for conversation tracking
                model_info = {
                    "provider": provider,
                    "model_name": self._current_model_name,
                    "model_response": model_response,
                }

                # Parse response using the same logic as old base.py
                tool_output = self._parse_response(raw_text, request, model_info)
                logger.info(f"✅ {self.get_name()} tool completed successfully")

            else:
                # Handle cases where the model couldn't generate a response
                metadata = model_response.metadata or {}
                finish_reason = metadata.get("finish_reason", "Unknown")

                if metadata.get("is_blocked_by_safety"):
                    # Specific handling for content safety blocks
                    safety_details = metadata.get("safety_feedback") or "details not provided"
                    logger.warning(
                        f"Response blocked by content safety policy for {self.get_name()}. "
                        f"Reason: {finish_reason}, Details: {safety_details}"
                    )
                    tool_output = ToolOutput(
                        status="error",
                        content="Your request was blocked by the content safety policy. "
                        "Please try modifying your prompt.",
                        content_type="text",
                    )
                else:
                    # Handle other empty responses - could be legitimate completion or unclear blocking
                    if finish_reason == "STOP":
                        # Model completed normally but returned empty content - retry with clarification
                        logger.info(
                            f"Model completed with empty response for {self.get_name()}, retrying with clarification"
                        )

                        # Retry the same request with modified prompt asking for explicit response
                        original_prompt = prompt
                        retry_prompt = f"{original_prompt}\n\nIMPORTANT: Please provide a substantive response. If you cannot respond to the above request, please explain why and suggest alternatives."

                        try:
                            retry_response = provider.generate_content(
                                prompt=retry_prompt,
                                model_name=self._current_model_name,
                                system_prompt=system_prompt,
                                temperature=temperature,
                                thinking_mode=thinking_mode if supports_thinking else None,
                                images=images if images else None,
                            )

                            if retry_response.content:
                                # Successful retry - use the retry response
                                logger.info(f"Retry successful for {self.get_name()}")
                                raw_text = retry_response.content

                                # Update model info for the successful retry
                                model_info = {
                                    "provider": provider,
                                    "model_name": self._current_model_name,
                                    "model_response": retry_response,
                                }

                                # Parse the retry response
                                tool_output = self._parse_response(raw_text, request, model_info)
                                logger.info(f"✅ {self.get_name()} tool completed successfully after retry")
                            else:
                                # Retry also failed - inspect metadata to find out why
                                retry_metadata = retry_response.metadata or {}
                                if retry_metadata.get("is_blocked_by_safety"):
                                    # The retry was blocked by safety filters
                                    safety_details = retry_metadata.get("safety_feedback") or "details not provided"
                                    logger.warning(
                                        f"Retry for {self.get_name()} was blocked by content safety policy. "
                                        f"Details: {safety_details}"
                                    )
                                    tool_output = ToolOutput(
                                        status="error",
                                        content="Your request was also blocked by the content safety policy after a retry. "
                                        "Please try rephrasing your prompt significantly.",
                                        content_type="text",
                                    )
                                else:
                                    # Retry failed for other reasons (e.g., another STOP)
                                    tool_output = ToolOutput(
                                        status="error",
                                        content="The model repeatedly returned empty responses. This may indicate content filtering or a model issue.",
                                        content_type="text",
                                    )
                        except Exception as retry_error:
                            logger.warning(f"Retry failed for {self.get_name()}: {retry_error}")
                            tool_output = ToolOutput(
                                status="error",
                                content=f"Model returned empty response and retry failed: {str(retry_error)}",
                                content_type="text",
                            )
                    else:
                        # Non-STOP finish reasons are likely actual errors
                        logger.warning(
                            f"Response blocked or incomplete for {self.get_name()}. Finish reason: {finish_reason}"
                        )
                        tool_output = ToolOutput(
                            status="error",
                            content=f"Response blocked or incomplete. Finish reason: {finish_reason}",
                            content_type="text",
                        )

            # Return the tool output as TextContent, marking protocol errors appropriately
            payload = tool_output.model_dump_json()
            if tool_output.status == "error":
                logger.error("%s reported error status - raising ToolExecutionError", self.get_name())
                raise ToolExecutionError(payload)
            return [TextContent(type="text", text=payload)]

        except ToolExecutionError:
            raise
        except Exception as e:
            # Special handling for MCP size check errors
            if str(e).startswith("MCP_SIZE_CHECK:"):
                # Extract the JSON content after the prefix
                json_content = str(e)[len("MCP_SIZE_CHECK:") :]
                raise ToolExecutionError(json_content)

            logger.error(f"Error in {self.get_name()}: {str(e)}")
            error_output = ToolOutput(
                status="error",
                content=f"Error in {self.get_name()}: {str(e)}",
                content_type="text",
            )
            raise ToolExecutionError(error_output.model_dump_json()) from e

    def _parse_response(self, raw_text: str, request, model_info: Optional[dict] = None):
        """
        Parse the raw response and format it using the hook method.

        This simplified version focuses on the SimpleTool pattern: format the response
        using the format_response hook, then handle conversation continuation.
        """
        from tools.models import ToolOutput

        # Format the response using the hook method
        formatted_response = self.format_response(raw_text, request, model_info)

        # Handle conversation continuation like old base.py
        continuation_id = self.get_request_continuation_id(request)
        if continuation_id:
            self._record_assistant_turn(continuation_id, raw_text, request, model_info)

        # Create continuation offer like old base.py
        continuation_data = self._create_continuation_offer(request, model_info)
        if continuation_data:
            return self._create_continuation_offer_response(formatted_response, continuation_data, request, model_info)
        else:
            # Build metadata with model and provider info for success response
            metadata = {}
            if model_info:
                model_name = model_info.get("model_name")
                if model_name:
                    metadata["model_used"] = model_name
                provider = model_info.get("provider")
                if provider:
                    # Handle both provider objects and string values
                    if isinstance(provider, str):
                        metadata["provider_used"] = provider
                    else:
                        try:
                            metadata["provider_used"] = provider.get_provider_type().value
                        except AttributeError:
                            # Fallback if provider doesn't have get_provider_type method
                            metadata["provider_used"] = str(provider)

            return ToolOutput(
                status="success",
                content=formatted_response,
                content_type="text",
                metadata=metadata if metadata else None,
            )

    def _create_continuation_offer(self, request, model_info: Optional[dict] = None):
        """Create continuation offer following old base.py pattern"""
        continuation_id = self.get_request_continuation_id(request)

        try:
            from utils.conversation_memory import create_thread, get_thread

            if continuation_id:
                # Existing conversation
                thread_context = get_thread(continuation_id)
                if thread_context and thread_context.turns:
                    turn_count = len(thread_context.turns)
                    from utils.conversation_memory import MAX_CONVERSATION_TURNS

                    if turn_count >= MAX_CONVERSATION_TURNS - 1:
                        return None  # No more turns allowed

                    remaining_turns = MAX_CONVERSATION_TURNS - turn_count - 1
                    return {
                        "continuation_id": continuation_id,
                        "remaining_turns": remaining_turns,
                        "note": f"You can continue this conversation for {remaining_turns} more exchanges.",
                    }
            else:
                # New conversation - create thread and offer continuation
                # Convert request to dict for initial_context
                initial_request_dict = self.get_request_as_dict(request)

                new_thread_id = create_thread(tool_name=self.get_name(), initial_request=initial_request_dict)

                # Add the initial user turn to the new thread
                from utils.conversation_memory import MAX_CONVERSATION_TURNS, add_turn

                user_prompt = self.get_request_prompt(request)
                user_files = self.get_request_files(request)
                user_images = self.get_request_images(request)

                # Add user's initial turn
                add_turn(
                    new_thread_id, "user", user_prompt, files=user_files, images=user_images, tool_name=self.get_name()
                )

                return {
                    "continuation_id": new_thread_id,
                    "remaining_turns": MAX_CONVERSATION_TURNS - 1,
                    "note": f"You can continue this conversation for {MAX_CONVERSATION_TURNS - 1} more exchanges.",
                }
        except Exception:
            return None

    def _create_continuation_offer_response(
        self, content: str, continuation_data: dict, request, model_info: Optional[dict] = None
    ):
        """Create response with continuation offer following old base.py pattern"""
        from tools.models import ContinuationOffer, ToolOutput

        try:
            if not self.get_request_continuation_id(request):
                self._record_assistant_turn(
                    continuation_data["continuation_id"],
                    content,
                    request,
                    model_info,
                )

            continuation_offer = ContinuationOffer(
                continuation_id=continuation_data["continuation_id"],
                note=continuation_data["note"],
                remaining_turns=continuation_data["remaining_turns"],
            )

            # Build metadata with model and provider info
            metadata = {"tool_name": self.get_name(), "conversation_ready": True}
            if model_info:
                model_name = model_info.get("model_name")
                if model_name:
                    metadata["model_used"] = model_name
                provider = model_info.get("provider")
                if provider:
                    # Handle both provider objects and string values
                    if isinstance(provider, str):
                        metadata["provider_used"] = provider
                    else:
                        try:
                            metadata["provider_used"] = provider.get_provider_type().value
                        except AttributeError:
                            # Fallback if provider doesn't have get_provider_type method
                            metadata["provider_used"] = str(provider)

            return ToolOutput(
                status="continuation_available",
                content=content,
                content_type="text",
                continuation_offer=continuation_offer,
                metadata=metadata,
            )
        except Exception:
            # Fallback to simple success if continuation offer fails
            return ToolOutput(status="success", content=content, content_type="text")

    def _record_assistant_turn(
        self, continuation_id: str, response_text: str, request, model_info: Optional[dict]
    ) -> None:
        """Persist an assistant response in conversation memory."""

        if not continuation_id:
            return

        from utils.conversation_memory import add_turn

        model_provider = None
        model_name = None
        model_metadata = None

        if model_info:
            provider = model_info.get("provider")
            if provider:
                if isinstance(provider, str):
                    model_provider = provider
                else:
                    try:
                        model_provider = provider.get_provider_type().value
                    except AttributeError:
                        model_provider = str(provider)
            model_name = model_info.get("model_name")
            model_response = model_info.get("model_response")
            if model_response:
                model_metadata = {"usage": model_response.usage, "metadata": model_response.metadata}

        add_turn(
            continuation_id,
            "assistant",
            response_text,
            files=self.get_request_files(request),
            images=self.get_request_images(request),
            tool_name=self.get_name(),
            model_provider=model_provider,
            model_name=model_name,
            model_metadata=model_metadata,
        )

    # Convenience methods for common tool patterns

    def build_standard_prompt(
        self, system_prompt: str, user_content: str, request, file_context_title: str = "CONTEXT FILES"
    ) -> str:
        """
        Build a standard prompt with system prompt, user content, and optional files.

        This is a convenience method that handles the common pattern of:
        1. Adding file content if present
        2. Checking token limits
        3. Adding web search instructions
        4. Combining everything into a well-formatted prompt

        Args:
            system_prompt: The system prompt for the tool
            user_content: The main user request/content
            request: The validated request object
            file_context_title: Title for the file context section

        Returns:
            Complete formatted prompt ready for the AI model
        """
        # Check size limits against raw user input before enriching with internal context
        content_to_validate = self.get_prompt_content_for_size_validation(user_content)
        self._validate_token_limit(content_to_validate, "Content")

        # Add context files if provided (does not affect MCP boundary enforcement)
        files = self.get_request_files(request)
        if files:
            file_content, processed_files = self._prepare_file_content_for_prompt(
                files,
                self.get_request_continuation_id(request),
                "Context files",
                model_context=getattr(self, "_model_context", None),
            )
            self._actually_processed_files = processed_files
            if file_content:
                user_content = f"{user_content}\n\n=== {file_context_title} ===\n{file_content}\n=== END CONTEXT ==="

        # Add standardized web search guidance
        websearch_instruction = self.get_websearch_instruction(self.get_websearch_guidance())

        # Combine system prompt with user content
        full_prompt = f"""{system_prompt}{websearch_instruction}

=== USER REQUEST ===
{user_content}
=== END REQUEST ===

Please provide a thoughtful, comprehensive response:"""

        return full_prompt

    def get_prompt_content_for_size_validation(self, user_content: str) -> str:
        """
        Override to use original user prompt for size validation when conversation history is embedded.

        When server.py embeds conversation history into the prompt field, it also stores
        the original user prompt in _original_user_prompt. We use that for size validation
        to avoid incorrectly triggering size limits due to conversation history.

        Args:
            user_content: The user content (may include conversation history)

        Returns:
            The original user prompt if available, otherwise the full user content
        """
        # Check if we have the current arguments from execute() method
        current_args = getattr(self, "_current_arguments", None)
        if current_args:
            # If server.py embedded conversation history, it stores original prompt separately
            original_user_prompt = current_args.get("_original_user_prompt")
            if original_user_prompt is not None:
                # Use original user prompt for size validation (excludes conversation history)
                return original_user_prompt

        # Fallback to default behavior (validate full user content)
        return user_content

    def get_websearch_guidance(self) -> Optional[str]:
        """
        Return tool-specific web search guidance.

        Override this to provide tool-specific guidance for when web searches
        would be helpful. Return None to use the default guidance.

        Returns:
            Tool-specific web search guidance or None for default
        """
        return None

    def handle_prompt_file_with_fallback(self, request) -> str:
        """
        Handle prompt.txt files with fallback to request field.

        This is a convenience method for tools that accept prompts either
        as a field or as a prompt.txt file. It handles the extraction
        and validation automatically.

        Args:
            request: The validated request object

        Returns:
            The effective prompt content

        Raises:
            ValueError: If prompt is too large for MCP transport
        """
        # Check for prompt.txt in provided absolute file paths
        files = self.get_request_files(request)
        if files:
            prompt_content, updated_files = self.handle_prompt_file(files)

            # Update request files list if needed
            if updated_files is not None:
                self.set_request_files(request, updated_files)
        else:
            prompt_content = None

        # Use prompt.txt content if available, otherwise use the prompt field
        user_content = prompt_content if prompt_content else self.get_request_prompt(request)

        # Check user input size at MCP transport boundary (excluding conversation history)
        validation_content = self.get_prompt_content_for_size_validation(user_content)
        size_check = self.check_prompt_size(validation_content)
        if size_check:
            from tools.models import ToolOutput

            raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")

        return user_content

    def get_chat_style_websearch_guidance(self) -> str:
        """
        Get Chat tool-style web search guidance.

        Returns web search guidance that matches the original Chat tool pattern.
        This is useful for tools that want to maintain the same search behavior.

        Returns:
            Web search guidance text
        """
        return """When discussing topics, consider if searches for these would help:
- Documentation for any technologies or concepts mentioned
- Current best practices and patterns
- Recent developments or updates
- Community discussions and solutions"""

    def supports_custom_request_model(self) -> bool:
        """
        Indicate whether this tool uses a custom request model.

        Simple tools use the base ToolRequest by default. This method returns True
        automatically when get_request_model() is overridden to return a different
        request model class, so subclasses rarely need to override it.

        Returns:
            True if the tool uses a custom request model
        """
        return self.get_request_model() != ToolRequest

    def _validate_file_paths(self, request) -> Optional[str]:
        """
        Validate that all file paths in the request are absolute paths.

        This is a security measure to prevent path traversal attacks and ensure
        proper access control. All file paths must be absolute (starting with '/').

        Args:
            request: The validated request object

        Returns:
            Optional[str]: Error message if validation fails, None if all paths are valid
        """
        import os

        # Check if request has absolute file paths attribute (legacy tools may still provide 'files')
        files = self.get_request_files(request)
        if files:
            for file_path in files:
                if not os.path.isabs(file_path):
                    return (
                        f"Error: All file paths must be FULL absolute paths to real files / folders - DO NOT SHORTEN. "
                        f"Received relative path: {file_path}\n"
                        f"Please provide the full absolute path starting with '/' (must be FULL absolute paths to real files / folders - DO NOT SHORTEN)"
                    )

        return None

    def prepare_chat_style_prompt(self, request, system_prompt: str = None) -> str:
        """
        Prepare a prompt using Chat tool-style patterns.

        This convenience method replicates the Chat tool's prompt preparation logic:
        1. Handle prompt.txt file if present
        2. Add file context with specific formatting
        3. Add web search guidance
        4. Format with system prompt

        Args:
            request: The validated request object
            system_prompt: System prompt to use (uses get_system_prompt() if None)

        Returns:
            Complete formatted prompt
        """
        # Use provided system prompt or get from tool
        if system_prompt is None:
            system_prompt = self.get_system_prompt()

        # Get user content (handles prompt.txt files)
        user_content = self.handle_prompt_file_with_fallback(request)

        # Build standard prompt with Chat-style web search guidance
        websearch_guidance = self.get_chat_style_websearch_guidance()

        # Override the websearch guidance temporarily
        original_guidance = self.get_websearch_guidance
        self.get_websearch_guidance = lambda: websearch_guidance

        try:
            full_prompt = self.build_standard_prompt(system_prompt, user_content, request, "CONTEXT FILES")
        finally:
            # Restore original guidance method
            self.get_websearch_guidance = original_guidance

        if system_prompt:
            marker = "\n\n=== USER REQUEST ===\n"
            if marker in full_prompt:
                _, user_section = full_prompt.split(marker, 1)
                return f"=== USER REQUEST ===\n{user_section}"

        return full_prompt

```
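
For orientation, here is a minimal sketch of what a concrete tool built on `SimpleTool` might look like, following the pattern described in the class docstring above. The `EchoTool` name is hypothetical, and the sketch assumes the `BaseTool` hooks referenced in this file (`get_system_prompt`, `get_request_prompt`, `build_standard_prompt`) behave as their call sites suggest; a real subclass may also need to implement additional abstract methods defined on `BaseTool` that are not shown on this page.

```python
from typing import Any, Optional

from tools.simple.base import SimpleTool


class EchoTool(SimpleTool):
    """Hypothetical tool that forwards the user's prompt to the model."""

    def get_name(self) -> str:
        return "echo"

    def get_tool_fields(self) -> dict[str, dict[str, Any]]:
        # Tool-specific fields only; common fields (model, temperature, etc.)
        # are added automatically by SchemaBuilder via get_input_schema().
        return {
            "prompt": {
                "type": "string",
                "description": "Text to send to the model",
            },
            "absolute_file_paths": SimpleTool.FILES_FIELD,
        }

    def get_required_fields(self) -> list[str]:
        return ["prompt"]

    async def prepare_prompt(self, request) -> str:
        # build_standard_prompt() wraps the raw user content with file context
        # and web-search guidance (see the convenience methods above).
        return self.build_standard_prompt(
            system_prompt=self.get_system_prompt(),
            user_content=self.get_request_prompt(request),
            request=request,
        )

    def format_response(self, response: str, request, model_info: Optional[dict] = None) -> str:
        # Optional hook: decorate the raw model output before it is returned.
        return response
```

This mirrors the ChatTool example in the class docstring: `get_tool_fields()` and `get_required_fields()` feed `get_input_schema()`, while `prepare_prompt()` and the optional `format_response()` hook cover the prompt-preparation and response-formatting steps of the request/response flow.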

--------------------------------------------------------------------------------
/simulator_tests/test_codereview_validation.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
CodeReview Tool Validation Test

Tests the codereview tool's capabilities using the new workflow architecture.
This validates that the workflow-based code review provides step-by-step
analysis with proper investigation guidance and expert analysis integration.
"""

import json
from typing import Optional

from .conversation_base_test import ConversationBaseTest


class CodeReviewValidationTest(ConversationBaseTest):
    """Test codereview tool with new workflow architecture"""

    @property
    def test_name(self) -> str:
        return "codereview_validation"

    @property
    def test_description(self) -> str:
        return "CodeReview tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Test codereview tool capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: CodeReviewWorkflow tool validation (new architecture)")

            # Create test code with various issues for review
            self._create_test_code_for_review()

            # Test 1: Single review session with multiple steps
            if not self._test_single_review_session():
                return False

            # Test 2: Review flow that requires refocusing
            if not self._test_review_refocus_flow():
                return False

            # Test 3: Complete review with expert analysis
            if not self._test_complete_review_with_analysis():
                return False

            # Test 4: Certain confidence behavior
            if not self._test_certain_confidence():
                return False

            # Test 5: Context-aware file embedding
            if not self._test_context_aware_file_embedding():
                return False

            # Test 6: Multi-step file context optimization
            if not self._test_multi_step_file_context():
                return False

            self.logger.info("  ✅ All codereview validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"CodeReviewWorkflow validation test failed: {e}")
            return False

    def _create_test_code_for_review(self):
        """Create test files with various code quality issues for review"""
        # Create a payment processing module with multiple issues
        payment_code = """#!/usr/bin/env python3
import hashlib
import requests
import json
from datetime import datetime

class PaymentProcessor:
    def __init__(self, api_key):
        self.api_key = api_key  # Security issue: API key stored in plain text
        self.base_url = "https://payment-gateway.example.com"
        self.session = requests.Session()
        self.failed_payments = []  # Performance issue: unbounded list

    def process_payment(self, amount, card_number, cvv, user_id):
        \"\"\"Process a payment transaction\"\"\"
        # Security issue: No input validation
        # Performance issue: Inefficient nested loops
        for attempt in range(3):
            for retry in range(5):
                try:
                    # Security issue: Logging sensitive data
                    print(f"Processing payment: {card_number}, CVV: {cvv}")

                    # Over-engineering: Complex hashing that's not needed
                    payment_hash = self._generate_complex_hash(amount, card_number, cvv, user_id, datetime.now())

                    # Security issue: Insecure HTTP request construction
                    url = f"{self.base_url}/charge?amount={amount}&card={card_number}&api_key={self.api_key}"

                    response = self.session.get(url)  # Security issue: using GET for sensitive data

                    if response.status_code == 200:
                        return {"status": "success", "hash": payment_hash}
                    else:
                        # Code smell: Generic exception handling without specific error types
                        self.failed_payments.append({"amount": amount, "timestamp": datetime.now()})

                except Exception as e:
                    # Code smell: Bare except clause and poor error handling
                    print(f"Payment failed: {e}")
                    continue

        return {"status": "failed"}

    def _generate_complex_hash(self, amount, card_number, cvv, user_id, timestamp):
        \"\"\"Over-engineered hash generation with unnecessary complexity\"\"\"
        # Over-engineering: Overly complex for no clear benefit
        combined = f"{amount}-{card_number}-{cvv}-{user_id}-{timestamp}"

        # Security issue: Weak hashing algorithm
        hash1 = hashlib.md5(combined.encode()).hexdigest()
        hash2 = hashlib.sha1(hash1.encode()).hexdigest()
        hash3 = hashlib.md5(hash2.encode()).hexdigest()

        # Performance issue: Unnecessary string operations in loop
        result = ""
        for i in range(len(hash3)):
            for j in range(3):  # Arbitrary nested loop
                result += hash3[i] if i % 2 == 0 else hash3[i].upper()

        return result[:32]  # Arbitrary truncation

    def get_payment_history(self, user_id):
        \"\"\"Get payment history - has scalability issues\"\"\"
        # Performance issue: No pagination, could return massive datasets
        # Performance issue: Inefficient algorithm O(n²)
        all_payments = self._fetch_all_payments()  # Could be millions of records
        user_payments = []

        for payment in all_payments:
            for field in payment:  # Unnecessary nested iteration
                if field == "user_id" and payment[field] == user_id:
                    user_payments.append(payment)
                    break

        return user_payments

    def _fetch_all_payments(self):
        \"\"\"Simulated method that would fetch all payments\"\"\"
        # Maintainability issue: Hard-coded test data
        return [
            {"user_id": 1, "amount": 100, "status": "success"},
            {"user_id": 2, "amount": 200, "status": "failed"},
            {"user_id": 1, "amount": 150, "status": "success"},
        ]
"""

        # Create test file with multiple issues
        self.payment_file = self.create_additional_test_file("payment_processor.py", payment_code)
        self.logger.info(f"  ✅ Created test file with code issues: {self.payment_file}")

        # Create configuration file with additional issues
        config_code = """#!/usr/bin/env python3
import os

# Security issue: Hardcoded secrets
DATABASE_PASSWORD = "admin123"
SECRET_KEY = "my-secret-key-12345"

# Over-engineering: Unnecessarily complex configuration class
class ConfigurationManager:
    def __init__(self):
        self.config_cache = {}
        self.config_hierarchy = {}
        self.config_validators = {}
        self.config_transformers = {}
        self.config_listeners = []

    def get_config(self, key, default=None):
        # Over-engineering: Complex caching for simple config lookup
        if key in self.config_cache:
            cached_value = self.config_cache[key]
            if self._validate_cached_value(cached_value):
                return self._transform_value(key, cached_value)

        # Code smell: Complex nested conditionals
        if key in self.config_hierarchy:
            hierarchy = self.config_hierarchy[key]
            for level in hierarchy:
                if level == "env":
                    value = os.getenv(key.upper(), default)
                elif level == "file":
                    value = self._read_from_file(key, default)
                elif level == "database":
                    value = self._read_from_database(key, default)
                else:
                    value = default

                if value is not None:
                    self.config_cache[key] = value
                    return self._transform_value(key, value)

        return default

    def _validate_cached_value(self, value):
        # Maintainability issue: Unclear validation logic
        if isinstance(value, str) and len(value) > 1000:
            return False
        return True

    def _transform_value(self, key, value):
        # Code smell: Unnecessary abstraction
        if key in self.config_transformers:
            transformer = self.config_transformers[key]
            return transformer(value)
        return value

    def _read_from_file(self, key, default):
        # Maintainability issue: No error handling for file operations
        with open(f"/etc/app/{key}.conf") as f:
            return f.read().strip()

    def _read_from_database(self, key, default):
        # Performance issue: Database query for every config read
        # No connection pooling or caching
        import sqlite3
        conn = sqlite3.connect("config.db")
        cursor = conn.cursor()
        cursor.execute("SELECT value FROM config WHERE key = ?", (key,))
        result = cursor.fetchone()
        conn.close()
        return result[0] if result else default
"""

        self.config_file = self.create_additional_test_file("config.py", config_code)
        self.logger.info(f"  ✅ Created configuration file with issues: {self.config_file}")

    def _test_single_review_session(self) -> bool:
        """Test a complete code review session with multiple steps"""
        try:
            self.logger.info("  1.1: Testing single code review session")

            # Step 1: Start review
            self.logger.info("    1.1.1: Step 1 - Initial review")
            response1, continuation_id = self.call_mcp_tool(
                "codereview",
                {
                    "step": "I need to perform a comprehensive code review of the payment processing module. Let me start by examining the code structure and identifying potential issues.",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Initial examination reveals a payment processing class with potential security and performance concerns.",
                    "files_checked": [self.payment_file],
                    "relevant_files": [self.payment_file],
                    "absolute_file_paths": [self.payment_file],  # Required for step 1
                    "review_type": "full",
                    "severity_filter": "all",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to get initial review response")
                return False

            # Parse and validate JSON response
            response1_data = self._parse_review_response(response1)
            if not response1_data:
                return False

            # Validate step 1 response structure - expect pause_for_code_review for next_step_required=True
            if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_code_review"):
                return False

            self.logger.info(f"    ✅ Step 1 successful, continuation_id: {continuation_id}")

            # Step 2: Detailed analysis
            self.logger.info("    1.1.2: Step 2 - Detailed security analysis")
            response2, _ = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Now performing detailed security analysis of the payment processor code to identify vulnerabilities and code quality issues.",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Found multiple security issues: API key stored in plain text, sensitive data logging, insecure HTTP methods, and weak hashing algorithms.",
                    "files_checked": [self.payment_file],
                    "relevant_files": [self.payment_file],
                    "relevant_context": ["PaymentProcessor.__init__", "PaymentProcessor.process_payment"],
                    "issues_found": [
                        {"severity": "critical", "description": "API key stored in plain text in memory"},
                        {"severity": "critical", "description": "Credit card and CVV logged in plain text"},
                        {"severity": "high", "description": "Using GET method for sensitive payment data"},
                        {"severity": "medium", "description": "Weak MD5 hashing algorithm used"},
                    ],
                    "confidence": "high",
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue review to step 2")
                return False

            response2_data = self._parse_review_response(response2)
            if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_code_review"):
                return False

            # Check review status tracking
            review_status = response2_data.get("code_review_status", {})
            if review_status.get("files_checked", 0) < 1:
                self.logger.error("Files checked count not properly tracked")
                return False

            if review_status.get("relevant_context", 0) != 2:
                self.logger.error("Relevant context not properly tracked")
                return False

            # Check issues by severity
            issues_by_severity = review_status.get("issues_by_severity", {})
            if issues_by_severity.get("critical", 0) != 2:
                self.logger.error("Critical issues not properly tracked")
                return False

            if issues_by_severity.get("high", 0) != 1:
                self.logger.error("High severity issues not properly tracked")
                return False

            self.logger.info("    ✅ Step 2 successful with proper issue tracking")

            # Store continuation_id for next test
            self.review_continuation_id = continuation_id
            return True

        except Exception as e:
            self.logger.error(f"Single review session test failed: {e}")
            return False

    def _test_review_refocus_flow(self) -> bool:
        """Test code review flow that revises findings by refocusing"""
        try:
            self.logger.info("  1.2: Testing code review refocus workflow")

            # Start a new review for testing refocus behaviour
            self.logger.info("    1.2.1: Start review for refocus test")
            response1, continuation_id = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Reviewing configuration management code for best practices",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Initial analysis shows complex configuration class",
                    "files_checked": [self.config_file],
                    "relevant_files": [self.config_file],
                    "absolute_file_paths": [self.config_file],
                    "review_type": "full",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start refocus test review")
                return False

            # Step 2: Initial direction
            self.logger.info("    1.2.2: Step 2 - Initial analysis direction")
            response2, _ = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Focusing on configuration architecture patterns",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Architecture seems overly complex, but need to look more carefully at security issues",
                    "files_checked": [self.config_file],
                    "relevant_files": [self.config_file],
                    "issues_found": [
                        {"severity": "medium", "description": "Complex configuration hierarchy"},
                    ],
                    "confidence": "low",
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            # Step 3: Shift focus based on new evidence
            self.logger.info("    1.2.3: Step 3 - Refocus on security issues")
            response3, _ = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Refocusing - need to concentrate on the critical security issues I initially missed. Found hardcoded secrets and credentials in plain text.",
                    "step_number": 3,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Found critical security vulnerabilities: hardcoded DATABASE_PASSWORD and SECRET_KEY in plain text",
                    "files_checked": [self.config_file],
                    "relevant_files": [self.config_file],
                    "relevant_context": ["ConfigurationManager.__init__"],
                    "issues_found": [
                        {"severity": "critical", "description": "Hardcoded database password in source code"},
                        {"severity": "critical", "description": "Hardcoded secret key in source code"},
                        {"severity": "high", "description": "Over-engineered configuration system"},
                    ],
                    "confidence": "high",
                    "continuation_id": continuation_id,
                },
            )

            if not response3:
                self.logger.error("Failed to refocus")
                return False

            response3_data = self._parse_review_response(response3)
            if not self._validate_step_response(response3_data, 3, 4, True, "pause_for_code_review"):
                return False

            self.logger.info("    ✅ Refocus flow working correctly")
            return True

        except Exception as e:
            self.logger.error(f"Refocus test failed: {e}")
            return False

    def _test_complete_review_with_analysis(self) -> bool:
        """Test complete code review ending with expert analysis"""
        try:
            self.logger.info("  1.3: Testing complete review with expert analysis")

            # Use the continuation from first test
            continuation_id = getattr(self, "review_continuation_id", None)
            if not continuation_id:
                # Start fresh if no continuation available
                self.logger.info("    1.3.0: Starting fresh review")
                response0, continuation_id = self.call_mcp_tool(
                    "codereview",
                    {
                        "step": "Reviewing payment processor for security and quality issues",
                        "step_number": 1,
                        "total_steps": 2,
                        "next_step_required": True,
                        "findings": "Found multiple security and performance issues",
                        "files_checked": [self.payment_file],
                        "relevant_files": [self.payment_file],
                        "absolute_file_paths": [self.payment_file],
                        "relevant_context": ["PaymentProcessor.process_payment"],
                    },
                )
                if not response0 or not continuation_id:
                    self.logger.error("Failed to start fresh review")
                    return False

            # Final step - trigger expert analysis
            self.logger.info("    1.3.1: Final step - complete review")
            response_final, _ = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Code review complete. Identified comprehensive security, performance, and maintainability issues throughout the payment processing module.",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step - triggers expert analysis
                    "findings": "Complete analysis reveals critical security vulnerabilities, performance bottlenecks, over-engineering patterns, and maintainability concerns. All issues documented with severity levels.",
                    "files_checked": [self.payment_file],
                    "relevant_files": [self.payment_file],
                    "relevant_context": [
                        "PaymentProcessor.process_payment",
                        "PaymentProcessor._generate_complex_hash",
                        "PaymentProcessor.get_payment_history",
                    ],
                    "issues_found": [
                        {"severity": "critical", "description": "API key stored in plain text"},
                        {"severity": "critical", "description": "Sensitive payment data logged"},
                        {"severity": "high", "description": "SQL injection vulnerability potential"},
                        {"severity": "medium", "description": "Over-engineered hash generation"},
                        {"severity": "low", "description": "Poor error handling patterns"},
                    ],
                    "confidence": "high",
                    "continuation_id": continuation_id,
                    "model": "flash",  # Use flash for expert analysis
                },
            )

            if not response_final:
                self.logger.error("Failed to complete review")
                return False

            response_final_data = self._parse_review_response(response_final)
            if not response_final_data:
                return False

            # Validate final response structure - expect calling_expert_analysis for next_step_required=False
            if response_final_data.get("status") != "calling_expert_analysis":
                self.logger.error(
                    f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
                )
                return False

            if not response_final_data.get("code_review_complete"):
                self.logger.error("Expected code_review_complete=true for final step")
                return False

            # Check for expert analysis
            if "expert_analysis" not in response_final_data:
                self.logger.error("Missing expert_analysis in final response")
                return False

            expert_analysis = response_final_data.get("expert_analysis", {})

            # Check for expected analysis content (checking common patterns)
            analysis_text = json.dumps(expert_analysis, ensure_ascii=False).lower()

            # Look for code review identification
            review_indicators = ["security", "vulnerability", "performance", "critical", "api", "key"]
            found_indicators = sum(1 for indicator in review_indicators if indicator in analysis_text)

            if found_indicators >= 3:
                self.logger.info("    ✅ Expert analysis identified the issues correctly")
            else:
                self.logger.warning(
                    f"    ⚠️ Expert analysis may not have fully identified the issues (found {found_indicators}/6 indicators)"
                )

            # Check complete review summary
            if "complete_code_review" not in response_final_data:
                self.logger.error("Missing complete_code_review in final response")
                return False

            complete_review = response_final_data["complete_code_review"]
            if not complete_review.get("relevant_context"):
                self.logger.error("Missing relevant context in complete review")
                return False

            if "PaymentProcessor.process_payment" not in complete_review["relevant_context"]:
                self.logger.error("Expected method not found in review summary")
                return False

            self.logger.info("    ✅ Complete review with expert analysis successful")
            return True

        except Exception as e:
            self.logger.error(f"Complete review test failed: {e}")
            return False

    def _test_certain_confidence(self) -> bool:
        """Test certain confidence behavior - should skip expert analysis"""
        try:
            self.logger.info("  1.4: Testing certain confidence behavior")

            # Test certain confidence - should skip expert analysis
            self.logger.info("    1.4.1: Certain confidence review")
            response_certain, _ = self.call_mcp_tool(
                "codereview",
                {
                    "step": "I have completed a thorough code review with 100% certainty of all issues identified.",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,  # Final step
                    "findings": "Complete review identified all critical security issues, performance problems, and code quality concerns. All issues are documented with clear severity levels and specific recommendations.",
                    "files_checked": [self.payment_file],
                    "relevant_files": [self.payment_file],
                    "absolute_file_paths": [self.payment_file],
                    "relevant_context": ["PaymentProcessor.process_payment"],
                    "issues_found": [
                        {"severity": "critical", "description": "Hardcoded API key security vulnerability"},
                        {"severity": "high", "description": "Performance bottleneck in payment history"},
                    ],
                    "review_validation_type": "internal",  # This should skip expert analysis
                    "model": "flash",
                },
            )

            if not response_certain:
                self.logger.error("Failed to test certain confidence")
                return False

            response_certain_data = self._parse_review_response(response_certain)
            if not response_certain_data:
                return False

            # Validate certain confidence response - should skip expert analysis
            if response_certain_data.get("status") != "code_review_complete_ready_for_implementation":
                self.logger.error(
                    f"Expected status 'code_review_complete_ready_for_implementation', got '{response_certain_data.get('status')}'"
                )
                return False

            if not response_certain_data.get("skip_expert_analysis"):
                self.logger.error("Expected skip_expert_analysis=true for certain confidence")
                return False

            expert_analysis = response_certain_data.get("expert_analysis", {})
            if expert_analysis.get("status") not in [
                "skipped_due_to_certain_review_confidence",
                "skipped_due_to_internal_analysis_type",
            ]:
                self.logger.error("Expert analysis should be skipped for certain confidence")
                return False

            self.logger.info("    ✅ Certain confidence behavior working correctly")
            return True

        except Exception as e:
            self.logger.error(f"Certain confidence test failed: {e}")
            return False

    def _test_context_aware_file_embedding(self) -> bool:
        """Test context-aware file embedding optimization"""
        try:
            self.logger.info("  1.5: Testing context-aware file embedding")

            # Create multiple test files for context testing
            utils_content = """#!/usr/bin/env python3
def calculate_discount(price, discount_percent):
    \"\"\"Calculate discount amount\"\"\"
    if discount_percent < 0 or discount_percent > 100:
        raise ValueError("Invalid discount percentage")

    return price * (discount_percent / 100)

def format_currency(amount):
    \"\"\"Format amount as currency\"\"\"
    return f"${amount:.2f}"
"""

            validator_content = """#!/usr/bin/env python3
import re

def validate_email(email):
    \"\"\"Validate email format\"\"\"
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'
    return re.match(pattern, email) is not None

def validate_credit_card(card_number):
    \"\"\"Basic credit card validation\"\"\"
    # Remove spaces and dashes
    card_number = re.sub(r'[\\s-]', '', card_number)

    # Check if all digits
    if not card_number.isdigit():
        return False

    # Basic length check
    return len(card_number) in [13, 14, 15, 16]
"""

            # Create test files
            utils_file = self.create_additional_test_file("utils.py", utils_content)
            validator_file = self.create_additional_test_file("validator.py", validator_content)

            # Test 1: New conversation, intermediate step - should only reference files
            self.logger.info("    1.5.1: New conversation intermediate step (should reference only)")
            response1, continuation_id = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Starting comprehensive code review of utility modules",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,  # Intermediate step
                    "findings": "Initial analysis of utility and validation functions",
                    "files_checked": [utils_file, validator_file],
                    "relevant_files": [utils_file],  # This should be referenced, not embedded
                    "absolute_file_paths": [utils_file, validator_file],  # Required for step 1
                    "relevant_context": ["calculate_discount"],
                    "confidence": "low",
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start context-aware file embedding test")
                return False

            response1_data = self._parse_review_response(response1)
            if not response1_data:
                return False

            # Check file context - should be reference_only for intermediate step
            file_context = response1_data.get("file_context", {})
            if file_context.get("type") != "reference_only":
                self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
                return False

            if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
                self.logger.error("Expected context optimization message for reference_only")
                return False

            self.logger.info("    ✅ Intermediate step correctly uses reference_only file context")

            # Test 2: Final step - should embed files for expert analysis
            self.logger.info("    1.5.2: Final step (should embed files)")
            response3, _ = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Code review complete - identified all issues and recommendations",
                    "step_number": 3,
                    "total_steps": 3,
                    "next_step_required": False,  # Final step - should embed files
                    "continuation_id": continuation_id,
                    "findings": "Complete review: utility functions have proper error handling, validation functions are robust",
                    "files_checked": [utils_file, validator_file],
                    "relevant_files": [utils_file, validator_file],  # Should be fully embedded
                    "relevant_context": ["calculate_discount", "validate_email", "validate_credit_card"],
                    "issues_found": [
                        {"severity": "low", "description": "Could add more comprehensive email validation"},
                        {"severity": "medium", "description": "Credit card validation logic could be more robust"},
                    ],
                    "confidence": "medium",
                    "model": "flash",
                },
            )

            if not response3:
                self.logger.error("Failed to complete to final step")
                return False

            response3_data = self._parse_review_response(response3)
            if not response3_data:
                return False

            # Check file context - should be fully_embedded for final step
            file_context3 = response3_data.get("file_context", {})
            if file_context3.get("type") != "fully_embedded":
                self.logger.error(
                    f"Expected fully_embedded file context for final step, got: {file_context3.get('type')}"
                )
                return False

            if "Full file content embedded for expert analysis" not in file_context3.get("context_optimization", ""):
                self.logger.error("Expected expert analysis optimization message for fully_embedded")
                return False

            self.logger.info("    ✅ Final step correctly uses fully_embedded file context")

            # Verify expert analysis was called for final step
            if response3_data.get("status") != "calling_expert_analysis":
                self.logger.error("Final step should trigger expert analysis")
                return False

            if "expert_analysis" not in response3_data:
                self.logger.error("Expert analysis should be present in final step")
                return False

            self.logger.info("    ✅ Context-aware file embedding test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Context-aware file embedding test failed: {e}")
            return False

    def _test_multi_step_file_context(self) -> bool:
        """Test multi-step workflow with proper file context transitions"""
        try:
            self.logger.info("  1.6: Testing multi-step file context optimization")

            # Use existing payment and config files for multi-step test
            files_to_review = [self.payment_file, self.config_file]

            # Step 1: Start review (new conversation)
            self.logger.info("    1.6.1: Step 1 - Start comprehensive review")
            response1, continuation_id = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Starting comprehensive security and quality review of payment system components",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Initial review of payment processor and configuration management modules",
                    "files_checked": files_to_review,
                    "relevant_files": [self.payment_file],
                    "absolute_file_paths": files_to_review,
                    "relevant_context": [],
                    "confidence": "low",
                    "review_type": "security",
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start multi-step file context test")
                return False

            response1_data = self._parse_review_response(response1)

            # Validate step 1 - should use reference_only
            file_context1 = response1_data.get("file_context", {})
            if file_context1.get("type") != "reference_only":
                self.logger.error("Step 1 should use reference_only file context")
                return False

            self.logger.info("    ✅ Step 1: reference_only file context")

            # Step 2: Security analysis
            self.logger.info("    1.6.2: Step 2 - Security analysis")
            response2, _ = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Focusing on critical security vulnerabilities across both modules",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "findings": "Found critical security issues: hardcoded secrets in config, API key exposure in payment processor",
                    "files_checked": files_to_review,
                    "relevant_files": files_to_review,
                    "relevant_context": ["PaymentProcessor.__init__", "ConfigurationManager"],
                    "issues_found": [
                        {"severity": "critical", "description": "Hardcoded database password"},
                        {"severity": "critical", "description": "API key stored in plain text"},
                    ],
                    "confidence": "medium",
                    "model": "flash",
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            response2_data = self._parse_review_response(response2)

            # Validate step 2 - should still use reference_only
            file_context2 = response2_data.get("file_context", {})
            if file_context2.get("type") != "reference_only":
                self.logger.error("Step 2 should use reference_only file context")
                return False

            self.logger.info("    ✅ Step 2: reference_only file context")

            # Step 3: Performance and architecture analysis
            self.logger.info("    1.6.3: Step 3 - Performance and architecture analysis")
            response3, _ = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Analyzing performance bottlenecks and architectural concerns",
                    "step_number": 3,
                    "total_steps": 4,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "findings": "Performance issues: unbounded lists, inefficient algorithms, over-engineered patterns",
                    "files_checked": files_to_review,
                    "relevant_files": files_to_review,
                    "relevant_context": [
                        "PaymentProcessor.get_payment_history",
                        "PaymentProcessor._generate_complex_hash",
                    ],
                    "issues_found": [
                        {"severity": "high", "description": "O(n²) algorithm in payment history"},
                        {"severity": "medium", "description": "Over-engineered hash generation"},
                        {"severity": "medium", "description": "Unbounded failed_payments list"},
                    ],
                    "confidence": "high",
                    "model": "flash",
                },
            )

            if not response3:
                self.logger.error("Failed to continue to step 3")
                return False

            response3_data = self._parse_review_response(response3)

            # Validate step 3 - should still use reference_only
            file_context3 = response3_data.get("file_context", {})
            if file_context3.get("type") != "reference_only":
                self.logger.error("Step 3 should use reference_only file context")
                return False

            self.logger.info("    ✅ Step 3: reference_only file context")

            # Step 4: Final comprehensive analysis
            self.logger.info("    1.6.4: Step 4 - Final comprehensive analysis")
            response4, _ = self.call_mcp_tool(
                "codereview",
                {
                    "step": "Code review complete - comprehensive analysis of all security, performance, and quality issues",
                    "step_number": 4,
                    "total_steps": 4,
                    "next_step_required": False,  # Final step - should embed files
                    "continuation_id": continuation_id,
                    "findings": "Complete review: identified critical security vulnerabilities, performance bottlenecks, over-engineering patterns, and maintainability concerns across payment and configuration modules.",
                    "files_checked": files_to_review,
                    "relevant_files": files_to_review,
                    "relevant_context": ["PaymentProcessor.process_payment", "ConfigurationManager.get_config"],
                    "issues_found": [
                        {"severity": "critical", "description": "Multiple hardcoded secrets"},
                        {"severity": "high", "description": "Performance and security issues in payment processing"},
                        {"severity": "medium", "description": "Over-engineered architecture patterns"},
                    ],
                    "confidence": "high",
                    "model": "flash",
                },
            )

            if not response4:
                self.logger.error("Failed to complete to final step")
                return False

            response4_data = self._parse_review_response(response4)

            # Validate step 4 - should use fully_embedded for expert analysis
            file_context4 = response4_data.get("file_context", {})
            if file_context4.get("type") != "fully_embedded":
                self.logger.error("Step 4 (final) should use fully_embedded file context")
                return False

            if "expert analysis" not in file_context4.get("context_optimization", "").lower():
                self.logger.error("Final step should mention expert analysis in context optimization")
                return False

            # Verify expert analysis was triggered
            if response4_data.get("status") != "calling_expert_analysis":
                self.logger.error("Final step should trigger expert analysis")
                return False

            # Check that expert analysis has content
            expert_analysis = response4_data.get("expert_analysis", {})
            if not expert_analysis:
                self.logger.error("Expert analysis should be present in final step")
                return False

            self.logger.info("    ✅ Step 4: fully_embedded file context with expert analysis")

            # Validate the complete workflow progression
            progression_summary = {
                "step_1": "reference_only (new conversation, intermediate)",
                "step_2": "reference_only (continuation, intermediate)",
                "step_3": "reference_only (continuation, intermediate)",
                "step_4": "fully_embedded (continuation, final)",
            }

            self.logger.info("    📋 File context progression:")
            for step, context_type in progression_summary.items():
                self.logger.info(f"      {step}: {context_type}")

            self.logger.info("    ✅ Multi-step file context optimization test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Multi-step file context test failed: {e}")
            return False

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool in-process - override for codereview-specific response handling"""
        # Use in-process implementation to maintain conversation memory
        response_text, _ = self.call_mcp_tool_direct(tool_name, params)

        if not response_text:
            return None, None

        # Extract continuation_id from codereview response specifically
        continuation_id = self._extract_review_continuation_id(response_text)

        return response_text, continuation_id

    def _extract_review_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from codereview response"""
        try:
            # Parse the response
            response_data = json.loads(response_text)
            return response_data.get("continuation_id")

        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for review continuation_id: {e}")
            return None

    def _parse_review_response(self, response_text: str) -> dict:
        """Parse codereview tool JSON response"""
        try:
            # Parse the response - it should be direct JSON
            return json.loads(response_text)

        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse review response as JSON: {e}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}

    def _validate_step_response(
        self,
        response_data: dict,
        expected_step: int,
        expected_total: int,
        expected_next_required: bool,
        expected_status: str,
    ) -> bool:
        """Validate a codereview step response structure"""
        try:
            # Check status
            if response_data.get("status") != expected_status:
                self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
                return False

            # Check step number
            if response_data.get("step_number") != expected_step:
                self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
                return False

            # Check total steps
            if response_data.get("total_steps") != expected_total:
                self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
                return False

            # Check next_step_required
            if response_data.get("next_step_required") != expected_next_required:
                self.logger.error(
                    f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
                )
                return False

            # Check code_review_status exists
            if "code_review_status" not in response_data:
                self.logger.error("Missing code_review_status in response")
                return False

            # Check next_steps guidance
            if not response_data.get("next_steps"):
                self.logger.error("Missing next_steps guidance in response")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Error validating step response: {e}")
            return False

```
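
The assertions in `_validate_step_response` and the final-step checks above pin down the response shape this test expects from the codereview workflow. The sketch below restates that shape as plain Python data for readability; field names come from the assertions in this file, the values are made up, and real responses include additional fields not asserted on here.

```python
# Illustrative payload shapes, reconstructed from the assertions above.

intermediate_step = {
    "status": "pause_for_code_review",  # expected while next_step_required=True
    "step_number": 2,
    "total_steps": 4,
    "next_step_required": True,
    "continuation_id": "abc123",  # extracted by call_mcp_tool()
    "next_steps": "Continue the investigation before calling codereview again",
    "code_review_status": {
        "files_checked": 1,
        "relevant_context": 2,
        "issues_by_severity": {"critical": 2, "high": 1},
    },
    "file_context": {
        "type": "reference_only",  # intermediate steps only reference files
        "context_optimization": "Files referenced but not embedded ...",
    },
}

final_step = {
    "status": "calling_expert_analysis",  # expected when next_step_required=False
    "code_review_complete": True,
    "expert_analysis": {"...": "..."},  # placeholder for the assisting model's output
    "complete_code_review": {
        "relevant_context": ["PaymentProcessor.process_payment"],
    },
    "file_context": {
        "type": "fully_embedded",
        "context_optimization": "Full file content embedded for expert analysis ...",
    },
}

# With review_validation_type="internal", the test instead expects status
# "code_review_complete_ready_for_implementation", skip_expert_analysis=True,
# and an expert_analysis status of "skipped_due_to_certain_review_confidence"
# or "skipped_due_to_internal_analysis_type".
```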

--------------------------------------------------------------------------------
/simulator_tests/test_analyze_validation.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
Analyze Tool Validation Test

Tests the analyze tool's capabilities using the new workflow architecture.
This validates that the new workflow-based implementation provides step-by-step
analysis with expert validation following the same patterns as debug/codereview tools.
"""

import json
from typing import Optional

from .conversation_base_test import ConversationBaseTest


class AnalyzeValidationTest(ConversationBaseTest):
    """Test analyze tool with new workflow architecture"""

    @property
    def test_name(self) -> str:
        return "analyze_validation"

    @property
    def test_description(self) -> str:
        return "AnalyzeWorkflow tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Test analyze tool capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: AnalyzeWorkflow tool validation (new architecture)")

            # Create test files for analysis
            self._create_analysis_codebase()

            # Test 1: Single analysis session with multiple steps
            if not self._test_single_analysis_session():
                return False

            # Test 2: Analysis flow that requires refocusing
            if not self._test_analysis_refocus_flow():
                return False

            # Test 3: Complete analysis with expert validation
            if not self._test_complete_analysis_with_expert():
                return False

            # Test 4: Certain confidence behavior
            if not self._test_certain_confidence():
                return False

            # Test 5: Context-aware file embedding
            if not self._test_context_aware_file_embedding():
                return False

            # Test 6: Different analysis types
            if not self._test_analysis_types():
                return False

            self.logger.info("  ✅ All analyze validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"AnalyzeWorkflow validation test failed: {e}")
            return False

    def _create_analysis_codebase(self):
        """Create test files representing a realistic codebase for analysis"""
        # Create a Python microservice with various architectural patterns
        main_service = """#!/usr/bin/env python3
import asyncio
import json
from datetime import datetime
from typing import Dict, List, Optional

from fastapi import FastAPI, HTTPException, Depends
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker
import redis
import logging

# Global configurations - could be improved
DATABASE_URL = "postgresql://user:pass@localhost/db"
REDIS_URL = "redis://localhost:6379"

app = FastAPI(title="User Management Service")

# Database setup
engine = create_async_engine(DATABASE_URL, echo=True)
AsyncSessionLocal = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

# Redis connection - potential singleton pattern issue
redis_client = redis.Redis.from_url(REDIS_URL)

class UserService:
    def __init__(self, db: AsyncSession):
        self.db = db
        self.cache = redis_client  # Direct dependency on global

    async def get_user(self, user_id: int) -> Optional[Dict]:
        # Cache key generation - could be centralized
        cache_key = f"user:{user_id}"

        # Check cache first
        cached = self.cache.get(cache_key)
        if cached:
            return json.loads(cached)

        # Database query - no error handling
        result = await self.db.execute(
            "SELECT * FROM users WHERE id = %s", (user_id,)
        )
        user_data = result.fetchone()

        if user_data:
            # Cache for 1 hour - magic number
            self.cache.setex(cache_key, 3600, json.dumps(user_data, ensure_ascii=False))

        return user_data

    async def create_user(self, user_data: Dict) -> Dict:
        # Input validation missing
        # No transaction handling
        # No audit logging

        query = "INSERT INTO users (name, email) VALUES (%s, %s) RETURNING id"
        result = await self.db.execute(query, (user_data['name'], user_data['email']))
        user_id = result.fetchone()[0]

        # Cache invalidation strategy missing

        return {"id": user_id, **user_data}

@app.get("/users/{user_id}")
async def get_user_endpoint(user_id: int, db: AsyncSession = Depends(get_db)):
    service = UserService(db)
    user = await service.get_user(user_id)

    if not user:
        raise HTTPException(status_code=404, detail="User not found")

    return user

@app.post("/users")
async def create_user_endpoint(user_data: dict, db: AsyncSession = Depends(get_db)):
    service = UserService(db)
    return await service.create_user(user_data)

async def get_db():
    async with AsyncSessionLocal() as session:
        yield session
"""

        # Create config module with various architectural concerns
        config_module = """#!/usr/bin/env python3
import os
from dataclasses import dataclass
from typing import Optional

# Configuration approach could be improved
@dataclass
class DatabaseConfig:
    url: str = os.getenv("DATABASE_URL", "postgresql://localhost/app")
    pool_size: int = int(os.getenv("DB_POOL_SIZE", "5"))
    max_overflow: int = int(os.getenv("DB_MAX_OVERFLOW", "10"))
    echo: bool = os.getenv("DB_ECHO", "false").lower() == "true"

@dataclass
class CacheConfig:
    redis_url: str = os.getenv("REDIS_URL", "redis://localhost:6379")
    default_ttl: int = int(os.getenv("CACHE_TTL", "3600"))
    max_connections: int = int(os.getenv("REDIS_MAX_CONN", "20"))

@dataclass
class AppConfig:
    environment: str = os.getenv("ENVIRONMENT", "development")
    debug: bool = os.getenv("DEBUG", "false").lower() == "true"
    log_level: str = os.getenv("LOG_LEVEL", "INFO")

    # Nested config objects
    database: DatabaseConfig = DatabaseConfig()
    cache: CacheConfig = CacheConfig()

    # Security settings scattered
    secret_key: str = os.getenv("SECRET_KEY", "dev-key-not-secure")
    jwt_algorithm: str = "HS256"
    jwt_expiration: int = 86400  # 24 hours

    def __post_init__(self):
        # Validation logic could be centralized
        if self.environment == "production" and self.secret_key == "dev-key-not-secure":
            raise ValueError("Production environment requires secure secret key")

# Global configuration instance - potential issues
config = AppConfig()

# Helper functions that could be methods
def get_database_url() -> str:
    return config.database.url

def get_cache_config() -> dict:
    return {
        "url": config.cache.redis_url,
        "ttl": config.cache.default_ttl,
        "max_connections": config.cache.max_connections
    }

def is_production() -> bool:
    return config.environment == "production"

def should_enable_debug() -> bool:
    return config.debug and not is_production()
"""

        # Create models module with database concerns
        models_module = """#!/usr/bin/env python3
from datetime import datetime
from typing import Optional, List
from sqlalchemy import Column, Integer, String, DateTime, Boolean, ForeignKey, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
import json

Base = declarative_base()

class User(Base):
    __tablename__ = "users"

    id = Column(Integer, primary_key=True)
    email = Column(String(255), unique=True, nullable=False)
    name = Column(String(255), nullable=False)
    is_active = Column(Boolean, default=True)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Relationship could be optimized
    profiles = relationship("UserProfile", back_populates="user", lazy="select")
    audit_logs = relationship("AuditLog", back_populates="user")

    def to_dict(self) -> dict:
        # Serialization logic mixed with model - could be separated
        return {
            "id": self.id,
            "email": self.email,
            "name": self.name,
            "is_active": self.is_active,
            "created_at": self.created_at.isoformat() if self.created_at else None,
            "updated_at": self.updated_at.isoformat() if self.updated_at else None
        }

    def update_from_dict(self, data: dict):
        # Update logic could be more robust
        for key, value in data.items():
            if hasattr(self, key) and key not in ['id', 'created_at']:
                setattr(self, key, value)
        self.updated_at = datetime.utcnow()

class UserProfile(Base):
    __tablename__ = "user_profiles"

    id = Column(Integer, primary_key=True)
    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
    bio = Column(Text)
    avatar_url = Column(String(500))
    preferences = Column(Text)  # JSON stored as text - could use JSON column

    user = relationship("User", back_populates="profiles")

    def get_preferences(self) -> dict:
        # JSON handling could be centralized
        try:
            return json.loads(self.preferences) if self.preferences else {}
        except json.JSONDecodeError:
            return {}

    def set_preferences(self, prefs: dict):
        self.preferences = json.dumps(prefs, ensure_ascii=False)

class AuditLog(Base):
    __tablename__ = "audit_logs"

    id = Column(Integer, primary_key=True)
    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
    action = Column(String(100), nullable=False)
    details = Column(Text)  # JSON stored as text
    ip_address = Column(String(45))  # IPv6 support
    user_agent = Column(Text)
    timestamp = Column(DateTime, default=datetime.utcnow)

    user = relationship("User", back_populates="audit_logs")

    @classmethod
    def log_action(cls, db_session, user_id: int, action: str, details: dict = None,
                   ip_address: str = None, user_agent: str = None):
        # Factory method pattern - could be improved
        log = cls(
            user_id=user_id,
            action=action,
            details=json.dumps(details, ensure_ascii=False) if details else None,
            ip_address=ip_address,
            user_agent=user_agent
        )
        db_session.add(log)
        return log
"""

        # Create utility module with various helper functions
        utils_module = """#!/usr/bin/env python3
import hashlib
import secrets
import re
from datetime import datetime, timedelta
from typing import Optional, Dict, Any
import logging

# Logging setup - could be centralized
logger = logging.getLogger(__name__)

class ValidationError(Exception):
    \"\"\"Custom exception for validation errors\"\"\"
    pass

def validate_email(email: str) -> bool:
    # Email validation - could use more robust library
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'
    return bool(re.match(pattern, email))

def validate_password(password: str) -> tuple[bool, str]:
    # Password validation rules - could be configurable
    if len(password) < 8:
        return False, "Password must be at least 8 characters"

    if not re.search(r'[A-Z]', password):
        return False, "Password must contain uppercase letter"

    if not re.search(r'[a-z]', password):
        return False, "Password must contain lowercase letter"

    if not re.search(r'[0-9]', password):
        return False, "Password must contain number"

    return True, "Valid password"

def hash_password(password: str) -> str:
    # Password hashing - could use more secure algorithm
    salt = secrets.token_hex(32)
    password_hash = hashlib.pbkdf2_hmac('sha256', password.encode(), salt.encode(), 100000)
    return f"{salt}:{password_hash.hex()}"

def verify_password(password: str, hashed: str) -> bool:
    # Password verification
    try:
        salt, hash_hex = hashed.split(':', 1)
        password_hash = hashlib.pbkdf2_hmac('sha256', password.encode(), salt.encode(), 100000)
        return password_hash.hex() == hash_hex
    except ValueError:
        return False
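
# Illustrative round-trip for hash_password/verify_password (assumed usage,
# not exercised by the simulator tests):
#     stored = hash_password("S3curePassw0rd")   # -> "<salt_hex>:<pbkdf2_hex>"
#     assert verify_password("S3curePassw0rd", stored)
#     assert not verify_password("wrong-password", stored)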

def generate_cache_key(*args, prefix: str = "", separator: str = ":") -> str:
    # Cache key generation - could be more sophisticated
    parts = [str(arg) for arg in args if arg is not None]
    if prefix:
        parts.insert(0, prefix)
    return separator.join(parts)

def parse_datetime(date_string: str) -> Optional[datetime]:
    # Date parsing with multiple format support
    formats = [
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%d"
    ]

    for fmt in formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            continue

    logger.warning(f"Unable to parse datetime: {date_string}")
    return None

def calculate_expiry(hours: int = 24) -> datetime:
    # Expiry calculation - could be more flexible
    return datetime.utcnow() + timedelta(hours=hours)

def sanitize_input(data: Dict[str, Any]) -> Dict[str, Any]:
    # Input sanitization - basic implementation
    sanitized = {}

    for key, value in data.items():
        if isinstance(value, str):
            # Basic HTML/script tag removal
            value = re.sub(r'<[^>]*>', '', value)
            value = value.strip()

        # Type validation could be more comprehensive
        if value is not None and value != "":
            sanitized[key] = value

    return sanitized

def format_response(data: Any, status: str = "success", message: str = None) -> Dict[str, Any]:
    # Response formatting - could be more standardized
    response = {
        "status": status,
        "data": data,
        "timestamp": datetime.utcnow().isoformat()
    }

    if message:
        response["message"] = message

    return response

class PerformanceTimer:
    # Performance measurement utility
    def __init__(self, name: str):
        self.name = name
        self.start_time = None

    def __enter__(self):
        self.start_time = datetime.now()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.start_time:
            duration = datetime.now() - self.start_time
            logger.info(f"Performance: {self.name} took {duration.total_seconds():.3f}s")
"""

        # Create test files
        self.main_service_file = self.create_additional_test_file("main_service.py", main_service)
        self.config_file = self.create_additional_test_file("config.py", config_module)
        self.models_file = self.create_additional_test_file("models.py", models_module)
        self.utils_file = self.create_additional_test_file("utils.py", utils_module)

        self.logger.info("  ✅ Created test codebase with 4 files for analysis")

    def _test_single_analysis_session(self) -> bool:
        """Test a complete analysis session with multiple steps"""
        try:
            self.logger.info("  1.1: Testing single analysis session")

            # Step 1: Start analysis
            self.logger.info("    1.1.1: Step 1 - Initial analysis")
            response1, continuation_id = self.call_mcp_tool(
                "analyze",
                {
                    "step": "I need to analyze this Python microservice codebase for architectural patterns, design decisions, and improvement opportunities. Let me start by examining the overall structure and understanding the technology stack.",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Starting analysis of FastAPI microservice with PostgreSQL, Redis, and SQLAlchemy. Initial examination shows user management functionality with caching layer.",
                    "files_checked": [self.main_service_file],
                    "relevant_files": [self.main_service_file, self.config_file, self.models_file, self.utils_file],
                    "prompt": "Analyze this microservice architecture for scalability, maintainability, and design patterns",
                    "analysis_type": "architecture",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to get initial analysis response")
                return False

            # Parse and validate JSON response
            response1_data = self._parse_analyze_response(response1)
            if not response1_data:
                return False

            # Validate step 1 response structure - expect pause_for_analysis for next_step_required=True
            if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_analysis"):
                return False

            self.logger.info(f"    ✅ Step 1 successful, continuation_id: {continuation_id}")

            # Step 2: Deeper examination
            self.logger.info("    1.1.2: Step 2 - Architecture examination")
            response2, _ = self.call_mcp_tool(
                "analyze",
                {
                    "step": "Now examining the configuration and models modules to understand data architecture and configuration management patterns.",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Found several architectural concerns: direct Redis dependency in service class, global configuration instance, missing error handling in database operations, and mixed serialization logic in models.",
                    "files_checked": [self.main_service_file, self.config_file, self.models_file],
                    "relevant_files": [self.main_service_file, self.config_file, self.models_file],
                    "relevant_context": ["UserService", "AppConfig", "User.to_dict"],
                    "issues_found": [
                        {
                            "severity": "medium",
                            "description": "Direct dependency on global Redis client in UserService",
                        },
                        {"severity": "low", "description": "Global configuration instance could cause testing issues"},
                    ],
                    "confidence": "medium",
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue analysis to step 2")
                return False

            response2_data = self._parse_analyze_response(response2)
            if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_analysis"):
                return False

            # Check analysis status tracking
            analysis_status = response2_data.get("analysis_status", {})
            if analysis_status.get("files_checked", 0) < 3:
                self.logger.error("Files checked count not properly tracked")
                return False

            if analysis_status.get("insights_by_severity", {}).get("medium", 0) < 1:
                self.logger.error("Medium severity insights not properly tracked")
                return False

            if analysis_status.get("analysis_confidence") != "medium":
                self.logger.error("Confidence level not properly tracked")
                return False

            self.logger.info("    ✅ Step 2 successful with proper tracking")

            # Store continuation_id for next test
            self.analysis_continuation_id = continuation_id
            return True

        except Exception as e:
            self.logger.error(f"Single analysis session test failed: {e}")
            return False

    def _test_analysis_refocus_flow(self) -> bool:
        """Test analysis flow that requires refocusing to revise findings"""
        try:
            self.logger.info("  1.2: Testing analysis refocus workflow")

            # Start a new analysis for testing refocus behaviour
            self.logger.info("    1.2.1: Start analysis for refocus test")
            response1, continuation_id = self.call_mcp_tool(
                "analyze",
                {
                    "step": "Analyzing performance characteristics of the data processing pipeline",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Initial analysis suggests database queries might be the bottleneck",
                    "files_checked": [self.main_service_file],
                    "relevant_files": [self.main_service_file, self.utils_file],
                    "prompt": "Analyze performance bottlenecks in this microservice",
                    "analysis_type": "performance",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start refocus test analysis")
                return False

            # Step 2: Wrong direction
            self.logger.info("    1.2.2: Step 2 - Incorrect analysis path")
            response2, _ = self.call_mcp_tool(
                "analyze",
                {
                    "step": "Focusing on database optimization strategies",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Database queries seem reasonable, might be looking in wrong direction",
                    "files_checked": [self.main_service_file, self.models_file],
                    "relevant_files": [],
                    "relevant_context": [],
                    "issues_found": [],
                    "confidence": "low",
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            # Step 3: Adjust investigation path
            self.logger.info("    1.2.3: Step 3 - Refocus the analysis")
            response3, _ = self.call_mcp_tool(
                "analyze",
                {
                    "step": "Refocus - the performance issue might not be database related. Let me examine the caching and serialization patterns instead.",
                    "step_number": 3,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Found potential performance issues in JSON serialization and cache key generation patterns in utils module",
                    "files_checked": [self.utils_file, self.models_file],
                    "relevant_files": [self.utils_file, self.models_file],
                    "relevant_context": ["generate_cache_key", "User.to_dict", "sanitize_input"],
                    "issues_found": [
                        {"severity": "medium", "description": "JSON serialization in model classes could be optimized"},
                        {"severity": "low", "description": "Cache key generation lacks proper escaping"},
                    ],
                    "confidence": "medium",
                    "continuation_id": continuation_id,
                },
            )

            if not response3:
                self.logger.error("Failed to refocus analysis")
                return False

            response3_data = self._parse_analyze_response(response3)
            if not self._validate_step_response(response3_data, 3, 4, True, "pause_for_analysis"):
                return False

            self.logger.info("    ✅ Analysis refocus flow working correctly")
            return True

        except Exception as e:
            self.logger.error(f"Backtracking test failed: {e}")
            return False

    def _test_complete_analysis_with_expert(self) -> bool:
        """Test complete analysis ending with expert validation"""
        try:
            self.logger.info("  1.3: Testing complete analysis with expert validation")

            # Use the continuation from first test
            continuation_id = getattr(self, "analysis_continuation_id", None)
            if not continuation_id:
                # Start fresh if no continuation available
                self.logger.info("    1.3.0: Starting fresh analysis")
                response0, continuation_id = self.call_mcp_tool(
                    "analyze",
                    {
                        "step": "Analyzing the microservice architecture for improvement opportunities",
                        "step_number": 1,
                        "total_steps": 2,
                        "next_step_required": True,
                        "findings": "Found dependency injection and configuration management issues",
                        "files_checked": [self.main_service_file, self.config_file],
                        "relevant_files": [self.main_service_file, self.config_file],
                        "relevant_context": ["UserService", "AppConfig"],
                        "prompt": "Analyze architectural patterns and improvement opportunities",
                        "analysis_type": "architecture",
                    },
                )
                if not response0 or not continuation_id:
                    self.logger.error("Failed to start fresh analysis")
                    return False

            # Final step - trigger expert validation
            self.logger.info("    1.3.1: Final step - complete analysis")
            response_final, _ = self.call_mcp_tool(
                "analyze",
                {
                    "step": "Analysis complete. I have identified key architectural patterns and strategic improvement opportunities across scalability, maintainability, and performance dimensions.",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step - triggers expert validation
                    "findings": "Key findings: 1) Tight coupling via global dependencies, 2) Missing error handling and transaction management, 3) Mixed concerns in model classes, 4) Configuration management could be more flexible, 5) Opportunities for dependency injection and better separation of concerns.",
                    "files_checked": [self.main_service_file, self.config_file, self.models_file, self.utils_file],
                    "relevant_files": [self.main_service_file, self.config_file, self.models_file, self.utils_file],
                    "relevant_context": ["UserService", "AppConfig", "User", "validate_email"],
                    "issues_found": [
                        {"severity": "high", "description": "Tight coupling via global Redis client and configuration"},
                        {"severity": "medium", "description": "Missing transaction management in create_user"},
                        {"severity": "medium", "description": "Serialization logic mixed with model classes"},
                        {"severity": "low", "description": "Magic numbers and hardcoded values scattered throughout"},
                    ],
                    "confidence": "high",
                    "continuation_id": continuation_id,
                    "model": "flash",  # Use flash for expert validation
                },
            )

            if not response_final:
                self.logger.error("Failed to complete analysis")
                return False

            response_final_data = self._parse_analyze_response(response_final)
            if not response_final_data:
                return False

            # Validate final response structure - expect calling_expert_analysis for next_step_required=False
            if response_final_data.get("status") != "calling_expert_analysis":
                self.logger.error(
                    f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
                )
                return False

            if not response_final_data.get("analysis_complete"):
                self.logger.error("Expected analysis_complete=true for final step")
                return False

            # Check for expert analysis
            if "expert_analysis" not in response_final_data:
                self.logger.error("Missing expert_analysis in final response")
                return False

            expert_analysis = response_final_data.get("expert_analysis", {})

            # Check for expected analysis content (checking common patterns)
            analysis_text = json.dumps(expert_analysis, ensure_ascii=False).lower()

            # Look for architectural analysis indicators
            arch_indicators = ["architecture", "pattern", "coupling", "dependency", "scalability", "maintainability"]
            found_indicators = sum(1 for indicator in arch_indicators if indicator in analysis_text)

            if found_indicators >= 3:
                self.logger.info("    ✅ Expert analysis identified architectural patterns correctly")
            else:
                self.logger.warning(
                    f"    ⚠️ Expert analysis may not have fully analyzed architecture (found {found_indicators}/6 indicators)"
                )

            # Check complete analysis summary
            if "complete_analysis" not in response_final_data:
                self.logger.error("Missing complete_analysis in final response")
                return False

            complete_analysis = response_final_data["complete_analysis"]
            if not complete_analysis.get("relevant_context"):
                self.logger.error("Missing relevant context in complete analysis")
                return False

            if "UserService" not in complete_analysis["relevant_context"]:
                self.logger.error("Expected context not found in analysis summary")
                return False

            self.logger.info("    ✅ Complete analysis with expert validation successful")
            return True

        except Exception as e:
            self.logger.error(f"Complete analysis test failed: {e}")
            return False

    def _test_certain_confidence(self) -> bool:
        """Test final step analysis completion (analyze tool doesn't use confidence levels)"""
        try:
            self.logger.info("  1.4: Testing final step analysis completion")

            # Test final step - analyze tool doesn't use confidence levels, but we test completion
            self.logger.info("    1.4.1: Final step analysis")
            response_final, _ = self.call_mcp_tool(
                "analyze",
                {
                    "step": "I have completed a comprehensive analysis of the architectural patterns and improvement opportunities.",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,  # Final step - should trigger expert analysis
                    "findings": "Complete architectural analysis reveals: FastAPI microservice with clear separation needs, dependency injection opportunities, and performance optimization potential. Key patterns identified: service layer, repository-like data access, configuration management, and utility functions.",
                    "files_checked": [self.main_service_file, self.config_file, self.models_file, self.utils_file],
                    "relevant_files": [self.main_service_file, self.config_file, self.models_file, self.utils_file],
                    "relevant_context": ["UserService", "AppConfig", "User", "validate_email"],
                    "issues_found": [
                        {"severity": "high", "description": "Global dependencies create tight coupling"},
                        {"severity": "medium", "description": "Transaction management missing in critical operations"},
                    ],
                    "prompt": "Comprehensive architectural analysis",
                    "analysis_type": "architecture",
                    "model": "flash",
                },
            )

            if not response_final:
                self.logger.error("Failed to test final step analysis")
                return False

            response_final_data = self._parse_analyze_response(response_final)
            if not response_final_data:
                return False

            # Validate final step response - should trigger expert analysis
            expected_status = "calling_expert_analysis"
            if response_final_data.get("status") != expected_status:
                self.logger.error(f"Expected status '{expected_status}', got '{response_final_data.get('status')}'")
                return False

            # Check that expert analysis was performed
            expert_analysis = response_final_data.get("expert_analysis", {})
            if not expert_analysis:
                self.logger.error("Expert analysis should be present for final step")
                return False

            # Expert analysis should complete successfully
            if expert_analysis.get("status") != "analysis_complete":
                self.logger.error(
                    f"Expert analysis status: {expert_analysis.get('status')} (expected analysis_complete)"
                )
                return False

            self.logger.info("    ✅ Final step analysis completion working correctly")
            return True

        except Exception as e:
            self.logger.error(f"Final step analysis test failed: {e}")
            return False

    def _test_context_aware_file_embedding(self) -> bool:
        """Test context-aware file embedding optimization"""
        try:
            self.logger.info("  1.5: Testing context-aware file embedding")

            # Test 1: New conversation, intermediate step - should only reference files
            self.logger.info("    1.5.1: New conversation intermediate step (should reference only)")
            response1, continuation_id = self.call_mcp_tool(
                "analyze",
                {
                    "step": "Starting architectural analysis of microservice components",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,  # Intermediate step
                    "findings": "Initial analysis of service layer and configuration patterns",
                    "files_checked": [self.main_service_file, self.config_file],
                    "relevant_files": [self.main_service_file],  # This should be referenced, not embedded
                    "relevant_context": ["UserService"],
                    "issues_found": [{"severity": "medium", "description": "Direct Redis dependency in service class"}],
                    "confidence": "low",
                    "prompt": "Analyze service architecture patterns",
                    "analysis_type": "architecture",
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start context-aware file embedding test")
                return False

            response1_data = self._parse_analyze_response(response1)
            if not response1_data:
                return False

            # Check file context - should be reference_only for intermediate step
            file_context = response1_data.get("file_context", {})
            if file_context.get("type") != "reference_only":
                self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
                return False

            if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
                self.logger.error("Expected context optimization message for reference_only")
                return False

            self.logger.info("    ✅ Intermediate step correctly uses reference_only file context")

            # Test 2: Final step - should embed files for expert validation
            self.logger.info("    1.5.2: Final step (should embed files)")
            response2, _ = self.call_mcp_tool(
                "analyze",
                {
                    "step": "Analysis complete - identified key architectural patterns and improvement opportunities",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step - should embed files
                    "continuation_id": continuation_id,
                    "findings": "Complete analysis reveals dependency injection opportunities, configuration management improvements, and separation of concerns enhancements",
                    "files_checked": [self.main_service_file, self.config_file, self.models_file],
                    "relevant_files": [self.main_service_file, self.config_file],  # Should be fully embedded
                    "relevant_context": ["UserService", "AppConfig"],
                    "issues_found": [
                        {"severity": "high", "description": "Global dependencies create architectural coupling"},
                        {"severity": "medium", "description": "Configuration management lacks flexibility"},
                    ],
                    "confidence": "high",
                    "model": "flash",
                },
            )

            if not response2:
                self.logger.error("Failed to complete to final step")
                return False

            response2_data = self._parse_analyze_response(response2)
            if not response2_data:
                return False

            # Check file context - should be fully_embedded for final step
            file_context2 = response2_data.get("file_context", {})
            if file_context2.get("type") != "fully_embedded":
                self.logger.error(
                    f"Expected fully_embedded file context for final step, got: {file_context2.get('type')}"
                )
                return False

            if "Full file content embedded for expert analysis" not in file_context2.get("context_optimization", ""):
                self.logger.error("Expected expert analysis optimization message for fully_embedded")
                return False

            # Verify expert analysis was called for final step
            if response2_data.get("status") != "calling_expert_analysis":
                self.logger.error("Final step should trigger expert analysis")
                return False

            if "expert_analysis" not in response2_data:
                self.logger.error("Expert analysis should be present in final step")
                return False

            self.logger.info("    ✅ Context-aware file embedding test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Context-aware file embedding test failed: {e}")
            return False

    def _test_analysis_types(self) -> bool:
        """Test different analysis types (architecture, performance, security, quality)"""
        try:
            self.logger.info("  1.6: Testing different analysis types")

            # Test security analysis
            self.logger.info("    1.6.1: Security analysis")
            response_security, _ = self.call_mcp_tool(
                "analyze",
                {
                    "step": "Conducting security analysis of authentication and data handling patterns",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Security analysis reveals: password hashing implementation, input validation patterns, SQL injection prevention via parameterized queries, but missing input sanitization in some areas and weak default secret key handling.",
                    "files_checked": [self.main_service_file, self.utils_file],
                    "relevant_files": [self.main_service_file, self.utils_file],
                    "relevant_context": ["hash_password", "validate_email", "sanitize_input"],
                    "issues_found": [
                        {"severity": "critical", "description": "Weak default secret key in production detection"},
                        {"severity": "medium", "description": "Input sanitization not consistently applied"},
                    ],
                    "confidence": "high",
                    "prompt": "Analyze security patterns and vulnerabilities",
                    "analysis_type": "security",
                    "model": "flash",
                },
            )

            if not response_security:
                self.logger.error("Failed security analysis test")
                return False

            response_security_data = self._parse_analyze_response(response_security)
            if not response_security_data:
                return False

            # Check that security analysis was processed
            issues = response_security_data.get("complete_analysis", {}).get("issues_found", [])
            critical_issues = [issue for issue in issues if issue.get("severity") == "critical"]

            if not critical_issues:
                self.logger.warning("Security analysis should have identified critical security issues")
            else:
                self.logger.info("    ✅ Security analysis identified critical issues")

            # Test quality analysis
            self.logger.info("    1.6.2: Quality analysis")
            response_quality, _ = self.call_mcp_tool(
                "analyze",
                {
                    "step": "Conducting code quality analysis focusing on maintainability and best practices",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Code quality analysis shows: good use of type hints, proper error handling in some areas but missing in others, mixed separation of concerns, and opportunities for better abstraction.",
                    "files_checked": [self.models_file, self.utils_file],
                    "relevant_files": [self.models_file, self.utils_file],
                    "relevant_context": ["User.to_dict", "ValidationError", "PerformanceTimer"],
                    "issues_found": [
                        {"severity": "medium", "description": "Serialization logic mixed with model classes"},
                        {"severity": "low", "description": "Inconsistent error handling patterns"},
                    ],
                    "confidence": "high",
                    "prompt": "Analyze code quality and maintainability patterns",
                    "analysis_type": "quality",
                    "model": "flash",
                },
            )

            if not response_quality:
                self.logger.error("Failed quality analysis test")
                return False

            response_quality_data = self._parse_analyze_response(response_quality)
            if not response_quality_data:
                return False

            # Verify quality analysis was processed
            quality_context = response_quality_data.get("complete_analysis", {}).get("relevant_context", [])
            if not any("User" in ctx for ctx in quality_context):
                self.logger.warning("Quality analysis should have analyzed model classes")
            else:
                self.logger.info("    ✅ Quality analysis examined relevant code elements")

            self.logger.info("    ✅ Different analysis types test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Analysis types test failed: {e}")
            return False

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool in-process - override for analyze-specific response handling"""
        # Use in-process implementation to maintain conversation memory
        response_text, _ = self.call_mcp_tool_direct(tool_name, params)

        if not response_text:
            return None, None

        # Extract continuation_id from analyze response specifically
        continuation_id = self._extract_analyze_continuation_id(response_text)

        return response_text, continuation_id

    def _extract_analyze_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from analyze response"""
        try:
            # Parse the response
            response_data = json.loads(response_text)
            return response_data.get("continuation_id")

        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for analyze continuation_id: {e}")
            return None

    def _parse_analyze_response(self, response_text: str) -> dict:
        """Parse analyze tool JSON response"""
        try:
            # Parse the response - it should be direct JSON
            return json.loads(response_text)

        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse analyze response as JSON: {e}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}

    def _validate_step_response(
        self,
        response_data: dict,
        expected_step: int,
        expected_total: int,
        expected_next_required: bool,
        expected_status: str,
    ) -> bool:
        """Validate an analyze investigation step response structure"""
        try:
            # Check status
            if response_data.get("status") != expected_status:
                self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
                return False

            # Check step number
            if response_data.get("step_number") != expected_step:
                self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
                return False

            # Check total steps
            if response_data.get("total_steps") != expected_total:
                self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
                return False

            # Check next_step_required
            if response_data.get("next_step_required") != expected_next_required:
                self.logger.error(
                    f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
                )
                return False

            # Check analysis_status exists
            if "analysis_status" not in response_data:
                self.logger.error("Missing analysis_status in response")
                return False

            # Check next_steps guidance
            if not response_data.get("next_steps"):
                self.logger.error("Missing next_steps guidance in response")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Error validating step response: {e}")
            return False
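
    # For reference, a step response that passes the checks above looks roughly
    # like this (values are illustrative assumptions, not taken from a real run):
    # {
    #     "status": "pause_for_analysis",
    #     "step_number": 1,
    #     "total_steps": 4,
    #     "next_step_required": true,
    #     "analysis_status": {"files_checked": 3, "analysis_confidence": "medium"},
    #     "next_steps": "Continue examining the remaining modules..."
    # }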

```

--------------------------------------------------------------------------------
/simulator_tests/test_refactor_validation.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
Refactor Tool Validation Test

Tests the refactor tool's capabilities using the new workflow architecture.
This validates the step-by-step refactoring analysis pattern with expert validation.
"""

import json
from typing import Optional

from .conversation_base_test import ConversationBaseTest


class RefactorValidationTest(ConversationBaseTest):
    """Test refactor tool with new workflow architecture"""

    @property
    def test_name(self) -> str:
        return "refactor_validation"

    @property
    def test_description(self) -> str:
        return "Refactor tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Test refactor tool capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: Refactor tool validation (new architecture)")

            # Create test files with refactoring opportunities
            self._create_refactoring_test_code()

            # Test 1: Single refactoring analysis session with multiple steps
            if not self._test_single_refactoring_session():
                return False

            # Test 2: Refactoring analysis requiring refocus
            if not self._test_refactoring_refocus_flow():
                return False

            # Test 3: Complete refactoring analysis with expert analysis
            if not self._test_complete_refactoring_with_analysis():
                return False

            # Test 4: Certain confidence with complete refactoring
            if not self._test_certain_confidence_complete_refactoring():
                return False

            # Test 5: Context-aware file embedding for refactoring
            if not self._test_context_aware_refactoring_file_embedding():
                return False

            # Test 6: Different refactor types
            if not self._test_different_refactor_types():
                return False

            self.logger.info("  ✅ All refactor validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"Refactor validation test failed: {e}")
            return False

    def _create_refactoring_test_code(self):
        """Create test files with various refactoring opportunities"""
        # Create a Python file with obvious code smells and decomposition opportunities
        refactor_code = """#!/usr/bin/env python3
import json
import os
from datetime import datetime

# Code smell: Large class with multiple responsibilities
class DataProcessorManager:
    def __init__(self, config_file):
        self.config = self._load_config(config_file)
        self.processed_count = 0
        self.error_count = 0
        self.log_file = "processing.log"

    def _load_config(self, config_file):
        \"\"\"Load configuration from file\"\"\"
        with open(config_file, 'r') as f:
            return json.load(f)

    # Code smell: Long method doing too many things (decompose opportunity)
    def process_user_data(self, user_data, validation_rules, output_format):
        \"\"\"Process user data with validation and formatting\"\"\"
        # Validation logic
        if not user_data:
            print("Error: No user data")  # Code smell: print instead of logging
            return None

        if not isinstance(user_data, dict):
            print("Error: Invalid data format")
            return None

        # Check required fields
        required_fields = ['name', 'email', 'age']
        for field in required_fields:
            if field not in user_data:
                print(f"Error: Missing field {field}")
                return None

        # Apply validation rules
        for rule in validation_rules:
            if rule['field'] == 'email':
                if '@' not in user_data['email']:  # Code smell: simple validation
                    print("Error: Invalid email")
                    return None
            elif rule['field'] == 'age':
                if user_data['age'] < 18:  # Code smell: magic number
                    print("Error: Age too young")
                    return None

        # Data processing
        processed_data = {}
        processed_data['full_name'] = user_data['name'].title()
        processed_data['email_domain'] = user_data['email'].split('@')[1]
        processed_data['age_category'] = 'adult' if user_data['age'] >= 18 else 'minor'

        # Code smell: Duplicate date formatting logic
        if output_format == 'json':
            processed_data['processed_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            result = json.dumps(processed_data, ensure_ascii=False)
        elif output_format == 'csv':
            processed_data['processed_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            result = f"{processed_data['full_name']},{processed_data['email_domain']},{processed_data['age_category']}"
        else:
            processed_data['processed_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            result = str(processed_data)

        # Logging and statistics
        self.processed_count += 1
        with open(self.log_file, 'a') as f:  # Code smell: file handling without context
            f.write(f"Processed: {user_data['name']} at {datetime.now()}\\n")

        return result

    # Code smell: Another long method (decompose opportunity)
    def batch_process_files(self, file_list, output_dir):
        \"\"\"Process multiple files in batch\"\"\"
        results = []

        for file_path in file_list:
            # File validation
            if not os.path.exists(file_path):
                print(f"Error: File {file_path} not found")
                continue

            if not file_path.endswith('.json'):
                print(f"Error: File {file_path} is not JSON")
                continue

            # Read and process file
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)

                # Code smell: Nested loops and complex logic
                for user_id, user_data in data.items():
                    if isinstance(user_data, dict):
                        # Duplicate validation logic from process_user_data
                        if 'name' in user_data and 'email' in user_data:
                            if '@' in user_data['email']:
                                # More processing...
                                processed = {
                                    'id': user_id,
                                    'name': user_data['name'].title(),
                                    'email': user_data['email'].lower()
                                }
                                results.append(processed)

                # Write output file
                output_file = os.path.join(output_dir, f"processed_{os.path.basename(file_path)}")
                with open(output_file, 'w') as f:
                    json.dump(results, f, indent=2)

            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                self.error_count += 1

        return results

    # Code smell: Method doing file I/O and business logic
    def generate_report(self):
        \"\"\"Generate processing report\"\"\"
        report_data = {
            'total_processed': self.processed_count,
            'total_errors': self.error_count,
            'success_rate': (self.processed_count / (self.processed_count + self.error_count)) * 100 if (self.processed_count + self.error_count) > 0 else 0,
            'generated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        # Write to multiple formats (code smell: duplicate logic)
        with open('report.json', 'w') as f:
            json.dump(report_data, f, indent=2)

        with open('report.txt', 'w') as f:
            f.write(f"Processing Report\\n")
            f.write(f"================\\n")
            f.write(f"Total Processed: {report_data['total_processed']}\\n")
            f.write(f"Total Errors: {report_data['total_errors']}\\n")
            f.write(f"Success Rate: {report_data['success_rate']:.2f}%\\n")
            f.write(f"Generated: {report_data['generated_at']}\\n")

        return report_data

# Code smell: Utility functions that could be in a separate module
def validate_email(email):
    \"\"\"Simple email validation\"\"\"
    return '@' in email and '.' in email

def format_name(name):
    \"\"\"Format name to title case\"\"\"
    return name.title() if name else ""

def calculate_age_category(age):
    \"\"\"Calculate age category\"\"\"
    if age < 18:
        return 'minor'
    elif age < 65:
        return 'adult'
    else:
        return 'senior'
"""

        # Create test file with refactoring opportunities
        self.refactor_file = self.create_additional_test_file("data_processor_manager.py", refactor_code)
        self.logger.info(f"  ✅ Created test file with refactoring opportunities: {self.refactor_file}")

        # Create a smaller file for focused testing
        small_refactor_code = """#!/usr/bin/env python3

# Code smell: God function
def process_everything(data, config, logger):
    \"\"\"Function that does too many things\"\"\"
    # Validation
    if not data:
        print("No data")  # Should use logger
        return None

    # Processing
    result = []
    for item in data:
        if item > 5:  # Magic number
            result.append(item * 2)  # Magic number

    # Logging
    print(f"Processed {len(result)} items")

    # File I/O
    with open("output.txt", "w") as f:
        f.write(str(result))

    return result

# Modernization opportunity: Could use dataclass
class UserData:
    def __init__(self, name, email, age):
        self.name = name
        self.email = email
        self.age = age

    def to_dict(self):
        return {
            'name': self.name,
            'email': self.email,
            'age': self.age
        }
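
# A dataclass-based sketch of the modernization hinted at above (illustrative,
# not part of the fixture under test):
#     from dataclasses import dataclass, asdict
#
#     @dataclass
#     class UserData:
#         name: str
#         email: str
#         age: int
#
#     asdict(UserData("Ada", "ada@example.com", 36))  # replaces to_dict()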
"""

        self.small_refactor_file = self.create_additional_test_file("simple_processor.py", small_refactor_code)
        self.logger.info(f"  ✅ Created small test file: {self.small_refactor_file}")

    def _test_single_refactoring_session(self) -> bool:
        """Test a complete refactoring analysis session with multiple steps"""
        try:
            self.logger.info("  1.1: Testing single refactoring analysis session")

            # Step 1: Start refactoring analysis
            self.logger.info("    1.1.1: Step 1 - Initial refactoring investigation")
            response1, continuation_id = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Starting refactoring analysis of the data processor code. Let me examine the code structure and identify opportunities for decomposition, code smell fixes, and modernization.",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Initial scan shows a large DataProcessorManager class with multiple responsibilities. The class handles configuration, data processing, file I/O, and logging - violating single responsibility principle.",
                    "files_checked": [self.refactor_file],
                    "relevant_files": [self.refactor_file],
                    "confidence": "incomplete",
                    "refactor_type": "codesmells",
                    "focus_areas": ["maintainability", "readability"],
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to get initial refactoring response")
                return False

            # Parse and validate JSON response
            response1_data = self._parse_refactor_response(response1)
            if not response1_data:
                return False

            # Validate step 1 response structure - expect pause_for_refactoring_analysis for next_step_required=True
            if not self._validate_refactoring_step_response(
                response1_data, 1, 4, True, "pause_for_refactoring_analysis"
            ):
                return False

            self.logger.info(f"    ✅ Step 1 successful, continuation_id: {continuation_id}")

            # Step 2: Deeper analysis
            self.logger.info("    1.1.2: Step 2 - Detailed code analysis")
            response2, _ = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Now examining the specific methods and identifying concrete refactoring opportunities. Found multiple code smells and decomposition needs.",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Identified several major issues: 1) process_user_data method is 50+ lines doing validation, processing, and I/O. 2) Duplicate validation logic. 3) Magic numbers (18 for age). 4) print statements instead of proper logging. 5) File handling without proper context management.",
                    "files_checked": [self.refactor_file],
                    "relevant_files": [self.refactor_file],
                    "relevant_context": [
                        "DataProcessorManager.process_user_data",
                        "DataProcessorManager.batch_process_files",
                    ],
                    "issues_found": [
                        {
                            "type": "codesmells",
                            "severity": "high",
                            "description": "Long method: process_user_data does too many things",
                        },
                        {
                            "type": "codesmells",
                            "severity": "medium",
                            "description": "Magic numbers: age validation uses hardcoded 18",
                        },
                        {
                            "type": "codesmells",
                            "severity": "medium",
                            "description": "Duplicate validation logic in multiple places",
                        },
                    ],
                    "confidence": "partial",
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue refactoring analysis to step 2")
                return False

            response2_data = self._parse_refactor_response(response2)
            if not self._validate_refactoring_step_response(
                response2_data, 2, 4, True, "pause_for_refactoring_analysis"
            ):
                return False

            # Check refactoring status tracking
            refactoring_status = response2_data.get("refactoring_status", {})
            if refactoring_status.get("files_checked", 0) < 1:
                self.logger.error("Files checked count not properly tracked")
                return False

            opportunities_by_type = refactoring_status.get("opportunities_by_type", {})
            if "codesmells" not in opportunities_by_type:
                self.logger.error("Code smells not properly tracked in opportunities")
                return False

            if refactoring_status.get("refactor_confidence") != "partial":
                self.logger.error("Refactor confidence not properly tracked")
                return False

            self.logger.info("    ✅ Step 2 successful with proper refactoring tracking")

            # Store continuation_id for next test
            self.refactoring_continuation_id = continuation_id
            return True

        except Exception as e:
            self.logger.error(f"Single refactoring session test failed: {e}")
            return False

    def _test_refactoring_refocus_flow(self) -> bool:
        """Test refactoring analysis that shifts focus mid-investigation"""
        try:
            self.logger.info("  1.2: Testing refactoring analysis refocus workflow")

            # Start a new refactoring analysis for testing refocus behaviour
            self.logger.info("    1.2.1: Start refactoring analysis for refocus test")
            response1, continuation_id = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Analyzing code for decomposition opportunities",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Initial focus on class-level decomposition",
                    "files_checked": [self.small_refactor_file],
                    "relevant_files": [self.small_refactor_file],
                    "confidence": "incomplete",
                    "refactor_type": "decompose",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start refocus test refactoring analysis")
                return False

            # Step 2: Wrong direction
            self.logger.info("    1.2.2: Step 2 - Wrong refactoring focus")
            response2, _ = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Focusing on class decomposition strategies",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Class structure seems reasonable, might be looking in wrong direction",
                    "files_checked": [self.small_refactor_file],
                    "relevant_files": [],
                    "confidence": "incomplete",
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            # Step 3: Refocus from step 2
            self.logger.info("    1.2.3: Step 3 - Refocus on function decomposition")
            response3, _ = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Refocusing - the real decomposition opportunity is the god function process_everything. Let me analyze function-level refactoring instead.",
                    "step_number": 3,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Found the main decomposition opportunity: process_everything function does validation, processing, logging, and file I/O. Should be split into separate functions with single responsibilities.",
                    "files_checked": [self.small_refactor_file],
                    "relevant_files": [self.small_refactor_file],
                    "relevant_context": ["process_everything"],
                    "issues_found": [
                        {
                            "type": "decompose",
                            "severity": "high",
                            "description": "God function: process_everything has multiple responsibilities",
                        },
                        {
                            "type": "codesmells",
                            "severity": "medium",
                            "description": "Magic numbers in processing logic",
                        },
                    ],
                    "confidence": "partial",
                    "continuation_id": continuation_id,
                },
            )

            if not response3:
                self.logger.error("Failed to refocus")
                return False

            response3_data = self._parse_refactor_response(response3)
            if not self._validate_refactoring_step_response(
                response3_data, 3, 4, True, "pause_for_refactoring_analysis"
            ):
                return False

            self.logger.info("    ✅ Refocus working correctly for refactoring analysis")
            return True

        except Exception as e:
            self.logger.error(f"Refocusing test failed: {e}")
            return False

    def _test_complete_refactoring_with_analysis(self) -> bool:
        """Test complete refactoring analysis ending with expert analysis"""
        try:
            self.logger.info("  1.3: Testing complete refactoring analysis with expert analysis")

            # Use the continuation from first test
            continuation_id = getattr(self, "refactoring_continuation_id", None)
            if not continuation_id:
                # Start fresh if no continuation available
                self.logger.info("    1.3.0: Starting fresh refactoring analysis")
                response0, continuation_id = self.call_mcp_tool(
                    "refactor",
                    {
                        "step": "Analyzing the data processor for comprehensive refactoring opportunities",
                        "step_number": 1,
                        "total_steps": 2,
                        "next_step_required": True,
                        "findings": "Found multiple refactoring opportunities in DataProcessorManager",
                        "files_checked": [self.refactor_file],
                        "relevant_files": [self.refactor_file],
                        "relevant_context": ["DataProcessorManager.process_user_data"],
                        "confidence": "partial",
                        "refactor_type": "codesmells",
                    },
                )
                if not response0 or not continuation_id:
                    self.logger.error("Failed to start fresh refactoring analysis")
                    return False

            # Final step - trigger expert analysis
            self.logger.info("    1.3.1: Final step - complete refactoring analysis")
            response_final, _ = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Refactoring analysis complete. Identified comprehensive opportunities for code smell fixes, decomposition, and modernization across the DataProcessorManager class.",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step - triggers expert analysis
                    "findings": "Complete analysis shows: 1) Large class violating SRP, 2) Long methods needing decomposition, 3) Duplicate validation logic, 4) Magic numbers, 5) Poor error handling with print statements, 6) File I/O mixed with business logic. All major refactoring opportunities identified with specific line locations.",
                    "files_checked": [self.refactor_file],
                    "relevant_files": [self.refactor_file],
                    "relevant_context": [
                        "DataProcessorManager.process_user_data",
                        "DataProcessorManager.batch_process_files",
                        "DataProcessorManager.generate_report",
                    ],
                    "issues_found": [
                        {
                            "type": "decompose",
                            "severity": "critical",
                            "description": "Large class with multiple responsibilities",
                        },
                        {
                            "type": "codesmells",
                            "severity": "high",
                            "description": "Long method: process_user_data (50+ lines)",
                        },
                        {"type": "codesmells", "severity": "high", "description": "Duplicate validation logic"},
                        {"type": "codesmells", "severity": "medium", "description": "Magic numbers in age validation"},
                        {
                            "type": "modernize",
                            "severity": "medium",
                            "description": "Use proper logging instead of print statements",
                        },
                    ],
                    "confidence": "partial",  # Use partial to trigger expert analysis
                    "continuation_id": continuation_id,
                    "model": "flash",  # Use flash for expert analysis
                },
            )

            if not response_final:
                self.logger.error("Failed to complete refactoring analysis")
                return False

            response_final_data = self._parse_refactor_response(response_final)
            if not response_final_data:
                return False

            # Validate final response structure - expect calling_expert_analysis or files_required_to_continue
            expected_statuses = ["calling_expert_analysis", "files_required_to_continue"]
            actual_status = response_final_data.get("status")
            if actual_status not in expected_statuses:
                self.logger.error(f"Expected status to be one of {expected_statuses}, got '{actual_status}'")
                return False

            if not response_final_data.get("refactoring_complete"):
                self.logger.error("Expected refactoring_complete=true for final step")
                return False

            # Check for expert analysis or content (depending on status)
            if actual_status == "calling_expert_analysis":
                if "expert_analysis" not in response_final_data:
                    self.logger.error("Missing expert_analysis in final response")
                    return False
                expert_analysis = response_final_data.get("expert_analysis", {})
                analysis_content = json.dumps(expert_analysis, ensure_ascii=False).lower()
            elif actual_status == "files_required_to_continue":
                # For files_required_to_continue, analysis is in content field
                if "content" not in response_final_data:
                    self.logger.error("Missing content in files_required_to_continue response")
                    return False
                expert_analysis = {"content": response_final_data.get("content", "")}
                analysis_content = response_final_data.get("content", "").lower()
            else:
                self.logger.error(f"Unexpected status: {actual_status}")
                return False

            # Check for expected analysis content (checking common patterns)
            analysis_text = analysis_content

            # Look for refactoring identification
            refactor_indicators = ["refactor", "decompose", "code smell", "method", "class", "responsibility"]
            found_indicators = sum(1 for indicator in refactor_indicators if indicator in analysis_text)

            if found_indicators >= 3:
                self.logger.info("    ✅ Expert analysis identified refactoring opportunities correctly")
            else:
                self.logger.warning(
                    f"    ⚠️ Expert analysis may not have fully identified refactoring opportunities (found {found_indicators}/6 indicators)"
                )

            # Check complete refactoring summary
            if "complete_refactoring" not in response_final_data:
                self.logger.error("Missing complete_refactoring in final response")
                return False

            complete_refactoring = response_final_data["complete_refactoring"]
            if not complete_refactoring.get("relevant_context"):
                self.logger.error("Missing relevant context in complete refactoring")
                return False

            if "DataProcessorManager.process_user_data" not in complete_refactoring["relevant_context"]:
                self.logger.error("Expected method not found in refactoring summary")
                return False

            self.logger.info("    ✅ Complete refactoring analysis with expert analysis successful")
            return True

        except Exception as e:
            self.logger.error(f"Complete refactoring analysis test failed: {e}")
            return False

    def _test_certain_confidence_complete_refactoring(self) -> bool:
        """Test complete confidence - should skip expert analysis"""
        try:
            self.logger.info("  1.4: Testing complete confidence behavior")

            # Test complete confidence - should skip expert analysis
            self.logger.info("    1.4.1: Complete confidence refactoring")
            response_certain, _ = self.call_mcp_tool(
                "refactor",
                {
                    "step": "I have completed comprehensive refactoring analysis with 100% certainty: identified all major opportunities including decomposition, code smells, and modernization.",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,  # Final step
                    "findings": "Complete refactoring analysis: 1) DataProcessorManager class needs decomposition into separate responsibilities, 2) process_user_data method needs breaking into validation, processing, and formatting functions, 3) Replace print statements with proper logging, 4) Extract magic numbers to constants, 5) Use dataclasses for modern Python patterns.",
                    "files_checked": [self.small_refactor_file],
                    "relevant_files": [self.small_refactor_file],
                    "relevant_context": ["process_everything", "UserData"],
                    "issues_found": [
                        {"type": "decompose", "severity": "high", "description": "God function needs decomposition"},
                        {"type": "modernize", "severity": "medium", "description": "Use dataclass for UserData"},
                        {"type": "codesmells", "severity": "medium", "description": "Replace print with logging"},
                    ],
                    "confidence": "complete",  # Complete confidence should skip expert analysis
                    "refactor_type": "codesmells",
                    "model": "flash",
                },
            )

            if not response_certain:
                self.logger.error("Failed to test certain confidence with complete refactoring")
                return False

            response_certain_data = self._parse_refactor_response(response_certain)
            if not response_certain_data:
                return False

            # Validate certain confidence response - should skip expert analysis
            if response_certain_data.get("status") != "refactoring_analysis_complete_ready_for_implementation":
                self.logger.error(
                    f"Expected status 'refactoring_analysis_complete_ready_for_implementation', got '{response_certain_data.get('status')}'"
                )
                return False

            if not response_certain_data.get("skip_expert_analysis"):
                self.logger.error("Expected skip_expert_analysis=true for complete confidence")
                return False

            expert_analysis = response_certain_data.get("expert_analysis", {})
            if expert_analysis.get("status") != "skipped_due_to_complete_refactoring_confidence":
                self.logger.error("Expert analysis should be skipped for complete confidence")
                return False

            self.logger.info("    ✅ Complete confidence behavior working correctly")
            return True

        except Exception as e:
            self.logger.error(f"Complete confidence test failed: {e}")
            return False

    def _test_context_aware_refactoring_file_embedding(self) -> bool:
        """Test context-aware file embedding optimization for refactoring workflow"""
        try:
            self.logger.info("  1.5: Testing context-aware file embedding for refactoring")

            # Create multiple test files for context testing
            utils_content = """#!/usr/bin/env python3
# Utility functions with refactoring opportunities

def calculate_total(items):
    \"\"\"Calculate total with magic numbers\"\"\"
    total = 0
    for item in items:
        if item > 10:  # Magic number
            total += item * 1.1  # Magic number for tax
    return total

def format_output(data, format_type):
    \"\"\"Format output - duplicate logic\"\"\"
    if format_type == 'json':
        import json
        return json.dumps(data, ensure_ascii=False)
    elif format_type == 'csv':
        return ','.join(str(v) for v in data.values())
    else:
        return str(data)
"""

            helpers_content = """#!/usr/bin/env python3
# Helper functions that could be modernized

class DataContainer:
    \"\"\"Simple data container - could use dataclass\"\"\"
    def __init__(self, name, value, category):
        self.name = name
        self.value = value
        self.category = category

    def to_dict(self):
        return {
            'name': self.name,
            'value': self.value,
            'category': self.category
        }
"""

            # Create test files
            utils_file = self.create_additional_test_file("utils.py", utils_content)
            helpers_file = self.create_additional_test_file("helpers.py", helpers_content)

            # Test 1: New conversation, intermediate step - should only reference files
            self.logger.info("    1.5.1: New conversation intermediate step (should reference only)")
            response1, continuation_id = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Starting refactoring analysis of utility modules",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,  # Intermediate step
                    "findings": "Initial analysis of utility and helper modules for refactoring opportunities",
                    "files_checked": [utils_file, helpers_file],
                    "relevant_files": [utils_file],  # This should be referenced, not embedded
                    "relevant_context": ["calculate_total"],
                    "confidence": "incomplete",
                    "refactor_type": "codesmells",
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start context-aware file embedding test")
                return False

            response1_data = self._parse_refactor_response(response1)
            if not response1_data:
                return False

            # Check file context - should be reference_only for intermediate step
            file_context = response1_data.get("file_context", {})
            if file_context.get("type") != "reference_only":
                self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
                return False

            if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
                self.logger.error("Expected context optimization message for reference_only")
                return False

            self.logger.info("    ✅ Intermediate step correctly uses reference_only file context")

            # Test 2: Final step - should embed files for expert analysis
            self.logger.info("    1.5.2: Final step (should embed files)")
            response2, _ = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Refactoring analysis complete - identified all opportunities",
                    "step_number": 3,
                    "total_steps": 3,
                    "next_step_required": False,  # Final step - should embed files
                    "continuation_id": continuation_id,
                    "findings": "Complete analysis: Found magic numbers in calculate_total, duplicate formatting logic, and modernization opportunity with DataContainer class that could use dataclass.",
                    "files_checked": [utils_file, helpers_file],
                    "relevant_files": [utils_file, helpers_file],  # Should be fully embedded
                    "relevant_context": ["calculate_total", "format_output", "DataContainer"],
                    "issues_found": [
                        {"type": "codesmells", "severity": "medium", "description": "Magic numbers in calculate_total"},
                        {"type": "modernize", "severity": "low", "description": "DataContainer could use dataclass"},
                        {"type": "codesmells", "severity": "low", "description": "Duplicate formatting logic"},
                    ],
                    "confidence": "partial",  # Use partial to trigger expert analysis
                    "model": "flash",
                },
            )

            if not response2:
                self.logger.error("Failed to complete to final step")
                return False

            response2_data = self._parse_refactor_response(response2)
            if not response2_data:
                return False

            # Check file context - should be fully_embedded for final step
            file_context2 = response2_data.get("file_context", {})
            if file_context2.get("type") != "fully_embedded":
                self.logger.error(
                    f"Expected fully_embedded file context for final step, got: {file_context2.get('type')}"
                )
                return False

            if "Full file content embedded for expert analysis" not in file_context2.get("context_optimization", ""):
                self.logger.error("Expected expert analysis optimization message for fully_embedded")
                return False

            self.logger.info("    ✅ Final step correctly uses fully_embedded file context")

            # Verify expert analysis was called for final step (or files_required_to_continue)
            expected_statuses = ["calling_expert_analysis", "files_required_to_continue"]
            actual_status = response2_data.get("status")
            if actual_status not in expected_statuses:
                self.logger.error(f"Expected one of {expected_statuses}, got: {actual_status}")
                return False

            # Handle expert analysis based on status
            if actual_status == "calling_expert_analysis" and "expert_analysis" not in response2_data:
                self.logger.error("Expert analysis should be present in final step with calling_expert_analysis")
                return False

            self.logger.info("    ✅ Context-aware file embedding test for refactoring completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Context-aware refactoring file embedding test failed: {e}")
            return False

    def _test_different_refactor_types(self) -> bool:
        """Test different refactor types (decompose, modernize, organization)"""
        try:
            self.logger.info("  1.6: Testing different refactor types")

            # Test decompose type
            self.logger.info("    1.6.1: Testing decompose refactor type")
            response_decompose, _ = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Analyzing code for decomposition opportunities in large functions and classes",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Found large DataProcessorManager class that violates single responsibility principle and long process_user_data method that needs decomposition.",
                    "files_checked": [self.refactor_file],
                    "relevant_files": [self.refactor_file],
                    "relevant_context": ["DataProcessorManager", "DataProcessorManager.process_user_data"],
                    "issues_found": [
                        {
                            "type": "decompose",
                            "severity": "critical",
                            "description": "Large class with multiple responsibilities",
                        },
                        {
                            "type": "decompose",
                            "severity": "high",
                            "description": "Long method doing validation, processing, and I/O",
                        },
                    ],
                    "confidence": "complete",
                    "refactor_type": "decompose",
                    "model": "flash",
                },
            )

            if not response_decompose:
                self.logger.error("Failed to test decompose refactor type")
                return False

            response_decompose_data = self._parse_refactor_response(response_decompose)

            # Check that decompose type is properly tracked
            refactoring_status = response_decompose_data.get("refactoring_status", {})
            opportunities_by_type = refactoring_status.get("opportunities_by_type", {})
            if "decompose" not in opportunities_by_type:
                self.logger.error("Decompose opportunities not properly tracked")
                return False

            self.logger.info("    ✅ Decompose refactor type working correctly")

            # Test modernize type
            self.logger.info("    1.6.2: Testing modernize refactor type")
            response_modernize, _ = self.call_mcp_tool(
                "refactor",
                {
                    "step": "Analyzing code for modernization opportunities using newer Python features",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Found opportunities to use dataclasses, f-strings, pathlib, and proper logging instead of print statements.",
                    "files_checked": [self.small_refactor_file],
                    "relevant_files": [self.small_refactor_file],
                    "relevant_context": ["UserData", "process_everything"],
                    "issues_found": [
                        {
                            "type": "modernize",
                            "severity": "medium",
                            "description": "UserData class could use @dataclass decorator",
                        },
                        {
                            "type": "modernize",
                            "severity": "medium",
                            "description": "Replace print statements with proper logging",
                        },
                        {"type": "modernize", "severity": "low", "description": "Use pathlib for file operations"},
                    ],
                    "confidence": "complete",
                    "refactor_type": "modernize",
                    "model": "flash",
                },
            )

            if not response_modernize:
                self.logger.error("Failed to test modernize refactor type")
                return False

            response_modernize_data = self._parse_refactor_response(response_modernize)

            # Check that modernize type is properly tracked
            refactoring_status = response_modernize_data.get("refactoring_status", {})
            opportunities_by_type = refactoring_status.get("opportunities_by_type", {})
            if "modernize" not in opportunities_by_type:
                self.logger.error("Modernize opportunities not properly tracked")
                return False

            self.logger.info("    ✅ Modernize refactor type working correctly")

            self.logger.info("    ✅ Different refactor types test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Different refactor types test failed: {e}")
            return False

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool in-process - override for -specific response handling"""
        # Use in-process implementation to maintain conversation memory
        response_text, _ = self.call_mcp_tool_direct(tool_name, params)

        if not response_text:
            return None, None

        # Extract continuation_id from refactor response specifically
        continuation_id = self._extract_refactor_continuation_id(response_text)

        return response_text, continuation_id

    def _extract_refactor_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from refactor response"""
        try:
            # Parse the response
            response_data = json.loads(response_text)
            return response_data.get("continuation_id")

        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for refactor continuation_id: {e}")
            return None

    def _parse_refactor_response(self, response_text: str) -> dict:
        """Parse refactor tool JSON response"""
        try:
            # Parse the response - it should be direct JSON
            return json.loads(response_text)

        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse refactor response as JSON: {e}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}

    def _validate_refactoring_step_response(
        self,
        response_data: dict,
        expected_step: int,
        expected_total: int,
        expected_next_required: bool,
        expected_status: str,
    ) -> bool:
        """Validate a refactor investigation step response structure"""
        try:
            # Check status
            if response_data.get("status") != expected_status:
                self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
                return False

            # Check step number
            if response_data.get("step_number") != expected_step:
                self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
                return False

            # Check total steps
            if response_data.get("total_steps") != expected_total:
                self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
                return False

            # Check next_step_required
            if response_data.get("next_step_required") != expected_next_required:
                self.logger.error(
                    f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
                )
                return False

            # Check refactoring_status exists
            if "refactoring_status" not in response_data:
                self.logger.error("Missing refactoring_status in response")
                return False

            # Check next_steps guidance
            if not response_data.get("next_steps"):
                self.logger.error("Missing next_steps guidance in response")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Error validating refactoring step response: {e}")
            return False

```
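
Taken together, the parsing and validation helpers above imply the rough response shape sketched below. This is a minimal sketch inferred from the fields the test asserts on, not the tool's documented output format; the field values are illustrative placeholders only.

```python
# Sketch of an intermediate refactor step response, inferred from the assertions
# in _validate_refactoring_step_response and related checks (illustrative only).
intermediate_step_response = {
    "status": "pause_for_refactoring_analysis",
    "step_number": 1,
    "total_steps": 4,
    "next_step_required": True,
    "continuation_id": "example-continuation-id",   # read by call_mcp_tool()
    "refactoring_status": {
        "opportunities_by_type": {"codesmells": 2},  # keyed by refactor_type
    },
    "next_steps": "Guidance for the next investigation step",
    "file_context": {
        "type": "reference_only",
        "context_optimization": "Files referenced but not embedded ...",
    },
}

# Final steps (next_step_required=False) additionally carry, per the tests:
#   "refactoring_complete": True,
#   "complete_refactoring": {"relevant_context": [...]},
#   and either "expert_analysis" (status "calling_expert_analysis") or
#   "content" (status "files_required_to_continue").
```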

--------------------------------------------------------------------------------
/simulator_tests/test_precommitworkflow_validation.py:
--------------------------------------------------------------------------------

```python
#!/usr/bin/env python3
"""
PrecommitWorkflow Tool Validation Test

Tests the precommit tool's capabilities using the new workflow architecture.
This validates that the workflow-based pre-commit validation provides step-by-step
analysis with proper investigation guidance and expert analysis integration.
"""

import json
from typing import Optional

from .conversation_base_test import ConversationBaseTest


class PrecommitWorkflowValidationTest(ConversationBaseTest):
    """Test precommit tool with new workflow architecture"""

    @property
    def test_name(self) -> str:
        return "precommit_validation"

    @property
    def test_description(self) -> str:
        return "PrecommitWorkflow tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Test precommit tool capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: PrecommitWorkflow tool validation (new architecture)")

            # Create test git repository structure with changes
            self._create_test_git_changes()

            # Test 1: Single validation session with multiple steps
            if not self._test_single_validation_session():
                return False

            # Test 2: Validation flow that requires refocusing
            if not self._test_validation_refocus_flow():
                return False

            # Test 3: Complete validation with expert analysis
            if not self._test_complete_validation_with_analysis():
                return False

            # Test 4: Certain confidence behavior
            if not self._test_certain_confidence():
                return False

            # Test 5: Context-aware file embedding
            if not self._test_context_aware_file_embedding():
                return False

            # Test 6: Multi-step file context optimization
            if not self._test_multi_step_file_context():
                return False

            self.logger.info("  ✅ All precommit validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"PrecommitWorkflow validation test failed: {e}")
            return False

    def _create_test_git_changes(self):
        """Create test files simulating git changes for pre-commit validation"""
        # Create a new API endpoint with potential security issues
        new_api_code = """#!/usr/bin/env python3
from flask import Flask, request, jsonify
import sqlite3
import os

app = Flask(__name__)

@app.route('/api/user/<user_id>', methods=['GET'])
def get_user(user_id):
    \"\"\"Get user information by ID\"\"\"
    # Potential SQL injection vulnerability
    conn = sqlite3.connect('users.db')
    cursor = conn.cursor()

    # BUG: Direct string interpolation creates SQL injection risk
    query = f"SELECT * FROM users WHERE id = {user_id}"
    cursor.execute(query)

    result = cursor.fetchone()
    conn.close()

    if result:
        return jsonify({
            'id': result[0],
            'username': result[1],
            'email': result[2],
            'password_hash': result[3]  # Security issue: exposing password hash
        })
    else:
        return jsonify({'error': 'User not found'}), 404

@app.route('/api/admin/users', methods=['GET'])
def list_all_users():
    \"\"\"Admin endpoint to list all users\"\"\"
    # Missing authentication check
    conn = sqlite3.connect('users.db')
    cursor = conn.cursor()
    cursor.execute("SELECT id, username, email FROM users")

    users = []
    for row in cursor.fetchall():
        users.append({
            'id': row[0],
            'username': row[1],
            'email': row[2]
        })

    conn.close()
    return jsonify(users)

if __name__ == '__main__':
    # Debug mode in production is a security risk
    app.run(debug=True, host='0.0.0.0')
"""

        # Create configuration file with issues
        config_code = """#!/usr/bin/env python3
import os

# Database configuration
DATABASE_URL = os.getenv('DATABASE_URL', 'sqlite:///users.db')

# Security settings
SECRET_KEY = "hardcoded-secret-key-123"  # Security issue: hardcoded secret
DEBUG_MODE = True  # Should be environment-based

# API settings
API_RATE_LIMIT = 1000  # Very high, no rate limiting effectively
MAX_FILE_UPLOAD = 50 * 1024 * 1024  # 50MB - quite large

# Missing important security headers configuration
CORS_ORIGINS = "*"  # Security issue: allows all origins
"""

        # Create test files
        self.api_file = self.create_additional_test_file("api_endpoints.py", new_api_code)
        self.config_file = self.create_additional_test_file("config.py", config_code)
        self.logger.info(f"  ✅ Created test files: {self.api_file}, {self.config_file}")

        # Create change description
        change_description = """COMMIT DESCRIPTION:
Added new user API endpoints and configuration for user management system.

CHANGES MADE:
- Added GET /api/user/<user_id> endpoint to retrieve user information
- Added GET /api/admin/users endpoint for admin user listing
- Added configuration file with database and security settings
- Set up Flask application with basic routing

REQUIREMENTS:
- User data should be retrievable by ID
- Admin should be able to list all users
- System should be configurable via environment variables
- Security should be properly implemented
"""

        self.changes_file = self.create_additional_test_file("commit_description.txt", change_description)
        self.logger.info(f"  ✅ Created change description: {self.changes_file}")

    def _test_single_validation_session(self) -> bool:
        """Test a complete validation session with multiple steps"""
        try:
            self.logger.info("  1.1: Testing single validation session")

            # Step 1: Start validation
            self.logger.info("    1.1.1: Step 1 - Initial validation plan")
            response1, continuation_id = self.call_mcp_tool(
                "precommit",
                {
                    "step": "I need to perform comprehensive pre-commit validation for new API endpoints. Let me start by analyzing the changes and identifying potential issues.",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "New user API endpoints and configuration added. Need to examine for security, performance, and best practices.",
                    "files_checked": [self.changes_file],
                    "relevant_files": [self.changes_file],
                    "path": self.test_dir,  # Required for step 1
                    "review_type": "full",
                    "severity_filter": "all",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to get initial validation response")
                return False

            # Parse and validate JSON response
            response1_data = self._parse_precommit_response(response1)
            if not response1_data:
                return False

            # Validate step 1 response structure - expect pause_for_validation for next_step_required=True
            if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_validation"):
                return False

            self.logger.info(f"    ✅ Step 1 successful, continuation_id: {continuation_id}")

            # Step 2: Examine the code for issues
            self.logger.info("    1.1.2: Step 2 - Code examination")
            response2, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Now examining the API endpoint implementation and configuration for security vulnerabilities and best practices violations.",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Found multiple critical security issues: SQL injection vulnerability in get_user(), hardcoded secrets in config, missing authentication, and password hash exposure.",
                    "files_checked": [self.changes_file, self.api_file, self.config_file],
                    "relevant_files": [self.api_file, self.config_file],
                    "relevant_context": ["get_user", "list_all_users"],
                    "issues_found": [
                        {"severity": "critical", "description": "SQL injection vulnerability in user lookup"},
                        {"severity": "high", "description": "Hardcoded secret key in configuration"},
                        {"severity": "high", "description": "Password hash exposed in API response"},
                        {"severity": "medium", "description": "Missing authentication on admin endpoint"},
                    ],
                    # Assessment field removed - using precommit_type instead
                    # Confidence field removed - using precommit_type instead
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue validation to step 2")
                return False

            response2_data = self._parse_precommit_response(response2)
            if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_validation"):
                return False

            # Check validation status tracking
            validation_status = response2_data.get("validation_status", {})
            if validation_status.get("files_checked", 0) < 3:
                self.logger.error("Files checked count not properly tracked")
                return False

            if validation_status.get("issues_identified", 0) != 4:
                self.logger.error("Issues found not properly tracked")
                return False

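            # Note: no precommit_type was passed in the step params above, so this
            # check assumes the tool defaults to "external" when the field is omitted.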
            if validation_status.get("precommit_type") != "external":
                self.logger.error("Precommit type not properly tracked")
                return False

            self.logger.info("    ✅ Step 2 successful with proper tracking")

            # Store continuation_id for next test
            self.validation_continuation_id = continuation_id
            return True

        except Exception as e:
            self.logger.error(f"Single validation session test failed: {e}")
            return False

    def _test_validation_refocus_flow(self) -> bool:
        """Test validation workflow that requires refocusing to revise findings"""
        try:
            self.logger.info("  1.2: Testing validation refocus workflow")

            # Start a new validation for testing refocus behavior
            self.logger.info("    1.2.1: Start validation for refocus test")
            response1, continuation_id = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Validating database connection optimization changes",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Initial analysis shows database connection pooling implementation",
                    "files_checked": ["/db/connection.py"],
                    "relevant_files": ["/db/connection.py"],
                    "path": self.test_dir,
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start refocus test validation")
                return False

            # Step 2: Wrong direction
            self.logger.info("    1.2.2: Step 2 - Wrong validation focus")
            response2, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Focusing on connection pool size optimization",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Connection pool configuration seems reasonable, might be looking in wrong place",
                    "files_checked": ["/db/connection.py", "/config/database.py"],
                    "relevant_files": [],
                    # Assessment fields removed - using precommit_type instead
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            # Step 3: Shift investigation focus
            self.logger.info("    1.2.3: Step 3 - Refocus and revise approach")
            response3, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Refocusing - the issue might not be database configuration. Let me examine the actual SQL queries and data access patterns instead.",
                    "step_number": 3,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Found inefficient N+1 query pattern in user data loading causing performance issues",
                    "files_checked": ["/models/user.py"],
                    "relevant_files": ["/models/user.py"],
                    "relevant_context": ["User.load_profile"],
                    "issues_found": [
                        {"severity": "medium", "description": "N+1 query pattern in user profile loading"}
                    ],
                    # Assessment fields removed - using precommit_type instead
                    "continuation_id": continuation_id,
                },
            )

            if not response3:
                self.logger.error("Failed to refocus")
                return False

            response3_data = self._parse_precommit_response(response3)
            if not self._validate_step_response(response3_data, 3, 4, True, "pause_for_validation"):
                return False

            self.logger.info("    ✅ Refocus flow working correctly")
            return True

        except Exception as e:
            self.logger.error(f"Refocus test failed: {e}")
            return False

    def _test_complete_validation_with_analysis(self) -> bool:
        """Test complete validation ending with expert analysis"""
        try:
            self.logger.info("  1.3: Testing complete validation with expert analysis")

            # Use the continuation from first test
            continuation_id = getattr(self, "validation_continuation_id", None)
            if not continuation_id:
                # Start fresh if no continuation available
                self.logger.info("    1.3.0: Starting fresh validation")
                response0, continuation_id = self.call_mcp_tool(
                    "precommit",
                    {
                        "step": "Validating the security fixes for API endpoints",
                        "step_number": 1,
                        "total_steps": 2,
                        "next_step_required": True,
                        "findings": "Found critical security vulnerabilities in API implementation",
                        "files_checked": [self.api_file],
                        "relevant_files": [self.api_file],
                        "relevant_context": ["get_user", "list_all_users"],
                        "issues_found": [{"severity": "critical", "description": "SQL injection vulnerability"}],
                        "path": self.test_dir,
                    },
                )
                if not response0 or not continuation_id:
                    self.logger.error("Failed to start fresh validation")
                    return False

            # Final step - trigger expert analysis
            self.logger.info("    1.3.1: Final step - complete validation")
            response_final, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Validation complete. I have identified all critical security issues and missing safeguards in the new API endpoints.",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step - triggers expert analysis
                    "findings": "Comprehensive analysis complete: SQL injection, hardcoded secrets, missing authentication, password exposure, and insecure defaults all identified with specific fixes needed.",
                    "files_checked": [self.api_file, self.config_file],
                    "relevant_files": [self.api_file, self.config_file],
                    "relevant_context": ["get_user", "list_all_users", "SECRET_KEY", "DEBUG_MODE"],
                    "issues_found": [
                        {"severity": "critical", "description": "SQL injection vulnerability in user lookup query"},
                        {"severity": "high", "description": "Hardcoded secret key exposes application security"},
                        {"severity": "high", "description": "Password hash exposed in API response"},
                        {"severity": "medium", "description": "Missing authentication on admin endpoint"},
                        {"severity": "medium", "description": "Debug mode enabled in production configuration"},
                    ],
                    # Confidence field removed - using precommit_type instead
                    "continuation_id": continuation_id,
                    "model": "flash",  # Use flash for expert analysis
                },
            )

            if not response_final:
                self.logger.error("Failed to complete validation")
                return False

            response_final_data = self._parse_precommit_response(response_final)
            if not response_final_data:
                return False

            # Validate final response structure - expect calling_expert_analysis for next_step_required=False
            if response_final_data.get("status") != "calling_expert_analysis":
                self.logger.error(
                    f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
                )
                return False

            if not response_final_data.get("validation_complete"):
                self.logger.error("Expected validation_complete=true for final step")
                return False

            # Check for expert analysis
            if "expert_analysis" not in response_final_data:
                self.logger.error("Missing expert_analysis in final response")
                return False

            expert_analysis = response_final_data.get("expert_analysis", {})

            # Check for expected analysis content (checking common patterns)
            analysis_text = json.dumps(expert_analysis, ensure_ascii=False).lower()

            # Look for security issue identification
            security_indicators = ["sql", "injection", "security", "hardcoded", "secret", "authentication"]
            found_indicators = sum(1 for indicator in security_indicators if indicator in analysis_text)

            if found_indicators >= 3:
                self.logger.info("    ✅ Expert analysis identified security issues correctly")
            else:
                self.logger.warning(
                    f"    ⚠️ Expert analysis may not have fully identified security issues (found {found_indicators}/6 indicators)"
                )

            # Check complete validation summary
            if "complete_validation" not in response_final_data:
                self.logger.error("Missing complete_validation in final response")
                return False

            complete_validation = response_final_data["complete_validation"]
            if not complete_validation.get("relevant_context"):
                self.logger.error("Missing relevant context in complete validation")
                return False

            if "get_user" not in complete_validation["relevant_context"]:
                self.logger.error("Expected function not found in validation summary")
                return False

            self.logger.info("    ✅ Complete validation with expert analysis successful")
            return True

        except Exception as e:
            self.logger.error(f"Complete validation test failed: {e}")
            return False

    def _test_certain_confidence(self) -> bool:
        """Test certain confidence behavior - should skip expert analysis"""
        try:
            self.logger.info("  1.4: Testing certain confidence behavior")

            # Test certain confidence - should skip expert analysis
            self.logger.info("    1.4.1: Certain confidence validation")
            response_certain, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "I have confirmed all security issues with 100% certainty: SQL injection, hardcoded secrets, and missing authentication.",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,  # Final step
                    "findings": "All critical issues identified: parameterized queries needed, environment variables for secrets, authentication middleware required, and debug mode must be disabled for production.",
                    "files_checked": [self.api_file, self.config_file],
                    "relevant_files": [self.api_file, self.config_file],
                    "relevant_context": ["get_user", "list_all_users"],
                    "issues_found": [
                        {
                            "severity": "critical",
                            "description": "SQL injection vulnerability - fix with parameterized queries",
                        },
                        {"severity": "high", "description": "Hardcoded secret - use environment variables"},
                        {"severity": "medium", "description": "Missing authentication - add middleware"},
                    ],
                    "precommit_type": "internal",  # This should skip expert analysis
                    "path": self.test_dir,
                    "model": "flash",
                },
            )

            if not response_certain:
                self.logger.error("Failed to test certain confidence")
                return False

            response_certain_data = self._parse_precommit_response(response_certain)
            if not response_certain_data:
                return False

            # Validate certain confidence response - should skip expert analysis
            if response_certain_data.get("status") != "validation_complete_ready_for_commit":
                self.logger.error(
                    f"Expected status 'validation_complete_ready_for_commit', got '{response_certain_data.get('status')}'"
                )
                return False

            if not response_certain_data.get("skip_expert_analysis"):
                self.logger.error("Expected skip_expert_analysis=true for certain confidence")
                return False

            expert_analysis = response_certain_data.get("expert_analysis", {})
            if expert_analysis.get("status") != "skipped_due_to_internal_analysis_type":
                self.logger.error("Expert analysis should be skipped for certain confidence")
                return False

            self.logger.info("    ✅ Certain confidence behavior working correctly")
            return True

        except Exception as e:
            self.logger.error(f"Certain confidence test failed: {e}")
            return False

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool in-process - override for precommit-specific response handling"""
        # Use in-process implementation to maintain conversation memory
        response_text, _ = self.call_mcp_tool_direct(tool_name, params)

        if not response_text:
            return None, None

        # Extract continuation_id from precommit response specifically
        continuation_id = self._extract_precommit_continuation_id(response_text)

        return response_text, continuation_id

    def _extract_precommit_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from precommit response"""
        try:
            # Parse the response
            response_data = json.loads(response_text)
            return response_data.get("continuation_id")

        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for precommit continuation_id: {e}")
            return None

    def _parse_precommit_response(self, response_text: str) -> dict:
        """Parse precommit tool JSON response"""
        try:
            # Parse the response - it should be direct JSON
            return json.loads(response_text)

        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse precommit response as JSON: {e}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}

    def _validate_step_response(
        self,
        response_data: dict,
        expected_step: int,
        expected_total: int,
        expected_next_required: bool,
        expected_status: str,
    ) -> bool:
        """Validate a precommit validation step response structure"""
        try:
            # Check status
            if response_data.get("status") != expected_status:
                self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
                return False

            # Check step number
            if response_data.get("step_number") != expected_step:
                self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
                return False

            # Check total steps
            if response_data.get("total_steps") != expected_total:
                self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
                return False

            # Check next_step_required
            if response_data.get("next_step_required") != expected_next_required:
                self.logger.error(
                    f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
                )
                return False

            # Check validation_status exists
            if "validation_status" not in response_data:
                self.logger.error("Missing validation_status in response")
                return False

            # Check next_steps guidance
            if not response_data.get("next_steps"):
                self.logger.error("Missing next_steps guidance in response")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Error validating step response: {e}")
            return False

    def _test_context_aware_file_embedding(self) -> bool:
        """Test context-aware file embedding optimization"""
        try:
            self.logger.info("  1.5: Testing context-aware file embedding")

            # Create multiple test files for context testing
            auth_file_content = """#!/usr/bin/env python3
from functools import wraps
from flask import request, jsonify

def require_auth(f):
    \"\"\"Authentication decorator\"\"\"
    @wraps(f)
    def decorated_function(*args, **kwargs):
        token = request.headers.get('Authorization')
        if not token:
            return jsonify({'error': 'No token provided'}), 401

        # Validate token here
        if not validate_token(token):
            return jsonify({'error': 'Invalid token'}), 401

        return f(*args, **kwargs)
    return decorated_function

def validate_token(token):
    \"\"\"Validate authentication token\"\"\"
    # Token validation logic
    return token.startswith('Bearer ')
"""

            middleware_file_content = """#!/usr/bin/env python3
from flask import Flask, request, g
import time

def add_security_headers(app):
    \"\"\"Add security headers to all responses\"\"\"
    @app.after_request
    def security_headers(response):
        response.headers['X-Content-Type-Options'] = 'nosniff'
        response.headers['X-Frame-Options'] = 'DENY'
        response.headers['X-XSS-Protection'] = '1; mode=block'
        return response

def rate_limiting_middleware(app):
    \"\"\"Basic rate limiting\"\"\"
    @app.before_request
    def limit_remote_addr():
        # Simple rate limiting logic
        pass
"""

            # Create test files
            auth_file = self.create_additional_test_file("auth.py", auth_file_content)
            middleware_file = self.create_additional_test_file("middleware.py", middleware_file_content)

            # Test 1: New conversation, intermediate step - should only reference files
            self.logger.info("    1.5.1: New conversation intermediate step (should reference only)")
            response1, continuation_id = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Starting validation of new authentication and security middleware",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,  # Intermediate step
                    "findings": "Initial analysis of authentication and middleware components",
                    "files_checked": [auth_file, middleware_file],
                    "relevant_files": [auth_file],  # This should be referenced, not embedded
                    "relevant_context": ["require_auth"],
                    # Assessment fields removed - using precommit_type instead
                    "path": self.test_dir,
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start context-aware file embedding test")
                return False

            response1_data = self._parse_precommit_response(response1)
            if not response1_data:
                return False

            # Check file context - should be reference_only for intermediate step
            file_context = response1_data.get("file_context", {})
            if file_context.get("type") != "reference_only":
                self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
                return False

            if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
                self.logger.error("Expected context optimization message for reference_only")
                return False

            self.logger.info("    ✅ Intermediate step correctly uses reference_only file context")

            # Test 2: Intermediate step with continuation - should still only reference
            self.logger.info("    1.5.2: Intermediate step with continuation (should reference only)")
            response2, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Continuing validation with detailed security analysis",
                    "step_number": 2,
                    "total_steps": 3,
                    "next_step_required": True,  # Still intermediate
                    "continuation_id": continuation_id,
                    "findings": "Found potential issues in token validation and missing security headers",
                    "files_checked": [auth_file, middleware_file],
                    "relevant_files": [auth_file, middleware_file],  # Both files referenced
                    "relevant_context": ["require_auth", "validate_token", "add_security_headers"],
                    "issues_found": [
                        {"severity": "medium", "description": "Basic token validation might be insufficient"}
                    ],
                    # Assessment fields removed - using precommit_type instead
                    "model": "flash",
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            response2_data = self._parse_precommit_response(response2)
            if not response2_data:
                return False

            # Check file context - should still be reference_only
            file_context2 = response2_data.get("file_context", {})
            if file_context2.get("type") != "reference_only":
                self.logger.error(f"Expected reference_only file context for step 2, got: {file_context2.get('type')}")
                return False

            # Should include reference note
            if not file_context2.get("note"):
                self.logger.error("Expected file reference note for intermediate step")
                return False

            reference_note = file_context2.get("note", "")
            if "auth.py" not in reference_note or "middleware.py" not in reference_note:
                self.logger.error("File reference note should mention both files")
                return False

            self.logger.info("    ✅ Intermediate step with continuation correctly uses reference_only")

            # Test 3: Final step - should embed files for expert analysis
            self.logger.info("    1.5.3: Final step (should embed files)")
            response3, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Validation complete - identified security gaps and improvement areas",
                    "step_number": 3,
                    "total_steps": 3,
                    "next_step_required": False,  # Final step - should embed files
                    "continuation_id": continuation_id,
                    "findings": "Security implementation has several gaps: token validation is basic, missing CSRF protection, and rate limiting is not implemented",
                    "files_checked": [auth_file, middleware_file],
                    "relevant_files": [auth_file, middleware_file],  # Should be fully embedded
                    "relevant_context": ["require_auth", "validate_token", "add_security_headers"],
                    "issues_found": [
                        {"severity": "medium", "description": "Token validation needs strengthening"},
                        {"severity": "low", "description": "Missing CSRF protection"},
                        {"severity": "low", "description": "Rate limiting not implemented"},
                    ],
                    # Assessment field removed - using precommit_type instead
                    # Confidence field removed - using precommit_type instead
                    "model": "flash",
                },
            )

            if not response3:
                self.logger.error("Failed to complete to final step")
                return False

            response3_data = self._parse_precommit_response(response3)
            if not response3_data:
                return False

            # Check file context - should be fully_embedded for final step
            file_context3 = response3_data.get("file_context", {})
            if file_context3.get("type") != "fully_embedded":
                self.logger.error(
                    f"Expected fully_embedded file context for final step, got: {file_context3.get('type')}"
                )
                return False

            if "Full file content embedded for expert analysis" not in file_context3.get("context_optimization", ""):
                self.logger.error("Expected expert analysis optimization message for fully_embedded")
                return False

            # Should show files embedded count
            files_embedded = file_context3.get("files_embedded", 0)
            if files_embedded == 0:
                # This is OK - files might already be in conversation history
                self.logger.info(
                    "    ℹ️ Files embedded count is 0 - files already in conversation history (smart deduplication)"
                )
            else:
                self.logger.info(f"    ✅ Files embedded count: {files_embedded}")

            self.logger.info("    ✅ Final step correctly uses fully_embedded file context")

            # Verify expert analysis was called for final step
            if response3_data.get("status") != "calling_expert_analysis":
                self.logger.error("Final step should trigger expert analysis")
                return False

            if "expert_analysis" not in response3_data:
                self.logger.error("Expert analysis should be present in final step")
                return False

            self.logger.info("    ✅ Context-aware file embedding test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Context-aware file embedding test failed: {e}")
            return False

    def _test_multi_step_file_context(self) -> bool:
        """Test multi-step workflow with proper file context transitions"""
        try:
            self.logger.info("  1.6: Testing multi-step file context optimization")

            # Create a complex scenario with multiple files for pre-commit validation
            database_content = """#!/usr/bin/env python3
import sqlite3
import os
from contextlib import contextmanager

class DatabaseManager:
    def __init__(self):
        self.db_path = os.getenv('DATABASE_PATH', 'app.db')

    @contextmanager
    def get_connection(self):
        \"\"\"Get database connection with proper cleanup\"\"\"
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            yield conn
        finally:
            if conn:
                conn.close()

    def create_user(self, username, email, password_hash):
        \"\"\"Create a new user\"\"\"
        with self.get_connection() as conn:
            cursor = conn.cursor()
            # Proper parameterized query
            cursor.execute(
                "INSERT INTO users (username, email, password_hash) VALUES (?, ?, ?)",
                (username, email, password_hash)
            )
            conn.commit()
            return cursor.lastrowid
"""

            tests_content = """#!/usr/bin/env python3
import unittest
from unittest.mock import patch, MagicMock
from database_manager import DatabaseManager

class TestDatabaseManager(unittest.TestCase):
    def setUp(self):
        self.db_manager = DatabaseManager()

    @patch('sqlite3.connect')
    def test_create_user(self, mock_connect):
        \"\"\"Test user creation\"\"\"
        mock_conn = MagicMock()
        mock_cursor = MagicMock()
        mock_cursor.lastrowid = 123
        mock_conn.cursor.return_value = mock_cursor
        mock_connect.return_value = mock_conn

        user_id = self.db_manager.create_user('testuser', '[email protected]', 'hashed_password')

        self.assertEqual(user_id, 123)
        mock_cursor.execute.assert_called_once_with(
            "INSERT INTO users (username, email, password_hash) VALUES (?, ?, ?)",
            ('testuser', '[email protected]', 'hashed_password')
        )

if __name__ == '__main__':
    unittest.main()
"""

            # Create test files
            db_file = self.create_additional_test_file("database_manager.py", database_content)
            test_file = self.create_additional_test_file("test_database.py", tests_content)

            # Step 1: Start validation (new conversation)
            self.logger.info("    1.6.1: Step 1 - Start validation")
            response1, continuation_id = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Validating new database manager implementation and corresponding tests",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "New database manager with connection handling and user creation functionality",
                    "files_checked": [db_file],
                    "relevant_files": [db_file],
                    "relevant_context": [],
                    # Assessment fields removed - using precommit_type instead
                    "path": self.test_dir,
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start multi-step file context test")
                return False

            response1_data = self._parse_precommit_response(response1)
            if not response1_data:
                return False

            # Validate step 1 - should use reference_only
            file_context1 = response1_data.get("file_context", {})
            if file_context1.get("type") != "reference_only":
                self.logger.error("Step 1 should use reference_only file context")
                return False

            self.logger.info("    ✅ Step 1: reference_only file context")

            # Step 2: Expand validation
            self.logger.info("    1.6.2: Step 2 - Expand validation")
            response2, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Found good database implementation - now examining test coverage",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "findings": "Database manager uses proper parameterized queries and context managers. Test file provides good coverage with mocking.",
                    "files_checked": [db_file, test_file],
                    "relevant_files": [db_file, test_file],
                    "relevant_context": ["DatabaseManager.create_user", "TestDatabaseManager.test_create_user"],
                    # Assessment fields removed - using precommit_type instead
                    "model": "flash",
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            response2_data = self._parse_precommit_response(response2)
            if not response2_data:
                return False

            # Validate step 2 - should still use reference_only
            file_context2 = response2_data.get("file_context", {})
            if file_context2.get("type") != "reference_only":
                self.logger.error("Step 2 should use reference_only file context")
                return False

            # Should reference both files
            reference_note = file_context2.get("note", "")
            if "database_manager.py" not in reference_note or "test_database.py" not in reference_note:
                self.logger.error("Step 2 should reference both files in note")
                return False

            self.logger.info("    ✅ Step 2: reference_only file context with multiple files")

            # Step 3: Deep analysis
            self.logger.info("    1.6.3: Step 3 - Deep analysis")
            response3, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Performing comprehensive security and best practices analysis",
                    "step_number": 3,
                    "total_steps": 4,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "findings": "Code follows security best practices: parameterized queries prevent SQL injection, proper resource cleanup with context managers, environment-based configuration.",
                    "files_checked": [db_file, test_file],
                    "relevant_files": [db_file, test_file],
                    "relevant_context": ["DatabaseManager.get_connection", "DatabaseManager.create_user"],
                    "issues_found": [],  # No issues found
                    # Assessment field removed - using precommit_type instead
                    # Confidence field removed - using precommit_type instead
                    "model": "flash",
                },
            )

            if not response3:
                self.logger.error("Failed to continue to step 3")
                return False

            response3_data = self._parse_precommit_response(response3)
            if not response3_data:
                return False

            # Validate step 3 - should still use reference_only
            file_context3 = response3_data.get("file_context", {})
            if file_context3.get("type") != "reference_only":
                self.logger.error("Step 3 should use reference_only file context")
                return False

            self.logger.info("    ✅ Step 3: reference_only file context")

            # Step 4: Final validation with expert consultation
            self.logger.info("    1.6.4: Step 4 - Final step with expert analysis")
            response4, _ = self.call_mcp_tool(
                "precommit",
                {
                    "step": "Validation complete - code is ready for commit",
                    "step_number": 4,
                    "total_steps": 4,
                    "next_step_required": False,  # Final step - should embed files
                    "continuation_id": continuation_id,
                    "findings": "Comprehensive validation complete: secure implementation with parameterized queries, proper resource management, good test coverage, and no security vulnerabilities identified.",
                    "files_checked": [db_file, test_file],
                    "relevant_files": [db_file, test_file],
                    "relevant_context": ["DatabaseManager", "TestDatabaseManager"],
                    "issues_found": [],
                    # Assessment field removed - using precommit_type instead
                    # Confidence field removed - using precommit_type instead
                    "model": "flash",
                },
            )

            if not response4:
                self.logger.error("Failed to complete to final step")
                return False

            response4_data = self._parse_precommit_response(response4)
            if not response4_data:
                return False

            # Validate step 4 - should use fully_embedded for expert analysis
            file_context4 = response4_data.get("file_context", {})
            if file_context4.get("type") != "fully_embedded":
                self.logger.error("Step 4 (final) should use fully_embedded file context")
                return False

            if "expert analysis" not in file_context4.get("context_optimization", "").lower():
                self.logger.error("Final step should mention expert analysis in context optimization")
                return False

            # Verify expert analysis was triggered
            if response4_data.get("status") != "calling_expert_analysis":
                self.logger.error("Final step should trigger expert analysis")
                return False

            # Check that expert analysis has file context
            expert_analysis = response4_data.get("expert_analysis", {})
            if not expert_analysis:
                self.logger.error("Expert analysis should be present in final step")
                return False

            self.logger.info("    ✅ Step 4: fully_embedded file context with expert analysis")

            # Validate the complete workflow progression
            progression_summary = {
                "step_1": "reference_only (new conversation, intermediate)",
                "step_2": "reference_only (continuation, intermediate)",
                "step_3": "reference_only (continuation, intermediate)",
                "step_4": "fully_embedded (continuation, final)",
            }

            self.logger.info("    📋 File context progression:")
            for step, context_type in progression_summary.items():
                self.logger.info(f"      {step}: {context_type}")

            self.logger.info("    ✅ Multi-step file context optimization test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Multi-step file context test failed: {e}")
            return False

```
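The tests above all assert the same simple rule for how the precommit workflow tool chooses its `file_context` type. A minimal sketch of that rule, for illustration only (the helper name is hypothetical and not part of the repository):

```python
def expected_file_context_type(next_step_required: bool) -> str:
    """Illustrative helper mirroring what the simulator tests assert.

    Intermediate steps (next_step_required=True) should yield
    file_context.type == "reference_only" (files referenced, not embedded);
    the final step (next_step_required=False) should yield "fully_embedded"
    and trigger expert analysis.
    """
    return "reference_only" if next_step_required else "fully_embedded"


# Example: the four-step workflow exercised in _test_multi_step_file_context
for step, next_required in enumerate([True, True, True, False], start=1):
    print(f"step {step}: {expected_file_context_type(next_required)}")
```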