This is page 3 of 19. Use http://codebase.md/beehiveinnovations/gemini-mcp-server?page={x} to view the full context.
# Directory Structure
```
├── .claude
│ ├── commands
│ │ └── fix-github-issue.md
│ └── settings.json
├── .coveragerc
├── .dockerignore
├── .env.example
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── ISSUE_TEMPLATE
│ │ ├── bug_report.yml
│ │ ├── config.yml
│ │ ├── documentation.yml
│ │ ├── feature_request.yml
│ │ └── tool_addition.yml
│ ├── pull_request_template.md
│ └── workflows
│ ├── docker-pr.yml
│ ├── docker-release.yml
│ ├── semantic-pr.yml
│ ├── semantic-release.yml
│ └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── AGENTS.md
├── CHANGELOG.md
├── claude_config_example.json
├── CLAUDE.md
├── clink
│ ├── __init__.py
│ ├── agents
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── claude.py
│ │ ├── codex.py
│ │ └── gemini.py
│ ├── constants.py
│ ├── models.py
│ ├── parsers
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── claude.py
│ │ ├── codex.py
│ │ └── gemini.py
│ └── registry.py
├── code_quality_checks.ps1
├── code_quality_checks.sh
├── communication_simulator_test.py
├── conf
│ ├── __init__.py
│ ├── azure_models.json
│ ├── cli_clients
│ │ ├── claude.json
│ │ ├── codex.json
│ │ └── gemini.json
│ ├── custom_models.json
│ ├── dial_models.json
│ ├── gemini_models.json
│ ├── openai_models.json
│ ├── openrouter_models.json
│ └── xai_models.json
├── config.py
├── docker
│ ├── README.md
│ └── scripts
│ ├── build.ps1
│ ├── build.sh
│ ├── deploy.ps1
│ ├── deploy.sh
│ └── healthcheck.py
├── docker-compose.yml
├── Dockerfile
├── docs
│ ├── adding_providers.md
│ ├── adding_tools.md
│ ├── advanced-usage.md
│ ├── ai_banter.md
│ ├── ai-collaboration.md
│ ├── azure_openai.md
│ ├── configuration.md
│ ├── context-revival.md
│ ├── contributions.md
│ ├── custom_models.md
│ ├── docker-deployment.md
│ ├── gemini-setup.md
│ ├── getting-started.md
│ ├── index.md
│ ├── locale-configuration.md
│ ├── logging.md
│ ├── model_ranking.md
│ ├── testing.md
│ ├── tools
│ │ ├── analyze.md
│ │ ├── apilookup.md
│ │ ├── challenge.md
│ │ ├── chat.md
│ │ ├── clink.md
│ │ ├── codereview.md
│ │ ├── consensus.md
│ │ ├── debug.md
│ │ ├── docgen.md
│ │ ├── listmodels.md
│ │ ├── planner.md
│ │ ├── precommit.md
│ │ ├── refactor.md
│ │ ├── secaudit.md
│ │ ├── testgen.md
│ │ ├── thinkdeep.md
│ │ ├── tracer.md
│ │ └── version.md
│ ├── troubleshooting.md
│ ├── vcr-testing.md
│ └── wsl-setup.md
├── examples
│ ├── claude_config_macos.json
│ └── claude_config_wsl.json
├── LICENSE
├── providers
│ ├── __init__.py
│ ├── azure_openai.py
│ ├── base.py
│ ├── custom.py
│ ├── dial.py
│ ├── gemini.py
│ ├── openai_compatible.py
│ ├── openai.py
│ ├── openrouter.py
│ ├── registries
│ │ ├── __init__.py
│ │ ├── azure.py
│ │ ├── base.py
│ │ ├── custom.py
│ │ ├── dial.py
│ │ ├── gemini.py
│ │ ├── openai.py
│ │ ├── openrouter.py
│ │ └── xai.py
│ ├── registry_provider_mixin.py
│ ├── registry.py
│ ├── shared
│ │ ├── __init__.py
│ │ ├── model_capabilities.py
│ │ ├── model_response.py
│ │ ├── provider_type.py
│ │ └── temperature.py
│ └── xai.py
├── pyproject.toml
├── pytest.ini
├── README.md
├── requirements-dev.txt
├── requirements.txt
├── run_integration_tests.ps1
├── run_integration_tests.sh
├── run-server.ps1
├── run-server.sh
├── scripts
│ └── sync_version.py
├── server.py
├── simulator_tests
│ ├── __init__.py
│ ├── base_test.py
│ ├── conversation_base_test.py
│ ├── log_utils.py
│ ├── test_analyze_validation.py
│ ├── test_basic_conversation.py
│ ├── test_chat_simple_validation.py
│ ├── test_codereview_validation.py
│ ├── test_consensus_conversation.py
│ ├── test_consensus_three_models.py
│ ├── test_consensus_workflow_accurate.py
│ ├── test_content_validation.py
│ ├── test_conversation_chain_validation.py
│ ├── test_cross_tool_comprehensive.py
│ ├── test_cross_tool_continuation.py
│ ├── test_debug_certain_confidence.py
│ ├── test_debug_validation.py
│ ├── test_line_number_validation.py
│ ├── test_logs_validation.py
│ ├── test_model_thinking_config.py
│ ├── test_o3_model_selection.py
│ ├── test_o3_pro_expensive.py
│ ├── test_ollama_custom_url.py
│ ├── test_openrouter_fallback.py
│ ├── test_openrouter_models.py
│ ├── test_per_tool_deduplication.py
│ ├── test_planner_continuation_history.py
│ ├── test_planner_validation_old.py
│ ├── test_planner_validation.py
│ ├── test_precommitworkflow_validation.py
│ ├── test_prompt_size_limit_bug.py
│ ├── test_refactor_validation.py
│ ├── test_secaudit_validation.py
│ ├── test_testgen_validation.py
│ ├── test_thinkdeep_validation.py
│ ├── test_token_allocation_validation.py
│ ├── test_vision_capability.py
│ └── test_xai_models.py
├── systemprompts
│ ├── __init__.py
│ ├── analyze_prompt.py
│ ├── chat_prompt.py
│ ├── clink
│ │ ├── codex_codereviewer.txt
│ │ ├── default_codereviewer.txt
│ │ ├── default_planner.txt
│ │ └── default.txt
│ ├── codereview_prompt.py
│ ├── consensus_prompt.py
│ ├── debug_prompt.py
│ ├── docgen_prompt.py
│ ├── generate_code_prompt.py
│ ├── planner_prompt.py
│ ├── precommit_prompt.py
│ ├── refactor_prompt.py
│ ├── secaudit_prompt.py
│ ├── testgen_prompt.py
│ ├── thinkdeep_prompt.py
│ └── tracer_prompt.py
├── tests
│ ├── __init__.py
│ ├── CASSETTE_MAINTENANCE.md
│ ├── conftest.py
│ ├── gemini_cassettes
│ │ ├── chat_codegen
│ │ │ └── gemini25_pro_calculator
│ │ │ └── mldev.json
│ │ ├── chat_cross
│ │ │ └── step1_gemini25_flash_number
│ │ │ └── mldev.json
│ │ └── consensus
│ │ └── step2_gemini25_flash_against
│ │ └── mldev.json
│ ├── http_transport_recorder.py
│ ├── mock_helpers.py
│ ├── openai_cassettes
│ │ ├── chat_cross_step2_gpt5_reminder.json
│ │ ├── chat_gpt5_continuation.json
│ │ ├── chat_gpt5_moon_distance.json
│ │ ├── consensus_step1_gpt5_for.json
│ │ └── o3_pro_basic_math.json
│ ├── pii_sanitizer.py
│ ├── sanitize_cassettes.py
│ ├── test_alias_target_restrictions.py
│ ├── test_auto_mode_comprehensive.py
│ ├── test_auto_mode_custom_provider_only.py
│ ├── test_auto_mode_model_listing.py
│ ├── test_auto_mode_provider_selection.py
│ ├── test_auto_mode.py
│ ├── test_auto_model_planner_fix.py
│ ├── test_azure_openai_provider.py
│ ├── test_buggy_behavior_prevention.py
│ ├── test_cassette_semantic_matching.py
│ ├── test_challenge.py
│ ├── test_chat_codegen_integration.py
│ ├── test_chat_cross_model_continuation.py
│ ├── test_chat_openai_integration.py
│ ├── test_chat_simple.py
│ ├── test_clink_claude_agent.py
│ ├── test_clink_claude_parser.py
│ ├── test_clink_codex_agent.py
│ ├── test_clink_gemini_agent.py
│ ├── test_clink_gemini_parser.py
│ ├── test_clink_integration.py
│ ├── test_clink_parsers.py
│ ├── test_clink_tool.py
│ ├── test_collaboration.py
│ ├── test_config.py
│ ├── test_consensus_integration.py
│ ├── test_consensus_schema.py
│ ├── test_consensus.py
│ ├── test_conversation_continuation_integration.py
│ ├── test_conversation_field_mapping.py
│ ├── test_conversation_file_features.py
│ ├── test_conversation_memory.py
│ ├── test_conversation_missing_files.py
│ ├── test_custom_openai_temperature_fix.py
│ ├── test_custom_provider.py
│ ├── test_debug.py
│ ├── test_deploy_scripts.py
│ ├── test_dial_provider.py
│ ├── test_directory_expansion_tracking.py
│ ├── test_disabled_tools.py
│ ├── test_docker_claude_desktop_integration.py
│ ├── test_docker_config_complete.py
│ ├── test_docker_healthcheck.py
│ ├── test_docker_implementation.py
│ ├── test_docker_mcp_validation.py
│ ├── test_docker_security.py
│ ├── test_docker_volume_persistence.py
│ ├── test_file_protection.py
│ ├── test_gemini_token_usage.py
│ ├── test_image_support_integration.py
│ ├── test_image_validation.py
│ ├── test_integration_utf8.py
│ ├── test_intelligent_fallback.py
│ ├── test_issue_245_simple.py
│ ├── test_large_prompt_handling.py
│ ├── test_line_numbers_integration.py
│ ├── test_listmodels_restrictions.py
│ ├── test_listmodels.py
│ ├── test_mcp_error_handling.py
│ ├── test_model_enumeration.py
│ ├── test_model_metadata_continuation.py
│ ├── test_model_resolution_bug.py
│ ├── test_model_restrictions.py
│ ├── test_o3_pro_output_text_fix.py
│ ├── test_o3_temperature_fix_simple.py
│ ├── test_openai_compatible_token_usage.py
│ ├── test_openai_provider.py
│ ├── test_openrouter_provider.py
│ ├── test_openrouter_registry.py
│ ├── test_parse_model_option.py
│ ├── test_per_tool_model_defaults.py
│ ├── test_pii_sanitizer.py
│ ├── test_pip_detection_fix.py
│ ├── test_planner.py
│ ├── test_precommit_workflow.py
│ ├── test_prompt_regression.py
│ ├── test_prompt_size_limit_bug_fix.py
│ ├── test_provider_retry_logic.py
│ ├── test_provider_routing_bugs.py
│ ├── test_provider_utf8.py
│ ├── test_providers.py
│ ├── test_rate_limit_patterns.py
│ ├── test_refactor.py
│ ├── test_secaudit.py
│ ├── test_server.py
│ ├── test_supported_models_aliases.py
│ ├── test_thinking_modes.py
│ ├── test_tools.py
│ ├── test_tracer.py
│ ├── test_utf8_localization.py
│ ├── test_utils.py
│ ├── test_uvx_resource_packaging.py
│ ├── test_uvx_support.py
│ ├── test_workflow_file_embedding.py
│ ├── test_workflow_metadata.py
│ ├── test_workflow_prompt_size_validation_simple.py
│ ├── test_workflow_utf8.py
│ ├── test_xai_provider.py
│ ├── transport_helpers.py
│ └── triangle.png
├── tools
│ ├── __init__.py
│ ├── analyze.py
│ ├── apilookup.py
│ ├── challenge.py
│ ├── chat.py
│ ├── clink.py
│ ├── codereview.py
│ ├── consensus.py
│ ├── debug.py
│ ├── docgen.py
│ ├── listmodels.py
│ ├── models.py
│ ├── planner.py
│ ├── precommit.py
│ ├── refactor.py
│ ├── secaudit.py
│ ├── shared
│ │ ├── __init__.py
│ │ ├── base_models.py
│ │ ├── base_tool.py
│ │ ├── exceptions.py
│ │ └── schema_builders.py
│ ├── simple
│ │ ├── __init__.py
│ │ └── base.py
│ ├── testgen.py
│ ├── thinkdeep.py
│ ├── tracer.py
│ ├── version.py
│ └── workflow
│ ├── __init__.py
│ ├── base.py
│ ├── schema_builders.py
│ └── workflow_mixin.py
├── utils
│ ├── __init__.py
│ ├── client_info.py
│ ├── conversation_memory.py
│ ├── env.py
│ ├── file_types.py
│ ├── file_utils.py
│ ├── image_utils.py
│ ├── model_context.py
│ ├── model_restrictions.py
│ ├── security_config.py
│ ├── storage_backend.py
│ └── token_utils.py
└── zen-mcp-server
```
# Files
--------------------------------------------------------------------------------
/tests/test_clink_tool.py:
--------------------------------------------------------------------------------
```python
import json
import pytest
from clink import get_registry
from clink.agents import AgentOutput
from clink.parsers.base import ParsedCLIResponse
from tools.clink import MAX_RESPONSE_CHARS, CLinkTool
@pytest.mark.asyncio
async def test_clink_tool_execute(monkeypatch):
tool = CLinkTool()
async def fake_run(**kwargs):
return AgentOutput(
parsed=ParsedCLIResponse(content="Hello from Gemini", metadata={"model_used": "gemini-2.5-pro"}),
sanitized_command=["gemini", "-o", "json"],
returncode=0,
stdout='{"response": "Hello from Gemini"}',
stderr="",
duration_seconds=0.42,
parser_name="gemini_json",
output_file_content=None,
)
class DummyAgent:
async def run(self, **kwargs):
return await fake_run(**kwargs)
def fake_create_agent(client):
return DummyAgent()
monkeypatch.setattr("tools.clink.create_agent", fake_create_agent)
arguments = {
"prompt": "Summarize the project",
"cli_name": "gemini",
"role": "default",
"absolute_file_paths": [],
"images": [],
}
results = await tool.execute(arguments)
assert len(results) == 1
payload = json.loads(results[0].text)
assert payload["status"] in {"success", "continuation_available"}
assert "Hello from Gemini" in payload["content"]
metadata = payload.get("metadata", {})
assert metadata.get("cli_name") == "gemini"
assert metadata.get("command") == ["gemini", "-o", "json"]
def test_registry_lists_roles():
registry = get_registry()
clients = registry.list_clients()
assert {"codex", "gemini"}.issubset(set(clients))
roles = registry.list_roles("gemini")
assert "default" in roles
assert "default" in registry.list_roles("codex")
@pytest.mark.asyncio
async def test_clink_tool_defaults_to_first_cli(monkeypatch):
tool = CLinkTool()
async def fake_run(**kwargs):
return AgentOutput(
parsed=ParsedCLIResponse(content="Default CLI response", metadata={"events": ["foo"]}),
sanitized_command=["gemini"],
returncode=0,
stdout='{"response": "Default CLI response"}',
stderr="",
duration_seconds=0.1,
parser_name="gemini_json",
output_file_content=None,
)
class DummyAgent:
async def run(self, **kwargs):
return await fake_run(**kwargs)
monkeypatch.setattr("tools.clink.create_agent", lambda client: DummyAgent())
arguments = {
"prompt": "Hello",
"absolute_file_paths": [],
"images": [],
}
result = await tool.execute(arguments)
payload = json.loads(result[0].text)
metadata = payload.get("metadata", {})
assert metadata.get("cli_name") == tool._default_cli_name
assert metadata.get("events_removed_for_normal") is True
@pytest.mark.asyncio
async def test_clink_tool_truncates_large_output(monkeypatch):
tool = CLinkTool()
summary_section = "<SUMMARY>This is the condensed summary.</SUMMARY>"
long_text = "A" * (MAX_RESPONSE_CHARS + 500) + summary_section
async def fake_run(**kwargs):
return AgentOutput(
parsed=ParsedCLIResponse(content=long_text, metadata={"events": ["event1", "event2"]}),
sanitized_command=["codex"],
returncode=0,
stdout="{}",
stderr="",
duration_seconds=0.2,
parser_name="codex_jsonl",
output_file_content=None,
)
class DummyAgent:
async def run(self, **kwargs):
return await fake_run(**kwargs)
monkeypatch.setattr("tools.clink.create_agent", lambda client: DummyAgent())
arguments = {
"prompt": "Summarize",
"cli_name": tool._default_cli_name,
"absolute_file_paths": [],
"images": [],
}
result = await tool.execute(arguments)
payload = json.loads(result[0].text)
assert payload["status"] in {"success", "continuation_available"}
assert payload["content"].strip() == "This is the condensed summary."
metadata = payload.get("metadata", {})
assert metadata.get("output_summarized") is True
assert metadata.get("events_removed_for_normal") is True
assert metadata.get("output_original_length") == len(long_text)
@pytest.mark.asyncio
async def test_clink_tool_truncates_without_summary(monkeypatch):
tool = CLinkTool()
long_text = "B" * (MAX_RESPONSE_CHARS + 1000)
async def fake_run(**kwargs):
return AgentOutput(
parsed=ParsedCLIResponse(content=long_text, metadata={"events": ["event"]}),
sanitized_command=["codex"],
returncode=0,
stdout="{}",
stderr="",
duration_seconds=0.2,
parser_name="codex_jsonl",
output_file_content=None,
)
class DummyAgent:
async def run(self, **kwargs):
return await fake_run(**kwargs)
monkeypatch.setattr("tools.clink.create_agent", lambda client: DummyAgent())
arguments = {
"prompt": "Summarize",
"cli_name": tool._default_cli_name,
"absolute_file_paths": [],
"images": [],
}
result = await tool.execute(arguments)
payload = json.loads(result[0].text)
assert payload["status"] in {"success", "continuation_available"}
assert "exceeding the configured clink limit" in payload["content"]
metadata = payload.get("metadata", {})
assert metadata.get("output_truncated") is True
assert metadata.get("events_removed_for_normal") is True
assert metadata.get("output_original_length") == len(long_text)
```
--------------------------------------------------------------------------------
/tests/test_pii_sanitizer.py:
--------------------------------------------------------------------------------
```python
#!/usr/bin/env python3
"""Test cases for PII sanitizer."""
import unittest
from .pii_sanitizer import PIIPattern, PIISanitizer
class TestPIISanitizer(unittest.TestCase):
"""Test PII sanitization functionality."""
def setUp(self):
"""Set up test sanitizer."""
self.sanitizer = PIISanitizer()
def test_api_key_sanitization(self):
"""Test various API key formats are sanitized."""
test_cases = [
# OpenAI keys
("sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12", "sk-proj-SANITIZED"),
("sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN", "sk-SANITIZED"),
# Anthropic keys
("sk-ant-abcd1234567890ABCD1234567890abcd1234567890ABCD12", "sk-ant-SANITIZED"),
# Google keys
("AIzaSyD-1234567890abcdefghijklmnopqrstuv", "AIza-SANITIZED"),
# GitHub tokens
("ghp_1234567890abcdefghijklmnopqrstuvwxyz", "gh_SANITIZED"),
("ghs_1234567890abcdefghijklmnopqrstuvwxyz", "gh_SANITIZED"),
]
for original, expected in test_cases:
with self.subTest(original=original):
result = self.sanitizer.sanitize_string(original)
self.assertEqual(result, expected)
def test_personal_info_sanitization(self):
"""Test personal information is sanitized."""
test_cases = [
# Email addresses
("[email protected]", "[email protected]"),
("[email protected]", "[email protected]"),
# Phone numbers (all now use the same pattern)
("(555) 123-4567", "(XXX) XXX-XXXX"),
("555-123-4567", "(XXX) XXX-XXXX"),
("+1-555-123-4567", "(XXX) XXX-XXXX"),
# SSN
("123-45-6789", "XXX-XX-XXXX"),
# Credit card
("1234 5678 9012 3456", "XXXX-XXXX-XXXX-XXXX"),
("1234-5678-9012-3456", "XXXX-XXXX-XXXX-XXXX"),
]
for original, expected in test_cases:
with self.subTest(original=original):
result = self.sanitizer.sanitize_string(original)
self.assertEqual(result, expected)
def test_header_sanitization(self):
"""Test HTTP header sanitization."""
headers = {
"Authorization": "Bearer sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
"API-Key": "sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN",
"Content-Type": "application/json",
"User-Agent": "MyApp/1.0",
"Cookie": "session=abc123; [email protected]",
}
sanitized = self.sanitizer.sanitize_headers(headers)
self.assertEqual(sanitized["Authorization"], "Bearer SANITIZED")
self.assertEqual(sanitized["API-Key"], "sk-SANITIZED")
self.assertEqual(sanitized["Content-Type"], "application/json")
self.assertEqual(sanitized["User-Agent"], "MyApp/1.0")
self.assertIn("[email protected]", sanitized["Cookie"])
def test_nested_structure_sanitization(self):
"""Test sanitization of nested data structures."""
data = {
"user": {
"email": "[email protected]",
"api_key": "sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
},
"tokens": [
"ghp_1234567890abcdefghijklmnopqrstuvwxyz",
"Bearer sk-ant-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
],
"metadata": {"ip": "192.168.1.100", "phone": "(555) 123-4567"},
}
sanitized = self.sanitizer.sanitize_value(data)
self.assertEqual(sanitized["user"]["email"], "[email protected]")
self.assertEqual(sanitized["user"]["api_key"], "sk-proj-SANITIZED")
self.assertEqual(sanitized["tokens"][0], "gh_SANITIZED")
self.assertEqual(sanitized["tokens"][1], "Bearer sk-ant-SANITIZED")
self.assertEqual(sanitized["metadata"]["ip"], "0.0.0.0")
self.assertEqual(sanitized["metadata"]["phone"], "(XXX) XXX-XXXX")
def test_url_sanitization(self):
"""Test URL parameter sanitization."""
urls = [
(
"https://api.example.com/v1/users?api_key=sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN",
"https://api.example.com/v1/users?api_key=SANITIZED",
),
(
"https://example.com/login?token=ghp_1234567890abcdefghijklmnopqrstuvwxyz&user=test",
"https://example.com/login?token=SANITIZED&user=test",
),
]
for original, expected in urls:
with self.subTest(url=original):
result = self.sanitizer.sanitize_url(original)
self.assertEqual(result, expected)
def test_disable_sanitization(self):
"""Test that sanitization can be disabled."""
self.sanitizer.sanitize_enabled = False
sensitive_data = "sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12"
result = self.sanitizer.sanitize_string(sensitive_data)
# Should return original when disabled
self.assertEqual(result, sensitive_data)
def test_custom_pattern(self):
"""Test adding custom PII patterns."""
# Add custom pattern for internal employee IDs
custom_pattern = PIIPattern.create(
name="employee_id", pattern=r"EMP\d{6}", replacement="EMP-REDACTED", description="Internal employee IDs"
)
self.sanitizer.add_pattern(custom_pattern)
text = "Employee EMP123456 has access to the system"
result = self.sanitizer.sanitize_string(text)
self.assertEqual(result, "Employee EMP-REDACTED has access to the system")
if __name__ == "__main__":
unittest.main()
```
--------------------------------------------------------------------------------
/tests/test_rate_limit_patterns.py:
--------------------------------------------------------------------------------
```python
"""
Test to verify structured error code-based retry logic.
"""
from providers.gemini import GeminiModelProvider
from providers.openai import OpenAIModelProvider
def test_openai_structured_error_retry_logic():
"""Test OpenAI provider's structured error code retry logic."""
provider = OpenAIModelProvider(api_key="test-key")
# Test structured token-related 429 error (should NOT be retried)
class MockTokenError(Exception):
def __init__(self):
# Simulate the actual error format from OpenAI API
self.args = (
"Error code: 429 - {'error': {'message': 'Request too large for o3', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",
)
token_error = MockTokenError()
assert not provider._is_error_retryable(token_error), "Token-related 429 should not be retryable"
# Test standard rate limiting 429 error (should be retried)
class MockRateLimitError(Exception):
def __init__(self):
self.args = (
"Error code: 429 - {'error': {'message': 'Too many requests', 'type': 'requests', 'code': 'rate_limit_exceeded'}}",
)
rate_limit_error = MockRateLimitError()
assert provider._is_error_retryable(rate_limit_error), "Request rate limiting should be retryable"
# Test context length error (should NOT be retried)
class MockContextError(Exception):
def __init__(self):
self.args = (
"Error code: 429 - {'error': {'message': 'Context length exceeded', 'code': 'context_length_exceeded'}}",
)
context_error = MockContextError()
assert not provider._is_error_retryable(context_error), "Context length errors should not be retryable"
def test_gemini_structured_error_retry_logic():
"""Test Gemini provider's structured error code retry logic."""
provider = GeminiModelProvider(api_key="test-key")
# Test quota exceeded error (should NOT be retried)
class MockQuotaError(Exception):
def __init__(self):
self.args = ("429 Resource exhausted: Quota exceeded for model",)
self.details = "quota_exceeded"
quota_error = MockQuotaError()
assert not provider._is_error_retryable(quota_error), "Quota exceeded should not be retryable"
# Test resource exhausted error (should NOT be retried)
class MockResourceError(Exception):
def __init__(self):
self.args = ("429 Resource exhausted: Token limit exceeded",)
resource_error = MockResourceError()
assert not provider._is_error_retryable(resource_error), "Resource exhausted should not be retryable"
# Test temporary rate limiting (should be retried)
class MockTempError(Exception):
def __init__(self):
self.args = ("429 Too many requests, please try again later",)
temp_error = MockTempError()
assert provider._is_error_retryable(temp_error), "Temporary rate limiting should be retryable"
def test_actual_log_error_from_issue_with_structured_parsing():
"""Test the specific error from the user's log using structured parsing."""
provider = OpenAIModelProvider(api_key="test-key")
# Create the exact error from the user's log
class MockUserLogError(Exception):
def __init__(self):
# This is the exact error message from the user's issue
self.args = (
"Error code: 429 - {'error': {'message': 'Request too large for o3 in organization org-MWp466of2XGyS90J8huQk4R6 on tokens per min (TPM): Limit 30000, Requested 31756. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}",
)
user_error = MockUserLogError()
# This specific error should NOT be retryable because it has type='tokens'
assert not provider._is_error_retryable(user_error), "The user's specific error should be non-retryable"
def test_non_429_errors_still_work():
"""Test that non-429 errors are still handled correctly."""
provider = OpenAIModelProvider(api_key="test-key")
# Test retryable non-429 errors
class MockTimeoutError(Exception):
def __init__(self):
self.args = ("Connection timeout",)
timeout_error = MockTimeoutError()
assert provider._is_error_retryable(timeout_error), "Timeout errors should be retryable"
class Mock500Error(Exception):
def __init__(self):
self.args = ("500 Internal Server Error",)
server_error = Mock500Error()
assert provider._is_error_retryable(server_error), "500 errors should be retryable"
# Test non-retryable non-429 errors
class MockAuthError(Exception):
def __init__(self):
self.args = ("401 Unauthorized",)
auth_error = MockAuthError()
assert not provider._is_error_retryable(auth_error), "Auth errors should not be retryable"
def test_edge_cases_and_fallbacks():
"""Test edge cases and fallback behavior."""
provider = OpenAIModelProvider(api_key="test-key")
# Test malformed JSON in error (should fall back gracefully)
class MockMalformedError(Exception):
def __init__(self):
self.args = ("Error code: 429 - {invalid json}",)
malformed_error = MockMalformedError()
# Should still be retryable since it's a 429 without clear non-retryable indicators
assert provider._is_error_retryable(malformed_error), "Malformed 429 errors should default to retryable"
# Test 429 without structured data (should be retryable by default)
class MockSimple429Error(Exception):
def __init__(self):
self.args = ("429 Too Many Requests",)
simple_429_error = MockSimple429Error()
assert provider._is_error_retryable(simple_429_error), "Simple 429 without type info should be retryable"
```
--------------------------------------------------------------------------------
/tests/test_docker_volume_persistence.py:
--------------------------------------------------------------------------------
```python
"""
Tests for Docker volume persistence functionality
"""
import json
import os
import subprocess
from pathlib import Path
from unittest.mock import patch
import pytest
class TestDockerVolumePersistence:
"""Test Docker volume persistence for configuration and logs"""
@pytest.fixture(autouse=True)
def setup(self):
"""Setup for each test"""
self.project_root = Path(__file__).parent.parent
self.docker_compose_path = self.project_root / "docker-compose.yml"
def test_docker_compose_volumes_configuration(self):
"""Test that docker-compose.yml has proper volume configuration"""
if not self.docker_compose_path.exists():
pytest.skip("docker-compose.yml not found")
content = self.docker_compose_path.read_text()
# Check for named volume definition
assert "zen-mcp-config:" in content, "zen-mcp-config volume must be defined"
assert "driver: local" in content, "Named volume must use local driver"
# Check for volume mounts in service
assert "./logs:/app/logs" in content, "Logs volume mount required"
assert "zen-mcp-config:/app/conf" in content, "Config volume mount required"
def test_persistent_volume_creation(self):
"""Test that persistent volumes are created correctly"""
# This test checks that the volume configuration is valid
# In a real environment, you might want to test actual volume creation
volume_name = "zen-mcp-config"
# Mock Docker command to check volume exists
with patch("subprocess.run") as mock_run:
mock_run.return_value.returncode = 0
mock_run.return_value.stdout = f"{volume_name}\n"
# Simulate docker volume ls command
result = subprocess.run(["docker", "volume", "ls", "--format", "{{.Name}}"], capture_output=True, text=True)
assert volume_name in result.stdout
def test_configuration_persistence_between_runs(self):
"""Test that configuration persists between container runs"""
# This is a conceptual test - in practice you'd need a real Docker environment
config_data = {"test_key": "test_value", "persistent": True}
# Simulate writing config to persistent volume
with patch("json.dump") as mock_dump:
json.dump(config_data, mock_dump)
# Simulate container restart and config retrieval
with patch("json.load") as mock_load:
mock_load.return_value = config_data
loaded_config = json.load(mock_load)
assert loaded_config == config_data
assert loaded_config["persistent"] is True
def test_log_persistence_configuration(self):
"""Test that log persistence is properly configured"""
log_mount = "./logs:/app/logs"
if self.docker_compose_path.exists():
content = self.docker_compose_path.read_text()
assert log_mount in content, f"Log mount {log_mount} must be configured"
def test_volume_backup_restore_capability(self):
"""Test that volumes can be backed up and restored"""
# Test backup command structure
backup_cmd = [
"docker",
"run",
"--rm",
"-v",
"zen-mcp-config:/data",
"-v",
"$(pwd):/backup",
"alpine",
"tar",
"czf",
"/backup/config-backup.tar.gz",
"-C",
"/data",
".",
]
# Verify command structure is valid
assert "zen-mcp-config:/data" in backup_cmd
assert "tar" in backup_cmd
assert "czf" in backup_cmd
def test_volume_permissions(self):
"""Test that volume permissions are properly set"""
# Check that logs directory has correct permissions
logs_dir = self.project_root / "logs"
if logs_dir.exists():
# Check that directory is writable
assert os.access(logs_dir, os.W_OK), "Logs directory must be writable"
# Test creating a temporary file
test_file = logs_dir / "test_write_permission.tmp"
try:
test_file.write_text("test")
assert test_file.exists()
finally:
if test_file.exists():
test_file.unlink()
class TestDockerVolumeIntegration:
"""Integration tests for Docker volumes with MCP functionality"""
def test_mcp_config_persistence(self):
"""Test that MCP configuration persists in named volume"""
mcp_config = {"models": ["gemini-2.0-flash", "gpt-4"], "default_model": "auto", "thinking_mode": "high"}
# Test config serialization/deserialization
config_str = json.dumps(mcp_config)
loaded_config = json.loads(config_str)
assert loaded_config == mcp_config
assert "models" in loaded_config
def test_docker_compose_run_volume_usage(self):
"""Test that docker-compose run uses volumes correctly"""
# Verify that docker-compose run inherits volume configuration
# This is more of a configuration validation test
compose_run_cmd = ["docker-compose", "run", "--rm", "zen-mcp"]
# The command should work with the existing volume configuration
assert "docker-compose" in compose_run_cmd
assert "run" in compose_run_cmd
assert "--rm" in compose_run_cmd
def test_volume_data_isolation(self):
"""Test that different container instances share volume data correctly"""
shared_data = {"instance_count": 0, "shared_state": "active"}
# Simulate multiple container instances accessing shared volume
for _ in range(3):
shared_data["instance_count"] += 1
assert shared_data["shared_state"] == "active"
assert shared_data["instance_count"] == 3
```
--------------------------------------------------------------------------------
/tests/test_docker_mcp_validation.py:
--------------------------------------------------------------------------------
```python
"""
Validation test for Docker MCP implementation
"""
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
class TestDockerMCPValidation:
"""Validation tests for Docker MCP"""
@pytest.fixture(autouse=True)
def setup(self):
"""Automatic setup for each test"""
self.project_root = Path(__file__).parent.parent
self.dockerfile_path = self.project_root / "Dockerfile"
def test_dockerfile_exists_and_valid(self):
"""Test Dockerfile existence and validity"""
assert self.dockerfile_path.exists(), "Missing Dockerfile"
content = self.dockerfile_path.read_text()
assert "FROM python:" in content, "Python base required"
assert "server.py" in content, "server.py must be copied"
@patch("subprocess.run")
def test_docker_command_validation(self, mock_run):
"""Test Docker command validation"""
mock_run.return_value.returncode = 0
# Standard Docker MCP command
cmd = ["docker", "run", "--rm", "-i", "--env-file", ".env", "zen-mcp-server:latest", "python", "server.py"]
subprocess.run(cmd, capture_output=True)
mock_run.assert_called_once_with(cmd, capture_output=True)
def test_environment_variables_validation(self):
"""Test environment variables validation"""
required_vars = ["GEMINI_API_KEY", "OPENAI_API_KEY", "XAI_API_KEY"]
# Test with variable present
with patch.dict(os.environ, {"GEMINI_API_KEY": "test"}):
has_key = any(os.getenv(var) for var in required_vars)
assert has_key, "At least one API key required"
# Test without variables
with patch.dict(os.environ, {}, clear=True):
has_key = any(os.getenv(var) for var in required_vars)
assert not has_key, "No key should be present"
def test_docker_security_configuration(self):
"""Test Docker security configuration"""
if not self.dockerfile_path.exists():
pytest.skip("Dockerfile not found")
content = self.dockerfile_path.read_text()
# Check non-root user
has_user_config = "USER " in content or "useradd" in content or "adduser" in content
# Note: The test can be adjusted according to implementation
if has_user_config:
assert True, "User configuration found"
else:
# Warning instead of failure for flexibility
pytest.warns(UserWarning, "Consider adding a non-root user")
class TestDockerIntegration:
"""Docker-MCP integration tests"""
@pytest.fixture
def temp_env_file(self):
"""Fixture for temporary .env file"""
content = """GEMINI_API_KEY=test_key
LOG_LEVEL=INFO
DEFAULT_MODEL=auto
"""
with tempfile.NamedTemporaryFile(mode="w", suffix=".env", delete=False, encoding="utf-8") as f:
f.write(content)
temp_file_path = f.name
# File is now closed, can yield
yield temp_file_path
os.unlink(temp_file_path)
def test_env_file_parsing(self, temp_env_file):
"""Test .env file parsing"""
env_vars = {}
with open(temp_env_file, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line and not line.startswith("#") and "=" in line:
key, value = line.split("=", 1)
env_vars[key] = value
assert "GEMINI_API_KEY" in env_vars
assert env_vars["GEMINI_API_KEY"] == "test_key"
assert env_vars["LOG_LEVEL"] == "INFO"
def test_mcp_message_structure(self):
"""Test MCP message structure"""
message = {"jsonrpc": "2.0", "method": "initialize", "params": {}, "id": 1}
# Check JSON serialization
json_str = json.dumps(message)
parsed = json.loads(json_str)
assert parsed["jsonrpc"] == "2.0"
assert "method" in parsed
assert "id" in parsed
class TestDockerPerformance:
"""Docker performance tests"""
def test_image_size_expectation(self):
"""Test expected image size"""
# Maximum expected size (in MB)
max_size_mb = 500
# Simulation - in reality, Docker would be queried
simulated_size = 294 # MB observed
assert simulated_size <= max_size_mb, f"Image too large: {simulated_size}MB > {max_size_mb}MB"
def test_startup_performance(self):
"""Test startup performance"""
max_startup_seconds = 10
simulated_startup = 3 # seconds
assert simulated_startup <= max_startup_seconds, f"Startup too slow: {simulated_startup}s"
@pytest.mark.integration
class TestFullIntegration:
"""Full integration tests"""
def test_complete_setup_simulation(self):
"""Simulate complete setup"""
# Simulate all required components
components = {
"dockerfile": True,
"mcp_config": True,
"env_template": True,
"documentation": True,
}
# Check that all components are present
missing = [k for k, v in components.items() if not v]
assert not missing, f"Missing components: {missing}"
def test_docker_mcp_workflow(self):
"""Test complete Docker-MCP workflow"""
# Workflow steps
workflow_steps = [
"build_image",
"create_env_file",
"configure_mcp_json",
"test_docker_run",
"validate_mcp_communication",
]
# Simulate each step
for step in workflow_steps:
# In reality, each step would be tested individually
assert step is not None, f"Step {step} not defined"
if __name__ == "__main__":
# Run tests with pytest
pytest.main([__file__, "-v"])
```
--------------------------------------------------------------------------------
/docs/ai-collaboration.md:
--------------------------------------------------------------------------------
```markdown
# AI-to-AI Conversation Threading
This server enables **true AI collaboration** between Claude and multiple AI models (Gemini, O3), where they can coordinate and question each other's approaches for enhanced problem-solving and analysis.
## How It Works
**Multi-Model Coordination:**
- **Gemini can ask Claude follow-up questions** to clarify requirements or gather more context
- **Claude can respond** with additional information, files, or refined instructions
- **Claude can work independently** between exchanges - implementing solutions, gathering data, or performing analysis
- **Claude can return to Gemini** with progress updates and new context for further collaboration
- **Cross-tool continuation** - Start with one tool (e.g., `analyze`) and continue with another (e.g., `codereview`) using the same conversation thread, as sketched below
- **Both AIs coordinate their approaches** - questioning assumptions, validating solutions, and building on each other's insights
- Each conversation maintains full context while only sending incremental updates
- Conversations are automatically managed in memory for the session duration
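The mechanics are easiest to see as tool arguments. Below is a minimal, hypothetical sketch (the argument names mirror the patterns used in this repository's tests; the values and continuation ID are invented) of how a second tool continues a thread started by the first:
```python
# Hypothetical illustration only: argument names follow the patterns used in
# this repo's tests (prompt, model, continuation_id); the values are made up.

# First call: Claude asks Gemini Pro to analyze a file via the `analyze` tool.
analyze_args = {
    "prompt": "Analyze /src/auth.py for security issues",
    "model": "gemini-2.5-pro",
}
# The tool's response includes a continuation ID for the newly created thread,
# e.g. continuation_id = "abc123-def456-ghi789".

# Follow-up call: a different tool (e.g. `codereview`) continues the same
# thread by passing that continuation_id, so only the new prompt is sent.
codereview_args = {
    "prompt": "Review the authentication logic thoroughly",
    "model": "o3",
    "continuation_id": "abc123-def456-ghi789",
}
```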
## Example: Multi-Model AI Coordination
**Prompt:** "Debate SwiftUI vs UIKit - which is better for iOS development?"
**The Conversation Flow:**
1. **You:** "Debate SwiftUI vs UIKit - which is better for iOS development?"
2. **Claude (auto mode):** "I'll orchestrate a debate between different models for diverse perspectives."
3. **Gemini Pro:** "From an architectural standpoint, SwiftUI's declarative paradigm and state management make it superior for maintainable, modern apps."
4. **O3:** "Logically analyzing the trade-offs: UIKit offers 15+ years of stability, complete control, and proven scalability. SwiftUI has <5 years maturity with ongoing breaking changes."
5. **Claude:** "Let me get Flash's quick take on developer experience..."
6. **Gemini Flash:** "SwiftUI = faster development, less code, better previews. UIKit = more control, better debugging, stable APIs."
7. **Claude's synthesis:** "Based on the multi-model analysis: Use SwiftUI for new projects prioritizing development speed, UIKit for apps requiring fine control or supporting older iOS versions."
## Asynchronous Workflow Capabilities
**Independent Work Between Exchanges:**
- Claude can work independently between exchanges (analyzing code, implementing fixes, gathering data)
- Return to Gemini with progress updates and additional context
- Each exchange shares only incremental information while maintaining full conversation history
- Automatically bypasses MCP's 25K token limits through incremental updates
## Enhanced Collaboration Features
**Advanced Coordination Capabilities:**
- **Cross-questioning**: AIs can challenge each other's assumptions and approaches
- **Coordinated problem-solving**: Each AI contributes their strengths to complex problems
- **Context building**: Claude gathers information while Gemini provides deep analysis
- **Approach validation**: AIs can verify and improve each other's solutions
- **Cross-tool continuation**: Seamlessly continue conversations across different tools while preserving all context
- **Asynchronous workflow**: Conversations don't need to be sequential - Claude can work on tasks between exchanges, then return to Gemini with additional context and progress updates
- **Incremental updates**: Share only new information in each exchange while maintaining full conversation history
- **Automatic 25K limit bypass**: Each exchange sends only incremental context, allowing unlimited total conversation size
## Technical Configuration
**Conversation Management:**
- Up to 10 exchanges per conversation (configurable via `MAX_CONVERSATION_TURNS`)
- 3-hour expiry (configurable via `CONVERSATION_TIMEOUT_HOURS`)
- Thread-safe with in-memory persistence across all tools
- **Image context preservation** - Images and visual references are maintained across conversation turns and tool switches
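For reference, here is a minimal Python sketch of how a process might read these settings from the environment. The variable names are the documented ones; the fallback values and parsing are assumptions for illustration, not the server's actual code:
```python
import os

# Illustrative only: the variable names are documented in this repo; the
# fallback values and parsing shown here are assumptions.
max_turns = int(os.getenv("MAX_CONVERSATION_TURNS", "10"))
timeout_hours = float(os.getenv("CONVERSATION_TIMEOUT_HOURS", "3"))
```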
## Cross-Tool & Cross-Model Continuation Example
**Seamless Tool Switching with Context Preservation:**
```
1. Claude: "Analyze /src/auth.py for security issues"
→ Auto mode: Claude picks Gemini Pro for deep security analysis
→ Pro analyzes and finds vulnerabilities, provides continuation_id
2. Claude: "Review the authentication logic thoroughly"
→ Uses same continuation_id, but Claude picks O3 for logical analysis
→ O3 sees previous Pro analysis and provides logic-focused review
3. Claude: "Debug the auth test failures"
→ Same continuation_id, Claude keeps O3 for debugging
→ O3 provides targeted debugging with full context from both previous analyses
4. Claude: "Quick style check before committing"
→ Same thread, but Claude switches to Flash for speed
→ Flash quickly validates formatting with awareness of all previous fixes
```
## Key Benefits
**Why AI-to-AI Collaboration Matters:**
- **Diverse Perspectives**: Different models bring unique strengths to complex problems
- **Context Preservation**: Full conversation history maintained across tool switches
- **Efficient Communication**: Only incremental updates sent, maximizing context usage
- **Coordinated Analysis**: Models can build on each other's insights rather than working in isolation
- **Seamless Workflow**: Switch between tools and models without losing context
- **Enhanced Problem Solving**: Multiple AI minds working together produce better solutions
## Best Practices
**Maximizing AI Collaboration:**
- **Let Claude orchestrate**: Allow Claude to choose appropriate models for different aspects of complex tasks
- **Use continuation**: Build on previous conversations for deeper analysis
- **Leverage tool switching**: Move between analysis, review, and debugging tools as needed
- **Provide clear context**: Help models understand the broader goal and constraints
- **Trust the process**: AI-to-AI conversations can produce insights neither model would reach alone
For more information on conversation persistence and context revival, see the [Context Revival Guide](context-revival.md).
```
--------------------------------------------------------------------------------
/systemprompts/consensus_prompt.py:
--------------------------------------------------------------------------------
```python
"""
Consensus tool system prompt for multi-model perspective gathering
"""
CONSENSUS_PROMPT = """
ROLE
You are an expert technical consultant providing consensus analysis on proposals, plans, and ideas. The agent will present you
with a technical proposition and your task is to deliver a structured, rigorous assessment that helps validate feasibility
and implementation approaches.
Your feedback carries significant weight - it may directly influence project decisions, future direction, and could have
broader impacts on scale, revenue, and overall scope. The questioner values your expertise immensely and relies on your
analysis to make informed decisions that affect their success.
CRITICAL LINE NUMBER INSTRUCTIONS
Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be
included in any code you generate. Always reference specific line numbers in your replies when you need to point
to exact locations, and include a very short code excerpt alongside for clarity.
Include context_start_text and context_end_text as backup references. Never include "LINE│" markers in generated code
snippets.
PERSPECTIVE FRAMEWORK
{stance_prompt}
IF MORE INFORMATION IS NEEDED
IMPORTANT: Only request files for TECHNICAL IMPLEMENTATION questions where you need to see actual code, architecture,
or technical specifications. For business strategy, product decisions, or conceptual questions, provide analysis based
on the information given rather than requesting technical files.
If you need additional technical context (e.g., related files, system architecture, requirements, code snippets) to
provide thorough analysis of TECHNICAL IMPLEMENTATION details, you MUST ONLY respond with this exact JSON (and nothing else).
Do NOT ask for the same file you've been provided unless for some reason its content is missing or incomplete:
{
"status": "files_required_to_continue",
"mandatory_instructions": "<your critical instructions for the agent>",
"files_needed": ["[file name here]", "[or some folder/]"]
}
For business strategy, product planning, or conceptual questions, proceed with analysis using your expertise and the
context provided, even if specific technical details are not available.
EVALUATION FRAMEWORK
Assess the proposal across these critical dimensions. Your stance influences HOW you present findings, not WHETHER you
acknowledge fundamental truths about feasibility, safety, or value:
1. TECHNICAL FEASIBILITY
- Is this technically achievable with reasonable effort?
- What are the core technical dependencies and requirements?
- Are there any fundamental technical blockers?
2. PROJECT SUITABILITY
- Does this fit the existing codebase architecture and patterns?
- Is it compatible with current technology stack and constraints?
- How well does it align with the project's technical direction?
3. USER VALUE ASSESSMENT
- Will users actually want and use this feature?
- What concrete benefits does this provide?
- How does this compare to alternative solutions?
4. IMPLEMENTATION COMPLEXITY
- What are the main challenges, risks, and dependencies?
- What is the estimated effort and timeline?
- What expertise and resources are required?
5. ALTERNATIVE APPROACHES
- Are there simpler ways to achieve the same goals?
- What are the trade-offs between different approaches?
- Should we consider a different strategy entirely?
6. INDUSTRY PERSPECTIVE
- How do similar products/companies handle this problem?
- What are current best practices and emerging patterns?
- Are there proven solutions or cautionary tales?
7. LONG-TERM IMPLICATIONS
- Maintenance burden and technical debt considerations
- Scalability and performance implications
- Evolution and extensibility potential
MANDATORY RESPONSE FORMAT
You MUST respond in exactly this Markdown structure. Do not deviate from this format:
## Verdict
Provide a single, clear sentence summarizing your overall assessment (e.g., "Technically feasible but requires significant
infrastructure investment", "Strong user value proposition with manageable implementation risks", "Overly complex approach -
recommend simplified alternative").
## Analysis
Provide detailed assessment addressing each point in the evaluation framework. Use clear reasoning and specific examples.
Be thorough but concise. Address both strengths and weaknesses objectively.
## Confidence Score
Provide a numerical score from 1 (low confidence) to 10 (high confidence) followed by a brief justification explaining what
drives your confidence level and what uncertainties remain.
Format: "X/10 - [brief justification]"
Example: "7/10 - High confidence in technical feasibility assessment based on similar implementations, but uncertain about
user adoption without market validation data."
## Key Takeaways
Provide 3-5 bullet points highlighting the most critical insights, risks, or recommendations. These should be actionable
and specific.
QUALITY STANDARDS
- Ground all insights in the current project's scope and constraints
- Be honest about limitations and uncertainties
- Focus on practical, implementable solutions rather than theoretical possibilities
- Provide specific, actionable guidance rather than generic advice
- Balance optimism with realistic risk assessment
- Reference concrete examples and precedents when possible
REMINDERS
- Your assessment will be synthesized with other expert opinions by the agent
- Aim to provide unique insights that complement other perspectives
- If files are provided, reference specific technical details in your analysis
- Maintain professional objectivity while being decisive in your recommendations
- Keep your response concise - your entire reply must not exceed 850 tokens to ensure transport compatibility
- CRITICAL: Your stance does NOT override your responsibility to provide truthful, ethical, and beneficial guidance
- Bad ideas must be called out regardless of stance; good ideas must be acknowledged regardless of stance
"""
```
--------------------------------------------------------------------------------
/simulator_tests/test_vision_capability.py:
--------------------------------------------------------------------------------
```python
#!/usr/bin/env python3
"""
Vision Capability Test
Tests vision capability with the chat tool using O3 model:
- Test file path image (PNG triangle)
- Test base64 data URL image
- Use chat tool with O3 model to analyze the images
- Verify the model correctly identifies shapes
"""
import base64
import os
from .base_test import BaseSimulatorTest
class VisionCapabilityTest(BaseSimulatorTest):
"""Test vision capability with chat tool and O3 model"""
@property
def test_name(self) -> str:
return "vision_capability"
@property
def test_description(self) -> str:
return "Vision capability test with chat tool and O3 model"
def get_triangle_png_path(self) -> str:
"""Get the path to the triangle.png file in tests directory"""
# Get the project root and find the triangle.png in tests/
current_dir = os.getcwd()
triangle_path = os.path.join(current_dir, "tests", "triangle.png")
if not os.path.exists(triangle_path):
raise FileNotFoundError(f"triangle.png not found at {triangle_path}")
abs_path = os.path.abspath(triangle_path)
self.logger.debug(f"Using triangle PNG at host path: {abs_path}")
return abs_path
def create_base64_triangle_data_url(self) -> str:
"""Create a base64 data URL from the triangle.png file"""
triangle_path = self.get_triangle_png_path()
with open(triangle_path, "rb") as f:
image_data = base64.b64encode(f.read()).decode()
data_url = f"data:image/png;base64,{image_data}"
self.logger.debug(f"Created base64 data URL with {len(image_data)} characters")
return data_url
def run_test(self) -> bool:
"""Test vision capability with O3 model"""
try:
self.logger.info("Test: Vision capability with O3 model")
# Test 1: File path image
self.logger.info(" 1.1: Testing file path image (PNG triangle)")
triangle_path = self.get_triangle_png_path()
self.logger.info(f" ✅ Using triangle PNG at: {triangle_path}")
response1, continuation_id = self.call_mcp_tool(
"chat",
{
"prompt": "What shape do you see in this image? Please be specific and only mention the shape name.",
"images": [triangle_path],
"model": "o3",
},
)
if not response1:
self.logger.error("Failed to get response from O3 model for file path test")
return False
# Check for error indicators first
response1_lower = response1.lower()
if any(
error_phrase in response1_lower
for error_phrase in [
"don't have access",
"cannot see",
"no image",
"files_required_to_continue",
"image you're referring to",
"supply the image",
"error",
]
):
self.logger.error(f" ❌ O3 model cannot access file path image. Response: {response1[:300]}...")
return False
if "triangle" not in response1_lower:
self.logger.error(
f" ❌ O3 did not identify triangle in file path test. Response: {response1[:200]}..."
)
return False
self.logger.info(" ✅ O3 correctly identified file path image as triangle")
# Test 2: Base64 data URL image
self.logger.info(" 1.2: Testing base64 data URL image")
data_url = self.create_base64_triangle_data_url()
response2, _ = self.call_mcp_tool(
"chat",
{
"prompt": "What shape do you see in this image? Please be specific and only mention the shape name.",
"images": [data_url],
"model": "o3",
},
)
if not response2:
self.logger.error("Failed to get response from O3 model for base64 test")
return False
response2_lower = response2.lower()
if any(
error_phrase in response2_lower
for error_phrase in [
"don't have access",
"cannot see",
"no image",
"files_required_to_continue",
"image you're referring to",
"supply the image",
"error",
]
):
self.logger.error(f" ❌ O3 model cannot access base64 image. Response: {response2[:300]}...")
return False
if "triangle" not in response2_lower:
self.logger.error(f" ❌ O3 did not identify triangle in base64 test. Response: {response2[:200]}...")
return False
self.logger.info(" ✅ O3 correctly identified base64 image as triangle")
# Optional: Test continuation with same image
if continuation_id:
self.logger.info(" 1.3: Testing continuation with same image")
response3, _ = self.call_mcp_tool(
"chat",
{
"prompt": "What color is this triangle?",
"images": [triangle_path], # Same image should be deduplicated
"continuation_id": continuation_id,
"model": "o3",
},
)
if response3:
self.logger.info(" ✅ Continuation also working correctly")
else:
self.logger.warning(" ⚠️ Continuation response not received")
self.logger.info(" ✅ Vision capability test completed successfully")
return True
except Exception as e:
self.logger.error(f"Vision capability test failed: {e}")
return False
```
--------------------------------------------------------------------------------
/docs/context-revival.md:
--------------------------------------------------------------------------------
```markdown
# Context Revival: AI Memory Beyond Context Limits
## **The Most Profound Feature: Context Revival After Reset**
**This powerful feature cannot be highlighted enough**: The Zen MCP Server implements a simple continuation system that seemingly transcends Claude's context limitations.
## How Context Revival Works
The conversation memory system (`utils/conversation_memory.py`) implements a sophisticated architecture that bridges the gap between Claude's stateless
nature and true persistent AI collaboration (within limits, of course):
### The Architecture Behind the Magic
1. **Persistent Thread Storage**: Every conversation creates a UUID-based thread stored in memory
2. **Cross-Tool Continuation**: Any tool can pick up where another left off using the same `Continuation ID`, like an email thread identifier
3. **Context Reconstruction**: When Claude's context resets, past conversations persist in the MCP's memory
4. **History Retrieval**: When you prompt Claude to `continue` with another model, the MCP server rebuilds the entire conversation history, including file references
5. **Full Context Transfer**: The complete conversation context gets passed to the other model (O3, Gemini, etc.) with awareness of what was previously discussed
6. **Context Revival**: Upon returning the response to Claude, the other model effectively "reminds" Claude of the entire conversation, re-igniting Claude's understanding
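The architecture above can be sketched as a tiny in-memory thread store. This is an illustrative example only; the real implementation lives in `utils/conversation_memory.py` and its API may differ, so every name below is hypothetical:
```python
import uuid
from typing import Optional

# Hypothetical in-memory thread store illustrating the architecture above.
THREADS: dict[str, list[dict]] = {}

def start_thread() -> str:
    """Create a new conversation thread and return its continuation ID."""
    thread_id = str(uuid.uuid4())
    THREADS[thread_id] = []
    return thread_id

def add_turn(thread_id: str, role: str, content: str, files: Optional[list[str]] = None) -> None:
    """Append one exchange so any tool can rebuild the history later."""
    THREADS[thread_id].append({"role": role, "content": content, "files": files or []})

def rebuild_history(thread_id: str) -> str:
    """Flatten stored turns into a prompt-ready transcript for the next model."""
    return "\n".join(f"{turn['role']}: {turn['content']}" for turn in THREADS[thread_id])
```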
### The Dual Prioritization Strategy
The system employs a sophisticated **"newest-first"** approach that ensures optimal context preservation:
**File Prioritization**:
- Walks backwards through conversation turns (newest to oldest)
- When the same file appears multiple times, only the **newest reference** is kept
- Ensures most recent file context is preserved when token limits require exclusions
**Conversation Turn Prioritization**:
- **Collection Phase**: Processes turns newest-to-oldest to prioritize recent context
- **Presentation Phase**: Reverses to chronological order for natural LLM flow
- When token budget is tight, **older turns are excluded first**
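The file-prioritization pass is easy to sketch. The following is illustrative only; the function name, turn structure, and file paths are hypothetical rather than the real `utils/conversation_memory.py` API:
```python
def prioritize_files(turns: list[dict]) -> list[str]:
    """Walk turns newest-to-oldest, keeping only the newest reference to each file."""
    seen: set[str] = set()
    newest_first: list[str] = []
    for turn in reversed(turns):          # newest turn first
        for path in turn.get("files", []):
            if path not in seen:          # older duplicates are dropped
                seen.add(path)
                newest_first.append(path)
    return newest_first

# Example: /src/auth.py appears in turns 1 and 3; only the turn-3 reference is kept.
turns = [
    {"files": ["/src/auth.py", "/src/db.py"]},   # turn 1 (oldest)
    {"files": ["/tests/test_auth.py"]},          # turn 2
    {"files": ["/src/auth.py"]},                 # turn 3 (newest)
]
assert prioritize_files(turns) == ["/src/auth.py", "/tests/test_auth.py", "/src/db.py"]
```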
**Showcase**:
The following video demonstrates `continuation` via a casual `continue with gemini...` prompt and the `/continue` slash command.
* We ask Claude Code to pick one, then `chat` with `gemini` to make a final decision
* Gemini responds, confirming the choice. We use `continuation` to ask another question in the same conversation thread
* Gemini responds with an explanation. We use continuation again, this time via the `/zen:continue (MCP)` command
<div align="center">
[Chat With Gemini_web.webm](https://github.com/user-attachments/assets/37bd57ca-e8a6-42f7-b5fb-11de271e95db)
</div>
## Real-World Context Revival Example
Here's how this works in practice with a modern AI/ML workflow:
**Session 1 - Claude's Initial Context (before reset):**
You: "Help me design a RAG system for our customer support chatbot. I want to integrate vector embeddings with real-time retrieval. think deeply with zen using 03 after you've come up with a detailed plan."
Claude: "I'll analyze your requirements and design a comprehensive RAG architecture..."
→ Uses [`thinkdeep`](../README.md#1-chat---general-development-chat--collaborative-thinking) to brainstorm the overall approach
→ Zen creates a new thread: abc123-def456-ghi789
→ Zen responds, Claude finalizes the plan and presents it to you
*[Claude's context gets reset/compacted after extensive analysis]*
**Session 2 - After Context Reset:**
You: "Continue our RAG system discussion with O3 - I want to focus on the real-time inference optimization we talked about"
→ Claude re-uses the last continuation identifier it received and poses _only_ the new prompt (since Zen already knows what was being discussed), saving the tokens that re-prompting Claude would otherwise cost
→ O3 receives the FULL conversation history from Zen
→ O3 sees the complete context: "Claude was designing a RAG system, comparing vector databases, and analyzing embedding strategies for customer support..."
→ O3 continues: "Building on our previous vector database analysis, for real-time inference optimization, I recommend implementing semantic caching with embedding similarity thresholds..."
→ O3's response re-ignites Claude's understanding of the entire conversation
Claude: "Ah yes, excellent plan! Based on O3's optimization insights and our earlier vector database comparison, let me implement the semantic caching layer..."
**The Magic**: Even though Claude's context was completely reset, the conversation flows seamlessly because O3 had access to the entire conversation history and could "remind" Claude of everything that was discussed.
## Why This Changes Everything
**Before Zen MCP**: Claude's context resets meant losing entire conversation threads.
Complex multi-step analyses were fragmented and had to restart from scratch. You would typically have to re-prompt Claude or make it re-read a previously
saved document such as `CLAUDE.md`. With Zen, none of that is needed: Zen remembers.
**With Zen MCP**: Claude can orchestrate multi-hour, multi-tool workflows where:
- **O3** handles logical analysis and debugging
- **Gemini Pro** performs deep architectural reviews
- **Flash** provides quick formatting and style checks
- **Claude** coordinates everything while maintaining full context
**The breakthrough**: Even when Claude's context resets, the conversation continues seamlessly because other models can "remind" Claude of the complete conversation history stored in memory.
## Configuration
The system is highly configurable:
```env
# Maximum conversation turns (default: 20)
MAX_CONVERSATION_TURNS=20
# Thread expiration in hours (default: 3)
CONVERSATION_TIMEOUT_HOURS=3
```
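As a sketch, these settings could be read at startup roughly like this; the real server routes environment access through its own helpers, so `os.getenv` here is purely illustrative:

```python
import os

# Defaults mirror the values documented above.
MAX_CONVERSATION_TURNS = int(os.getenv("MAX_CONVERSATION_TURNS", "20"))
CONVERSATION_TIMEOUT_HOURS = float(os.getenv("CONVERSATION_TIMEOUT_HOURS", "3"))
```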
## The Result: True AI Orchestration
This isn't just multi-model access—it's **true AI orchestration** where:
- Conversations persist beyond context limits
- Models can build on each other's work across sessions
- Claude can coordinate complex multi-step workflows
- Context is never truly lost, just temporarily unavailable to Claude
**This is the closest thing to giving Claude permanent memory for complex development tasks.**
```
--------------------------------------------------------------------------------
/providers/shared/model_capabilities.py:
--------------------------------------------------------------------------------
```python
"""Dataclass describing the feature set of a model exposed by a provider."""
import math
from dataclasses import dataclass, field
from typing import Optional
from .provider_type import ProviderType
from .temperature import RangeTemperatureConstraint, TemperatureConstraint
__all__ = ["ModelCapabilities"]
@dataclass
class ModelCapabilities:
"""Static description of what a model can do within a provider.
Role
Acts as the canonical record for everything the server needs to know
about a model—its provider, token limits, feature switches, aliases,
and temperature rules. Providers populate these objects so tools and
higher-level services can rely on a consistent schema.
Typical usage
* Provider subclasses declare `MODEL_CAPABILITIES` maps containing these
objects (for example ``OpenAIModelProvider``)
* Helper utilities (e.g. restriction validation, alias expansion) read
these objects to build model lists for tooling and policy enforcement
* Tool selection logic inspects attributes such as
``supports_extended_thinking`` or ``context_window`` to choose an
appropriate model for a task.
* The ``allow_code_generation`` flag enables structured code generation
in the chat tool for models more capable than the primary CLI.
"""
provider: ProviderType
model_name: str
friendly_name: str
intelligence_score: int = 10 # Human-curated 1–20 score reflecting general capability
description: str = ""
aliases: list[str] = field(default_factory=list)
# Capacity limits / resource budgets
context_window: int = 0
max_output_tokens: int = 0
max_thinking_tokens: int = 0
# Capability flags
supports_extended_thinking: bool = False
supports_system_prompts: bool = True
supports_streaming: bool = True
supports_function_calling: bool = False
supports_images: bool = False
supports_json_mode: bool = False
supports_temperature: bool = True
use_openai_response_api: bool = False
default_reasoning_effort: Optional[str] = None
allow_code_generation: bool = (
False # Enables structured code generation in chat tool for substantial implementations
)
# Additional attributes
max_image_size_mb: float = 0.0
temperature_constraint: TemperatureConstraint = field(
default_factory=lambda: RangeTemperatureConstraint(0.0, 2.0, 0.3)
)
def get_effective_temperature(self, requested_temperature: float) -> Optional[float]:
"""Return the temperature that should be sent to the provider.
Models that do not support temperature return ``None`` so that callers
can omit the parameter entirely. For supported models, the configured
constraint clamps the requested value into a provider-safe range.
"""
if not self.supports_temperature:
return None
return self.temperature_constraint.get_corrected_value(requested_temperature)
def get_effective_capability_rank(self) -> int:
"""Calculate the runtime capability rank from intelligence + capabilities."""
# Human signal drives the baseline (1–20 → 5–100 after scaling)
base_intelligence = self.intelligence_score if self.intelligence_score else 10
base_intelligence = max(1, min(20, base_intelligence))
score = base_intelligence * 5
# Context window bonus with gentle diminishing returns
ctx_bonus = 0
ctx = max(self.context_window, 0)
if ctx > 0:
ctx_bonus = int(min(5, max(0.0, math.log10(ctx) - 3)))
score += ctx_bonus
# Output token capacity adds a small bonus
if self.max_output_tokens >= 65_000:
score += 2
elif self.max_output_tokens >= 32_000:
score += 1
# Feature-level boosts
if self.supports_extended_thinking:
score += 3
if self.supports_function_calling:
score += 1
if self.supports_json_mode:
score += 1
if self.supports_images:
score += 1
return max(0, min(100, score))
@staticmethod
def collect_aliases(model_configs: dict[str, "ModelCapabilities"]) -> dict[str, list[str]]:
"""Build a mapping of model name to aliases from capability configs."""
return {
base_model: capabilities.aliases
for base_model, capabilities in model_configs.items()
if capabilities.aliases
}
@staticmethod
def collect_model_names(
model_configs: dict[str, "ModelCapabilities"],
*,
include_aliases: bool = True,
lowercase: bool = False,
unique: bool = False,
) -> list[str]:
"""Build an ordered list of model names and aliases.
Args:
model_configs: Mapping of canonical model names to capabilities.
include_aliases: When True, include aliases for each model.
lowercase: When True, normalize names to lowercase.
unique: When True, ensure each returned name appears once (after formatting).
Returns:
Ordered list of model names (and optionally aliases) formatted per options.
"""
formatted_names: list[str] = []
seen: set[str] | None = set() if unique else None
def append_name(name: str) -> None:
formatted = name.lower() if lowercase else name
if seen is not None:
if formatted in seen:
return
seen.add(formatted)
formatted_names.append(formatted)
# Sort models by capability rank (descending) then by name for deterministic ordering
sorted_items = sorted(
model_configs.items(),
key=lambda item: (-item[1].get_effective_capability_rank(), item[0]),
)
for base_model, capabilities in sorted_items:
append_name(base_model)
if include_aliases and capabilities.aliases:
for alias in capabilities.aliases:
append_name(alias)
return formatted_names
```
--------------------------------------------------------------------------------
/tools/workflow/schema_builders.py:
--------------------------------------------------------------------------------
```python
"""
Schema builders for workflow MCP tools.
This module provides workflow-specific schema generation functionality,
keeping workflow concerns separated from simple tool concerns.
"""
from typing import Any
from ..shared.base_models import WORKFLOW_FIELD_DESCRIPTIONS
from ..shared.schema_builders import SchemaBuilder
class WorkflowSchemaBuilder:
"""
Schema builder for workflow MCP tools.
This class extends the base SchemaBuilder with workflow-specific fields
and schema generation logic, maintaining separation of concerns.
"""
# Workflow-specific field schemas
WORKFLOW_FIELD_SCHEMAS = {
"step": {
"type": "string",
"description": WORKFLOW_FIELD_DESCRIPTIONS["step"],
},
"step_number": {
"type": "integer",
"minimum": 1,
"description": WORKFLOW_FIELD_DESCRIPTIONS["step_number"],
},
"total_steps": {
"type": "integer",
"minimum": 1,
"description": WORKFLOW_FIELD_DESCRIPTIONS["total_steps"],
},
"next_step_required": {
"type": "boolean",
"description": WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"],
},
"findings": {
"type": "string",
"description": WORKFLOW_FIELD_DESCRIPTIONS["findings"],
},
"files_checked": {
"type": "array",
"items": {"type": "string"},
"description": WORKFLOW_FIELD_DESCRIPTIONS["files_checked"],
},
"relevant_files": {
"type": "array",
"items": {"type": "string"},
"description": WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"],
},
"relevant_context": {
"type": "array",
"items": {"type": "string"},
"description": WORKFLOW_FIELD_DESCRIPTIONS["relevant_context"],
},
"issues_found": {
"type": "array",
"items": {"type": "object"},
"description": WORKFLOW_FIELD_DESCRIPTIONS["issues_found"],
},
"confidence": {
"type": "string",
"enum": ["exploring", "low", "medium", "high", "very_high", "almost_certain", "certain"],
"description": WORKFLOW_FIELD_DESCRIPTIONS["confidence"],
},
"hypothesis": {
"type": "string",
"description": WORKFLOW_FIELD_DESCRIPTIONS["hypothesis"],
},
"use_assistant_model": {
"type": "boolean",
"default": True,
"description": WORKFLOW_FIELD_DESCRIPTIONS["use_assistant_model"],
},
}
@staticmethod
def build_schema(
tool_specific_fields: dict[str, dict[str, Any]] = None,
required_fields: list[str] = None,
model_field_schema: dict[str, Any] = None,
auto_mode: bool = False,
tool_name: str = None,
excluded_workflow_fields: list[str] = None,
excluded_common_fields: list[str] = None,
require_model: bool = False,
) -> dict[str, Any]:
"""
Build complete schema for workflow tools.
Args:
tool_specific_fields: Additional fields specific to the tool
required_fields: List of required field names (beyond workflow defaults)
model_field_schema: Schema for the model field
auto_mode: Whether the tool is in auto mode (affects model requirement)
tool_name: Name of the tool (for schema title)
excluded_workflow_fields: Workflow fields to exclude from schema (e.g., for planning tools)
excluded_common_fields: Common fields to exclude from schema
Returns:
Complete JSON schema for the workflow tool
"""
properties = {}
# Add workflow fields first, excluding any specified fields
workflow_fields = WorkflowSchemaBuilder.WORKFLOW_FIELD_SCHEMAS.copy()
if excluded_workflow_fields:
for field in excluded_workflow_fields:
workflow_fields.pop(field, None)
properties.update(workflow_fields)
# Add common fields (temperature, thinking_mode, etc.) from base builder, excluding any specified fields
common_fields = SchemaBuilder.COMMON_FIELD_SCHEMAS.copy()
if excluded_common_fields:
for field in excluded_common_fields:
common_fields.pop(field, None)
properties.update(common_fields)
# Add model field if provided
if model_field_schema:
properties["model"] = model_field_schema
# Add tool-specific fields if provided
if tool_specific_fields:
properties.update(tool_specific_fields)
# Build required fields list - workflow tools have standard required fields
standard_required = ["step", "step_number", "total_steps", "next_step_required", "findings"]
# Filter out excluded fields from required fields
if excluded_workflow_fields:
standard_required = [field for field in standard_required if field not in excluded_workflow_fields]
required = standard_required + (required_fields or [])
if (auto_mode or require_model) and "model" not in required:
required.append("model")
# Build the complete schema
schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": properties,
"required": required,
"additionalProperties": False,
}
if tool_name:
schema["title"] = f"{tool_name.capitalize()}Request"
return schema
@staticmethod
def get_workflow_fields() -> dict[str, dict[str, Any]]:
"""Get the standard field schemas for workflow tools."""
combined = {}
combined.update(WorkflowSchemaBuilder.WORKFLOW_FIELD_SCHEMAS)
combined.update(SchemaBuilder.COMMON_FIELD_SCHEMAS)
return combined
@staticmethod
def get_workflow_only_fields() -> dict[str, dict[str, Any]]:
"""Get only the workflow-specific field schemas."""
return WorkflowSchemaBuilder.WORKFLOW_FIELD_SCHEMAS.copy()
```
--------------------------------------------------------------------------------
/docs/tools/tracer.md:
--------------------------------------------------------------------------------
```markdown
# Tracer Tool - Static Code Analysis Prompt Generator
**Creates detailed analysis prompts for call-flow mapping and dependency tracing**
The `tracer` tool is a specialized prompt-generation tool that creates structured analysis requests for Claude to perform comprehensive static code analysis. Rather than passing entire projects to another model, this tool generates focused prompts that Claude can use to efficiently trace execution flows and map dependencies within the codebase.
## Two Analysis Modes
**`precision` Mode**: For methods/functions
- Traces execution flow, call chains, and usage patterns
- Detailed branching analysis and side effects
- Shows when and how functions are called throughout the system
**`dependencies` Mode**: For classes/modules/protocols
- Maps bidirectional dependencies and structural relationships
- Identifies coupling and architectural dependencies
- Shows how components interact and depend on each other
## Key Features
- **Generates comprehensive analysis prompts** instead of performing analysis directly
- **Faster and more efficient** than full project analysis by external models
- **Creates structured instructions** for call-flow graph generation
- **Provides detailed formatting requirements** for consistent output
- **Supports any programming language** with automatic convention detection
- **Output can be used as input** into another tool, such as `chat` along with related code files to perform logical call-flow analysis
- **Image support**: Analyze visual call flow diagrams, sequence diagrams: `"Generate tracer analysis for this payment flow using the sequence diagram"`
## Tool Parameters
- `prompt`: Detailed description of what to trace and WHY you need this analysis (required)
- `trace_mode`: precision|dependencies (required)
- `images`: Optional images of system architecture diagrams, flow charts, or visual references (absolute paths); see the example payload below
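For illustration, a tool-call payload using these parameters might look like the following; the values are hypothetical, and in practice the tool is usually invoked through natural-language prompts as shown in the next section.

```python
# Hypothetical tracer argument payload; field names mirror the parameter list above.
tracer_arguments = {
    "prompt": "Trace how UserAuthManager.authenticate is used and why (debugging intermittent auth failures)",
    "trace_mode": "precision",  # or "dependencies" for structural analysis
    "images": ["/absolute/path/to/auth_sequence_diagram.png"],  # optional visual context
}
```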
## Usage Examples
**Method Execution Tracing:**
```
"Use zen tracer to analyze how UserAuthManager.authenticate is used and why"
```
→ Uses `precision` mode to trace the method's execution flow
**Class Dependency Mapping:**
```
"Use zen to generate a dependency trace for the PaymentProcessor class to understand its relationships"
```
→ Uses `dependencies` mode to map structural relationships
**With Visual Context:**
```
"Generate tracer analysis for the authentication flow using this sequence diagram"
```
**Complex System Analysis:**
```
"Create a tracer prompt to understand how the OrderProcessor.processPayment method flows through the entire system"
```
## Precision Mode Output
When using `precision` mode for methods/functions, the tool generates prompts that will help Claude create:
**Call Chain Analysis:**
- Where the method is defined
- All locations where it's called
- Direct and indirect callers
- Call hierarchy and depth
**Execution Flow Mapping:**
- Step-by-step execution path
- Branching conditions and logic
- Side effects and state changes
- Return value usage
**Usage Pattern Analysis:**
- Frequency and context of calls
- Parameter passing patterns
- Error handling approaches
- Performance implications
## Dependencies Mode Output
When using `dependencies` mode for classes/modules, the tool generates prompts that will help Claude create:
**Structural Relationships:**
- Inheritance hierarchies
- Composition and aggregation
- Interface implementations
- Module imports and exports
**Bidirectional Dependencies:**
- What the component depends on
- What depends on the component
- Circular dependencies
- Coupling strength analysis
**Architectural Impact:**
- Layer violations
- Dependency inversion opportunities
- Refactoring impact assessment
- Testability implications
## Example Generated Prompts
**For Precision Mode:**
```
Analyze the execution flow and usage of the `authenticate` method in UserAuthManager:
1. **Method Location**: Find where UserAuthManager.authenticate is defined
2. **Call Sites**: Identify all locations where this method is called
3. **Execution Flow**: Trace the step-by-step execution path
4. **Side Effects**: Document state changes and external interactions
5. **Return Handling**: Show how return values are used by callers
Format the analysis as:
- Method signature and location
- Call hierarchy (direct and indirect callers)
- Execution flow diagram
- Side effects and dependencies
- Usage patterns and frequency
```
**For Dependencies Mode:**
```
Map the structural dependencies for PaymentProcessor class:
1. **Direct Dependencies**: What PaymentProcessor directly imports/uses
2. **Reverse Dependencies**: What classes/modules depend on PaymentProcessor
3. **Inheritance Relationships**: Parent classes and implemented interfaces
4. **Composition**: Objects that PaymentProcessor contains or creates
Format the analysis as:
- Dependency graph (incoming and outgoing)
- Architectural layer analysis
- Coupling assessment
- Refactoring impact evaluation
```
## Best Practices
- **Be specific about goals**: Clearly state what you need to understand and why
- **Describe context**: Mention if you're debugging, refactoring, or learning the codebase
- **Choose appropriate mode**: Use `precision` for method flows, `dependencies` for architecture
- **Include visual context**: Reference diagrams or documentation when available
- **Follow up with analysis**: Use the generated prompt with `chat` or `analyze` tools
## Integration with Other Tools
The `tracer` tool works best when combined with other analysis tools:
**Tracer + Chat:**
```
1. Generate analysis prompt with tracer
2. Use the prompt with chat tool and relevant code files
3. Get detailed call-flow or dependency analysis
```
**Tracer + Analyze:**
```
1. Use tracer to create structured analysis prompt
2. Apply the prompt using analyze tool for systematic code exploration
3. Get architectural insights and dependency mapping
```
## When to Use Tracer vs Other Tools
- **Use `tracer`** for: Creating structured analysis prompts, systematic code exploration planning
- **Use `analyze`** for: Direct code analysis without prompt generation
- **Use `debug`** for: Specific runtime error investigation
- **Use `chat`** for: Open-ended code discussions and exploration
```
--------------------------------------------------------------------------------
/simulator_tests/test_model_thinking_config.py:
--------------------------------------------------------------------------------
```python
#!/usr/bin/env python3
"""
Model Thinking Configuration Test
Tests that thinking configuration is properly applied only to models that support it,
and that Flash models work correctly without thinking config.
"""
from .base_test import BaseSimulatorTest
class TestModelThinkingConfig(BaseSimulatorTest):
"""Test model-specific thinking configuration behavior"""
@property
def test_name(self) -> str:
return "model_thinking_config"
@property
def test_description(self) -> str:
return "Model-specific thinking configuration behavior"
def test_pro_model_with_thinking_config(self):
"""Test that Pro model uses thinking configuration"""
self.logger.info("Testing Pro model with thinking configuration...")
try:
# Test with explicit pro model and high thinking mode
response, continuation_id = self.call_mcp_tool(
"chat",
{
"prompt": "What is 2 + 2? Please think carefully and explain.",
"model": "pro", # Should resolve to gemini-2.5-pro
"thinking_mode": "high", # Should use thinking_config
},
)
if not response:
raise Exception("Pro model test failed: No response received")
self.logger.info("✅ Pro model with thinking config works correctly")
return True
except Exception as e:
self.logger.error(f"❌ Pro model test failed: {e}")
return False
def test_flash_model_without_thinking_config(self):
"""Test that Flash model works without thinking configuration"""
self.logger.info("Testing Flash model without thinking configuration...")
try:
# Test with explicit flash model and thinking mode (should be ignored)
response, continuation_id = self.call_mcp_tool(
"chat",
{
"prompt": "What is 3 + 3? Give a quick answer.",
"model": "flash", # Should resolve to gemini-2.5-flash
"thinking_mode": "high", # Should be ignored for Flash model
},
)
if not response:
raise Exception("Flash model test failed: No response received")
self.logger.info("✅ Flash model without thinking config works correctly")
return True
except Exception as e:
if "thinking" in str(e).lower() and ("not supported" in str(e).lower() or "invalid" in str(e).lower()):
raise Exception(f"Flash model incorrectly tried to use thinking config: {e}")
self.logger.error(f"❌ Flash model test failed: {e}")
return False
def test_model_resolution_logic(self):
"""Test that model resolution works correctly for both shortcuts and full names"""
self.logger.info("Testing model resolution logic...")
test_cases = [
("pro", "should work with Pro model"),
("flash", "should work with Flash model"),
("gemini-2.5-pro", "should work with full Pro model name"),
("gemini-2.5-flash", "should work with full Flash model name"),
]
success_count = 0
for model_name, description in test_cases:
try:
response, continuation_id = self.call_mcp_tool(
"chat",
{
"prompt": f"Test with {model_name}: What is 1 + 1?",
"model": model_name,
"thinking_mode": "medium",
},
)
if not response:
raise Exception(f"No response received for model {model_name}")
self.logger.info(f"✅ {model_name} {description}")
success_count += 1
except Exception as e:
self.logger.error(f"❌ {model_name} failed: {e}")
return False
return success_count == len(test_cases)
def test_default_model_behavior(self):
"""Test behavior with server default model (no explicit model specified)"""
self.logger.info("Testing default model behavior...")
try:
# Test without specifying model (should use server default)
response, continuation_id = self.call_mcp_tool(
"chat",
{
"prompt": "Test default model: What is 4 + 4?",
# No model specified - should use DEFAULT_MODEL from config
"thinking_mode": "medium",
},
)
if not response:
raise Exception("Default model test failed: No response received")
self.logger.info("✅ Default model behavior works correctly")
return True
except Exception as e:
self.logger.error(f"❌ Default model test failed: {e}")
return False
def run_test(self) -> bool:
"""Run all model thinking configuration tests"""
self.logger.info(f" Test: {self.test_description}")
try:
# Test Pro model with thinking config
if not self.test_pro_model_with_thinking_config():
return False
# Test Flash model without thinking config
if not self.test_flash_model_without_thinking_config():
return False
# Test model resolution logic
if not self.test_model_resolution_logic():
return False
# Test default model behavior
if not self.test_default_model_behavior():
return False
self.logger.info(f"✅ All {self.test_name} tests passed!")
return True
except Exception as e:
self.logger.error(f"❌ {self.test_name} test failed: {e}")
return False
def main():
"""Run the model thinking configuration tests"""
import sys
verbose = "--verbose" in sys.argv or "-v" in sys.argv
test = TestModelThinkingConfig(verbose=verbose)
success = test.run_test()
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()
```
--------------------------------------------------------------------------------
/systemprompts/planner_prompt.py:
--------------------------------------------------------------------------------
```python
"""
Planner tool system prompts
"""
PLANNER_PROMPT = """
You are an expert, seasoned planning consultant and systems architect with deep expertise in plan structuring, risk assessment,
and software development strategy. You have extensive experience organizing complex projects, guiding technical implementations,
and maintaining a sharp understanding of both your own and competing products across the market. From microservices
to global-scale deployments, your technical insight and architectural knowledge are unmatched. There is nothing related
to software and software development that you're not aware of. You have mastery of all the latest frameworks, languages,
trends, and techniques. Your role is to critically evaluate and refine plans to make them more robust,
efficient, and implementation-ready.
CRITICAL LINE NUMBER INSTRUCTIONS
Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be
included in any code you generate. Always reference specific line numbers in your replies to locate
exact positions. Include a very short code excerpt alongside for clarity.
Include context_start_text and context_end_text as backup references. Never include "LINE│" markers in generated code
snippets.
IF MORE INFORMATION IS NEEDED
If the agent is discussing specific code, functions, or project components that were not given as part of the context,
and you need additional context (e.g., related files, configuration, dependencies, test files) to provide meaningful
collaboration, you MUST respond ONLY with this JSON format (and nothing else). Do NOT ask for the same file you've been
provided unless for some reason its content is missing or incomplete:
{
"status": "files_required_to_continue",
"mandatory_instructions": "<your critical instructions for the agent>",
"files_needed": ["[file name here]", "[or some folder/]"]
}
PLANNING METHODOLOGY:
1. DECOMPOSITION: Break down the main objective into logical, sequential steps
2. DEPENDENCIES: Identify which steps depend on others and order them appropriately
3. BRANCHING: When multiple valid approaches exist, create branches to explore alternatives
4. ITERATION: Be willing to step back and refine earlier steps if new insights emerge
5. COMPLETENESS: Ensure all aspects of the task are covered without gaps
STEP STRUCTURE:
Each step in your plan MUST include:
- Step number and branch identifier (if branching)
- Clear, actionable description
- Prerequisites or dependencies
- Expected outcomes
- Potential challenges or considerations
- Alternative approaches (when applicable)
BRANCHING GUIDELINES:
- Use branches to explore different implementation strategies
- Label branches clearly (e.g., "Branch A: Microservices approach", "Branch B: Monolithic approach")
- Explain when and why to choose each branch
- Show how branches might reconverge
PLANNING PRINCIPLES:
- Start with high-level strategy, then add implementation details
- Consider technical, organizational, and resource constraints
- Include validation and testing steps
- Plan for error handling and rollback scenarios
- Think about maintenance and future extensibility
STRUCTURED JSON OUTPUT FORMAT:
You MUST respond with a properly formatted JSON object following this exact schema.
Do NOT include any text before or after the JSON. The response must be valid JSON only.
IF MORE INFORMATION IS NEEDED:
If you lack critical information to proceed with planning, you MUST only respond with:
{
"status": "files_required_to_continue",
"mandatory_instructions": "<your critical instructions for the agent>",
"files_needed": ["<file name here>", "<or some folder/>"]
}
FOR NORMAL PLANNING RESPONSES:
{
"status": "planning_success",
"step_number": <current step number>,
"total_steps": <estimated total steps>,
"next_step_required": <true/false>,
"step_content": "<detailed description of current planning step>",
"metadata": {
"branches": ["<list of branch IDs if any>"],
"step_history_length": <number of steps completed so far>,
"is_step_revision": <true/false>,
"revises_step_number": <number if this revises a previous step>,
"is_branch_point": <true/false>,
"branch_from_step": <step number if this branches from another step>,
"branch_id": "<unique branch identifier if creating/following a branch>",
"more_steps_needed": <true/false>
},
"continuation_id": "<thread_id for conversation continuity>",
"planning_complete": <true/false - set to true only on final step>,
"plan_summary": "<complete plan summary - only include when planning_complete is true>",
"next_steps": "<guidance for the agent on next actions>",
"previous_plan_context": "<context from previous completed plans - only on step 1 with continuation_id>"
}
PLANNING CONTENT GUIDELINES:
- step_content: Provide detailed planning analysis for the current step
- Include specific actions, prerequisites, outcomes, and considerations
- When branching, clearly explain the alternative approach and when to use it
- When completing planning, provide comprehensive plan_summary
- next_steps: Always guide the agent on what to do next (continue planning, implement, or branch)
PLAN PRESENTATION GUIDELINES:
When planning is complete (planning_complete: true), the agent should present the final plan with:
- Clear headings and numbered phases/sections
- Visual elements like ASCII charts for workflows, dependencies, or sequences
- Bullet points and sub-steps for detailed breakdowns
- Implementation guidance and next steps
- Visual organization (boxes, arrows, diagrams) for complex relationships
- Tables for comparisons or resource allocation
- Priority indicators and sequence information where relevant
IMPORTANT: Do NOT use emojis in plan presentations. Use clear text formatting, ASCII characters, and symbols only.
IMPORTANT: Do NOT mention time estimates, costs, or pricing unless explicitly requested by the user.
Example visual elements to use:
- Phase diagrams: Phase 1 → Phase 2 → Phase 3
- Dependency charts: A ← B ← C (C depends on B, B depends on A)
- Sequence boxes: [Phase 1: Setup] → [Phase 2: Development] → [Phase 3: Testing]
- Decision trees for branching strategies
- Resource allocation tables
Be thorough, practical, and consider edge cases. Your planning should be detailed enough that someone could follow it step-by-step to achieve the goal.
"""
```
--------------------------------------------------------------------------------
/docker/scripts/deploy.ps1:
--------------------------------------------------------------------------------
```
#!/usr/bin/env pwsh
#Requires -Version 5.1
[CmdletBinding()]
param(
[switch]$SkipHealthCheck,
[int]$HealthCheckTimeout = 60
)
# Set error action preference
$ErrorActionPreference = "Stop"
# Colors for output
function Write-ColorText {
param(
[Parameter(Mandatory)]
[string]$Text,
[string]$Color = "White",
[switch]$NoNewline
)
if ($NoNewline) {
Write-Host $Text -ForegroundColor $Color -NoNewline
} else {
Write-Host $Text -ForegroundColor $Color
}
}
Write-ColorText "=== Deploying Zen MCP Server ===" -Color Green
# Function to check if required environment variables are set
function Test-EnvironmentVariables {
# At least one of these API keys must be set
$requiredVars = @(
"GEMINI_API_KEY",
"GOOGLE_API_KEY",
"OPENAI_API_KEY",
"XAI_API_KEY",
"DIAL_API_KEY",
"OPENROUTER_API_KEY"
)
$hasApiKey = $false
foreach ($var in $requiredVars) {
$value = [Environment]::GetEnvironmentVariable($var)
if (![string]::IsNullOrWhiteSpace($value)) {
$hasApiKey = $true
break
}
}
if (!$hasApiKey) {
Write-ColorText "Error: At least one API key must be set in your .env file" -Color Red
Write-ColorText "Required variables (at least one):" -Color Yellow
$requiredVars | ForEach-Object { Write-Host " $_" }
exit 1
}
}
# Load environment variables from .env file
if (Test-Path ".env") {
Write-ColorText "Loading environment variables from .env..." -Color Green
# Read .env file and set environment variables
Get-Content ".env" | ForEach-Object {
if ($_ -match '^([^#][^=]*?)=(.*)$') {
$name = $matches[1].Trim()
$value = $matches[2].Trim()
# Remove quotes if present
$value = $value -replace '^["'']|["'']$', ''
[Environment]::SetEnvironmentVariable($name, $value, "Process")
}
}
Write-ColorText "✓ Environment variables loaded from .env" -Color Green
} else {
Write-ColorText "Error: .env file not found" -Color Red
Write-ColorText "Please copy .env.example to .env and configure your API keys" -Color Yellow
exit 1
}
# Check required environment variables
Test-EnvironmentVariables
# Function to wait for service health with exponential backoff
function Wait-ForHealth {
param(
[int]$MaxAttempts = 6,
[int]$InitialDelay = 2
)
$attempt = 1
$delay = $InitialDelay
while ($attempt -le $MaxAttempts) {
try {
# Get container ID for zen-mcp service
$containerId = docker-compose ps -q zen-mcp
if ([string]::IsNullOrWhiteSpace($containerId)) {
$status = "unavailable"
} else {
$status = docker inspect -f "{{.State.Health.Status}}" $containerId 2>$null
if ($LASTEXITCODE -ne 0) {
$status = "unavailable"
}
}
if ($status -eq "healthy") {
return $true
}
Write-ColorText "Waiting for service to be healthy... (attempt $attempt/$MaxAttempts, retrying in ${delay}s)" -Color Yellow
Start-Sleep -Seconds $delay
$delay = $delay * 2
$attempt++
} catch {
Write-ColorText "Error checking health status: $_" -Color Red
$attempt++
Start-Sleep -Seconds $delay
}
}
Write-ColorText "Service failed to become healthy after $MaxAttempts attempts" -Color Red
Write-ColorText "Checking logs:" -Color Yellow
docker-compose logs zen-mcp
return $false
}
# Create logs directory if it doesn't exist
if (!(Test-Path "logs")) {
Write-ColorText "Creating logs directory..." -Color Green
New-Item -ItemType Directory -Path "logs" -Force | Out-Null
}
# Stop existing containers
Write-ColorText "Stopping existing containers..." -Color Green
try {
docker-compose down
if ($LASTEXITCODE -ne 0) {
Write-ColorText "Warning: Failed to stop existing containers (they may not be running)" -Color Yellow
}
} catch {
Write-ColorText "Warning: Error stopping containers: $_" -Color Yellow
}
# Start the services
Write-ColorText "Starting Zen MCP Server..." -Color Green
try {
docker-compose up -d
if ($LASTEXITCODE -ne 0) {
throw "Failed to start services"
}
} catch {
Write-ColorText "Error: Failed to start services" -Color Red
Write-ColorText "Checking logs:" -Color Yellow
docker-compose logs zen-mcp
exit 1
}
# Wait for health check (unless skipped)
if (!$SkipHealthCheck) {
Write-ColorText "Waiting for service to be healthy..." -Color Green
# Try simple timeout first, then use exponential backoff if needed
$timeout = $HealthCheckTimeout
$elapsed = 0
$healthy = $false
while ($elapsed -lt $timeout) {
try {
$containerId = docker-compose ps -q zen-mcp
if (![string]::IsNullOrWhiteSpace($containerId)) {
$status = docker inspect -f "{{.State.Health.Status}}" $containerId 2>$null
if ($status -eq "healthy") {
$healthy = $true
break
}
}
} catch {
# Continue checking
}
Start-Sleep -Seconds 2
$elapsed += 2
}
if (!$healthy) {
# Use exponential backoff retry mechanism
if (!(Wait-ForHealth)) {
Write-ColorText "Service failed to become healthy" -Color Red
Write-ColorText "Checking logs:" -Color Yellow
docker-compose logs zen-mcp
exit 1
}
}
}
Write-ColorText "✓ Zen MCP Server deployed successfully" -Color Green
Write-ColorText "Service Status:" -Color Green
docker-compose ps
Write-ColorText "=== Deployment Complete ===" -Color Green
Write-ColorText "Useful commands:" -Color Yellow
Write-ColorText " View logs: " -Color White -NoNewline
Write-ColorText "docker-compose logs -f zen-mcp" -Color Green
Write-ColorText " Stop service: " -Color White -NoNewline
Write-ColorText "docker-compose down" -Color Green
Write-ColorText " Restart service: " -Color White -NoNewline
Write-ColorText "docker-compose restart zen-mcp" -Color Green
Write-ColorText " PowerShell logs: " -Color White -NoNewline
Write-ColorText "Get-Content logs\mcp_server.log -Wait" -Color Green
```
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
```python
"""
Configuration and constants for Zen MCP Server
This module centralizes all configuration settings for the Zen MCP Server.
It defines model configurations, token limits, temperature defaults, and other
constants used throughout the application.
Configuration values can be overridden by environment variables where appropriate.
"""
from utils.env import get_env
# Version and metadata
# These values are used in server responses and for tracking releases
# IMPORTANT: This is the single source of truth for version and author info
# Semantic versioning: MAJOR.MINOR.PATCH
__version__ = "9.1.3"
# Last update date in ISO format
__updated__ = "2025-10-22"
# Primary maintainer
__author__ = "Fahad Gilani"
# Model configuration
# DEFAULT_MODEL: The default model used for all AI operations
# This should be a stable, high-performance model suitable for code analysis
# Can be overridden by setting DEFAULT_MODEL environment variable
# Special value "auto" means Claude should pick the best model for each task
DEFAULT_MODEL = get_env("DEFAULT_MODEL", "auto") or "auto"
# Auto mode detection - when DEFAULT_MODEL is "auto", Claude picks the model
IS_AUTO_MODE = DEFAULT_MODEL.lower() == "auto"
# Each provider (gemini.py, openai.py, xai.py, dial.py, openrouter.py, custom.py, azure_openai.py)
# defines its own MODEL_CAPABILITIES
# with detailed descriptions. Tools use ModelProviderRegistry.get_available_model_names()
# to get models only from enabled providers (those with valid API keys).
#
# This architecture ensures:
# - No namespace collisions (models only appear when their provider is enabled)
# - API key-based filtering (prevents wrong models from being shown to Claude)
# - Proper provider routing (models route to the correct API endpoint)
# - Clean separation of concerns (providers own their model definitions)
# Temperature defaults for different tool types
# Temperature controls the randomness/creativity of model responses
# Lower values (0.0-0.3) produce more deterministic, focused responses
# Higher values (0.7-1.0) produce more creative, varied responses
# TEMPERATURE_ANALYTICAL: Used for tasks requiring precision and consistency
# Ideal for code review, debugging, and error analysis where accuracy is critical
TEMPERATURE_ANALYTICAL = 0.2 # For code review, debugging
# TEMPERATURE_BALANCED: Middle ground for general conversations
# Provides a good balance between consistency and helpful variety
TEMPERATURE_BALANCED = 0.5 # For general chat
# TEMPERATURE_CREATIVE: Higher temperature for exploratory tasks
# Used when brainstorming, exploring alternatives, or architectural discussions
TEMPERATURE_CREATIVE = 0.7 # For architecture, deep thinking
# Thinking Mode Defaults
# DEFAULT_THINKING_MODE_THINKDEEP: Default thinking depth for extended reasoning tool
# Higher modes use more computational budget but provide deeper analysis
DEFAULT_THINKING_MODE_THINKDEEP = get_env("DEFAULT_THINKING_MODE_THINKDEEP", "high") or "high"
# Consensus Tool Defaults
# Consensus timeout and rate limiting settings
DEFAULT_CONSENSUS_TIMEOUT = 120.0 # 2 minutes per model
DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION = 2
# NOTE: Consensus tool now uses sequential processing for MCP compatibility
# Concurrent processing was removed to avoid async pattern violations
# MCP Protocol Transport Limits
#
# IMPORTANT: This limit ONLY applies to the Claude CLI ↔ MCP Server transport boundary.
# It does NOT limit internal MCP Server operations like system prompts, file embeddings,
# conversation history, or content sent to external models (Gemini/OpenAI/OpenRouter).
#
# MCP Protocol Architecture:
# Claude CLI ←→ MCP Server ←→ External Model (Gemini/OpenAI/etc.)
# ↑ ↑
# │ │
# MCP transport Internal processing
# (token limit from MAX_MCP_OUTPUT_TOKENS) (No MCP limit - can be 1M+ tokens)
#
# MCP_PROMPT_SIZE_LIMIT: Maximum character size for USER INPUT crossing MCP transport
# The MCP protocol has a combined request+response limit controlled by MAX_MCP_OUTPUT_TOKENS.
# To ensure adequate space for MCP Server → Claude CLI responses, we limit user input
# to roughly 60% of the total token budget converted to characters. Larger user prompts
# must be sent as prompt.txt files to bypass MCP's transport constraints.
#
# Token to character conversion ratio: ~4 characters per token (average for code/text)
# Default allocation: 60% of tokens for input, 40% for response
#
# What IS limited by this constant:
# - request.prompt field content (user input from Claude CLI)
# - prompt.txt file content (alternative user input method)
# - Any other direct user input fields
#
# What is NOT limited by this constant:
# - System prompts added internally by tools
# - File content embedded by tools
# - Conversation history loaded from storage
# - Web search instructions or other internal additions
# - Complete prompts sent to external models (managed by model-specific token limits)
#
# This ensures MCP transport stays within protocol limits while allowing internal
# processing to use full model context windows (200K-1M+ tokens).
def _calculate_mcp_prompt_limit() -> int:
"""
Calculate MCP prompt size limit based on MAX_MCP_OUTPUT_TOKENS environment variable.
Returns:
Maximum character count for user input prompts
"""
# Check for Claude's MAX_MCP_OUTPUT_TOKENS environment variable
max_tokens_str = get_env("MAX_MCP_OUTPUT_TOKENS")
if max_tokens_str:
try:
max_tokens = int(max_tokens_str)
# Allocate 60% of tokens for input, convert to characters (~4 chars per token)
input_token_budget = int(max_tokens * 0.6)
character_limit = input_token_budget * 4
return character_limit
except (ValueError, TypeError):
# Fall back to default if MAX_MCP_OUTPUT_TOKENS is not a valid integer
pass
# Default fallback: 60,000 characters (equivalent to ~15k tokens input of 25k total)
return 60_000
MCP_PROMPT_SIZE_LIMIT = _calculate_mcp_prompt_limit()
# Language/Locale Configuration
# LOCALE: Language/locale specification for AI responses
# When set, all AI tools will respond in the specified language while
# maintaining their analytical capabilities
# Examples: "fr-FR", "en-US", "zh-CN", "zh-TW", "ja-JP", "ko-KR", "es-ES",
# "de-DE", "it-IT", "pt-PT"
# Leave empty for default language (English)
LOCALE = get_env("LOCALE", "") or ""
# Threading configuration
# Simple in-memory conversation threading for stateless MCP environment
# Conversations persist only during the Claude session
```
--------------------------------------------------------------------------------
/tests/CASSETTE_MAINTENANCE.md:
--------------------------------------------------------------------------------
```markdown
# HTTP Cassette Testing - Maintenance Guide
## Overview
This project uses HTTP cassettes (recorded HTTP interactions) to test API integrations without making real API calls during CI. This document explains how the cassette system works and how to maintain it.
## How Cassette Matching Works
### Standard Matching (Non-o3 Models)
For most models, cassettes match requests using:
- HTTP method (GET, POST, etc.)
- Request path (/v1/chat/completions, etc.)
- **Exact hash of the request body**
If ANY part of the request changes, the hash changes and the cassette won't match.
### Semantic Matching (o3 Models)
**Problem**: o3 models use system prompts and conversation memory instructions that change frequently with code updates. Using exact hash matching would require re-recording cassettes after every prompt change.
**Solution**: o3 models use **semantic matching** that only compares:
- Model name (e.g., "o3-pro", "o3-mini")
- User's actual question (extracted from request)
- Core parameters (reasoning effort, temperature)
**Ignored fields** (can change without breaking cassettes):
- System prompts
- Conversation memory instructions
- Follow-up guidance text
- Token limits and other metadata
### Example
These two requests will match with semantic matching:
```json
// Request 1 - Old system prompt
{
"model": "o3-pro",
"reasoning": {"effort": "medium"},
"input": [{
"role": "user",
"content": [{
"text": "Old system prompt v1...\n\n=== USER REQUEST ===\nWhat is 2 + 2?\n=== END REQUEST ===\n\nOld instructions..."
}]
}]
}
// Request 2 - New system prompt (DIFFERENT)
{
"model": "o3-pro",
"reasoning": {"effort": "medium"},
"input": [{
"role": "user",
"content": [{
"text": "New system prompt v2...\n\n=== USER REQUEST ===\nWhat is 2 + 2?\n=== END REQUEST ===\n\nNew instructions..."
}]
}]
}
```
Both extract the same semantic content:
```json
{
"model": "o3-pro",
"reasoning": {"effort": "medium"},
"user_question": "What is 2 + 2?"
}
```
## When to Re-Record Cassettes
### You MUST re-record when:
1. **The user's test question changes**
- Example: Changing "What is 2 + 2?" to "What is 3 + 3?"
2. **Core parameters change**
- Model name changes (o3-pro → o3-mini)
- Reasoning effort changes (medium → high)
- Temperature changes
3. **For non-o3 models: ANY request body change**
### You DON'T need to re-record when (o3 models only):
1. **System prompts change**
- Semantic matching ignores these
2. **Conversation memory instructions change**
- Follow-up guidance text changes
- Token limit instructions change
3. **Response format instructions change**
- As long as the user's actual question stays the same
## How to Re-Record a Cassette
### Step 1: Delete the Old Cassette
```bash
rm tests/openai_cassettes/<cassette_name>.json
```
### Step 2: Run the Test with Real API Key
```bash
# Make sure you have a valid API key in .env
export OPENAI_API_KEY="your-real-key"
# Run the specific test
python -m pytest tests/test_o3_pro_output_text_fix.py -v
```
The test will:
1. Detect the missing cassette
2. Make a real API call
3. Record the interaction
4. Save it as a new cassette
### Step 3: Verify the Cassette Works in Replay Mode
```bash
# Test with dummy key (forces replay mode)
OPENAI_API_KEY="dummy-key" python -m pytest tests/test_o3_pro_output_text_fix.py -v
```
### Step 4: Commit the New Cassette
```bash
git add tests/openai_cassettes/<cassette_name>.json
git commit -m "chore: re-record cassette for <test_name>"
```
## Troubleshooting
### Error: "No matching interaction found"
**Cause**: The request body has changed in a way that affects the hash.
**For o3 models**: This should NOT happen due to semantic matching. If it does:
1. Check if the user question changed
2. Check if model name or reasoning effort changed
3. Verify semantic matching is working (run `test_cassette_semantic_matching.py`)
**For non-o3 models**: This is expected when request changes. Re-record the cassette.
**Solution**: Re-record the cassette following the steps above.
### Error: "Cassette file not found"
**Cause**: Cassette hasn't been recorded yet or was deleted.
**Solution**: Re-record the cassette with a real API key.
### CI Fails but Local Tests Pass
**Cause**:
1. You recorded with uncommitted code changes
2. CI is running different code than your local environment
**Solution**:
1. Commit all your changes first
2. Then re-record cassettes
3. Commit the cassettes
## Best Practices
### 1. Keep Test Questions Simple
- Use simple, stable questions like "What is 2 + 2?"
- Avoid questions that might elicit different responses over time
### 2. Document Cassette Recording Conditions
- Add comments in tests explaining when recorded
- Note any special setup required
### 3. Use Semantic Matching for Prompt-Heavy Tests
- If your test involves lots of system prompts, use o3 models
- Or extend semantic matching to other models if needed
### 4. Test Both Record and Replay Modes
- Always verify cassettes work in replay mode
- Ensure tests can record new cassettes when needed
### 5. Don't Commit Cassettes with Secrets
- The recording system sanitizes API keys automatically
- But double-check for any other sensitive data
## Implementation Details
### Semantic Matching Code
The semantic matching is implemented in `tests/http_transport_recorder.py`:
- `_is_o3_model_request()`: Detects o3 model requests
- `_extract_semantic_fields()`: Extracts only essential fields (a rough sketch follows below)
- `_get_request_signature()`: Generates hash from semantic fields
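The exact logic lives in `tests/http_transport_recorder.py`; as a rough sketch of the idea, assuming the `=== USER REQUEST ===` marker format shown earlier (function names here are illustrative):

```python
import hashlib
import json
import re

def extract_semantic_fields(request_body: dict) -> dict:
    """Keep only the fields that matter for matching an o3-style request."""
    text = request_body["input"][0]["content"][0]["text"]
    match = re.search(r"=== USER REQUEST ===\n(.*?)\n=== END REQUEST ===", text, re.DOTALL)
    return {
        "model": request_body.get("model", ""),
        "reasoning": request_body.get("reasoning", {}),
        "user_question": match.group(1).strip() if match else text,
    }

def request_signature(request_body: dict) -> str:
    """Hash the semantic fields so prompt-only changes do not invalidate a cassette."""
    semantic = extract_semantic_fields(request_body)
    return hashlib.sha256(json.dumps(semantic, sort_keys=True).encode()).hexdigest()
```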
### Adding Semantic Matching to Other Models
To add semantic matching for other models:
1. Update `_is_o3_model_request()` to include your model
2. Update `_extract_semantic_fields()` if needed
3. Add tests in `test_cassette_semantic_matching.py`
Example:
```python
def _is_o3_model_request(self, content_dict: dict) -> bool:
"""Check if this is an o3 or other semantic-matching model request."""
model = content_dict.get("model", "")
return model.startswith("o3") or model.startswith("gpt-5") # Add more models
```
## Questions?
If you encounter issues with cassette testing:
1. Check this guide first
2. Review existing cassette tests for examples
3. Run semantic matching tests to verify the system
4. Open an issue if you find a bug in the matching logic
## Related Files
- `tests/http_transport_recorder.py` - Cassette recording/replay implementation
- `tests/transport_helpers.py` - Helper functions for injecting transports
- `tests/test_cassette_semantic_matching.py` - Tests for semantic matching
- `tests/test_o3_pro_output_text_fix.py` - Example of cassette usage
- `tests/openai_cassettes/` - Directory containing recorded cassettes
```
--------------------------------------------------------------------------------
/systemprompts/tracer_prompt.py:
--------------------------------------------------------------------------------
```python
"""
Tracer tool system prompts
"""
TRACER_PROMPT = """
You are an expert, seasoned software architect and code analysis specialist with deep expertise in code tracing,
execution flow analysis, and dependency mapping. You have extensive experience analyzing complex codebases,
tracing method calls, understanding data flow, and mapping structural relationships in software systems.
From microservices to monolithic applications, your ability to understand code structure, execution paths,
and dependencies is unmatched. There is nothing related to software architecture, design patterns, or code
analysis that you're not aware of. Your role is to systematically trace and analyze code to provide
comprehensive understanding of how software components interact and execute.
CRITICAL LINE NUMBER INSTRUCTIONS
Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be
included in any code you generate. Always reference specific line numbers in your replies to locate
exact positions. Include a very short code excerpt alongside for clarity.
Include context_start_text and context_end_text as backup references. Never include "LINE│" markers in generated code
snippets.
IF MORE INFORMATION IS NEEDED
If the agent is discussing specific code, functions, or project components that were not given as part of the context,
and you need additional context (e.g., related files, configuration, dependencies, test files) to provide meaningful
analysis, you MUST respond ONLY with this JSON format (and nothing else). Do NOT ask for the same file you've been
provided unless for some reason its content is missing or incomplete:
{
"status": "files_required_to_continue",
"mandatory_instructions": "<your critical instructions for the agent>",
"files_needed": ["[file name here]", "[or some folder/]"]
}
TRACING METHODOLOGY:
1. PRECISION MODE (Execution Flow):
- Trace method/function execution paths and call chains
- Identify entry points and usage patterns
- Map conditional branches and control flow
- Document side effects and state changes
- Analyze parameter flow and return values
2. DEPENDENCIES MODE (Structural Relationships):
- Map incoming and outgoing dependencies
- Identify type relationships (inheritance, composition, usage)
- Trace bidirectional connections between components
- Document interface contracts and protocols
- Analyze coupling and cohesion patterns
ANALYSIS STRUCTURE:
Each tracing step MUST include:
- Step number and current findings
- Files examined and methods analyzed
- Concrete evidence from code examination
- Relationships discovered (calls, dependencies, usage)
- Execution paths or structural patterns identified
- Areas requiring deeper investigation
TRACING PRINCIPLES:
- Start with target identification, then explore systematically
- Follow actual code paths, not assumed behavior
- Document concrete evidence with file:line references
- Consider edge cases, error handling, and conditional logic
- Map both direct and indirect relationships
- Verify assumptions with code examination
STRUCTURED JSON OUTPUT FORMAT:
You MUST respond with a properly formatted JSON object following this exact schema.
Do NOT include any text before or after the JSON. The response must be valid JSON only.
IF MORE INFORMATION IS NEEDED:
If you lack critical information to proceed with tracing, you MUST only respond with:
{
"status": "files_required_to_continue",
"mandatory_instructions": "<your critical instructions for the agent>",
"files_needed": ["<file name here>", "<or some folder/>"]
}
FOR NORMAL TRACING RESPONSES:
{
"status": "tracing_in_progress",
"step_number": <current step number>,
"total_steps": <estimated total steps>,
"next_step_required": <true/false>,
"step_content": "<detailed description of current tracing investigation>",
"metadata": {
"trace_mode": "<precision or dependencies>",
"target_description": "<what is being traced and why>",
"step_history_length": <number of steps completed so far>
},
"tracing_status": {
"files_checked": <number of files examined>,
"relevant_files": <number of files directly relevant>,
"relevant_context": <number of methods/functions involved>,
"issues_found": 0,
"images_collected": <number of diagrams/visuals>,
"current_confidence": "<exploring/low/medium/high/complete>",
"step_history_length": <current step count>
},
"continuation_id": "<thread_id for conversation continuity>",
"tracing_complete": <true/false - set to true only on final step>,
"trace_summary": "<complete trace summary - only include when tracing_complete is true>",
"next_steps": "<guidance for the agent on next investigation actions>",
"output": {
"instructions": "<formatting instructions for final output>",
"format": "<precision_trace_analysis or dependencies_trace_analysis>",
"rendering_instructions": "<detailed formatting rules>",
"presentation_guidelines": "<how to present the complete trace>"
}
}
TRACING CONTENT GUIDELINES:
- step_content: Provide detailed analysis of current tracing investigation
- Include specific files examined, methods analyzed, and relationships discovered
- Reference exact line numbers and code snippets for evidence
- Document execution paths, call chains, or dependency relationships
- When completing tracing, provide comprehensive trace_summary
- next_steps: Always guide the agent on what to investigate next
TRACE PRESENTATION GUIDELINES:
When tracing is complete (tracing_complete: true), the agent should present the final trace with:
FOR PRECISION MODE:
- Vertical indented call flow diagrams with exact file:line references
- Branching and side effect tables with specific conditions
- Usage points with context descriptions
- Entry points with trigger scenarios
- Visual call chains using arrows and indentation
FOR DEPENDENCIES MODE:
- Bidirectional arrow flow diagrams showing incoming/outgoing dependencies
- Type relationship mappings (inheritance, composition, usage)
- Dependency tables with file:line references
- Visual connection diagrams with proper arrow directions
- Structural relationship analysis
IMPORTANT FORMATTING RULES:
- Use exact file paths and line numbers from actual codebase
- Adapt method naming to match project's programming language conventions
- Use proper indentation and visual alignment for call flows
- Show conditional execution with explicit condition descriptions
- Mark uncertain or ambiguous paths clearly
- Include comprehensive side effects categorization
Be systematic, thorough, and provide concrete evidence. Your tracing should be detailed enough that someone could follow the exact execution paths or understand the complete dependency structure.
"""
```
--------------------------------------------------------------------------------
/systemprompts/codereview_prompt.py:
--------------------------------------------------------------------------------
```python
"""
CodeReview tool system prompt
"""
CODEREVIEW_PROMPT = """
ROLE
You are an expert code reviewer, combining the deep architectural knowledge of a principal engineer with the
precision of a sophisticated static analysis tool. Your task is to review the user's code and deliver precise, actionable
feedback covering architecture, maintainability, performance, and implementation correctness.
CRITICAL GUIDING PRINCIPLES
- **User-Centric Analysis:** Align your review with the user's specific goals and constraints. Tailor your analysis to what matters for their use case.
- **Scoped & Actionable Feedback:** Focus strictly on the provided code. Offer concrete, actionable fixes for issues within it. Avoid suggesting architectural overhauls, technology migrations, or unrelated improvements.
- **Pragmatic Solutions:** Prioritize practical improvements. Do not suggest solutions that add unnecessary complexity or abstraction for hypothetical future problems.
- **DO NOT OVERSTEP**: Do not suggest wholesale changes, technology migrations, or improvements unrelated to the specific issues found. Remain grounded in
the immediate task of reviewing the provided code for quality, security, and correctness. Avoid suggesting major refactors, migrations, or unrelated "nice-to-haves."
CRITICAL LINE NUMBER INSTRUCTIONS
Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be included in any code you generate.
Always reference specific line numbers in your replies to locate exact positions. Include a very short code excerpt alongside each finding for clarity.
Never include "LINE│" markers in generated code snippets.
Your review approach:
1. First, understand the user's context, expectations, constraints, and objectives.
2. Identify issues in order of severity (Critical > High > Medium > Low).
3. Provide specific, actionable, and precise fixes with concise code snippets where helpful.
4. Evaluate security, performance, and maintainability as they relate to the user's goals.
5. Acknowledge well-implemented aspects to reinforce good practices.
6. Remain constructive and unambiguous—do not downplay serious flaws.
7. Especially look for high-level architectural and design issues:
- Over-engineering or unnecessary complexity.
- Potentially serious performance bottlenecks.
- Design patterns that could be simplified or decomposed.
- Areas where the architecture might not scale well.
- Missing abstractions that would make future extensions much harder.
- Ways to reduce overall complexity while retaining functionality.
8. Simultaneously, perform a static analysis for common low-level pitfalls:
- **Concurrency:** Race conditions, deadlocks, incorrect usage of async/await, thread-safety violations (e.g., UI updates on background threads).
- **Resource Management:** Memory leaks, unclosed file handles or network connections, retain cycles.
- **Error Handling:** Swallowed exceptions, overly broad `catch` blocks, incomplete error paths, returning `nil` instead of throwing errors where appropriate.
- **API Usage:** Use of deprecated or unsafe functions, incorrect parameter passing, off-by-one errors.
- **Security:** Potential injection flaws (SQL, command), insecure data storage, hardcoded secrets, improper handling of sensitive data.
- **Performance:** Inefficient loops, unnecessary object allocations in tight loops, blocking I/O on critical threads.
9. Where further investigation is required, be direct and suggest which specific code or related file needs to be reviewed.
10. Remember: Overengineering is an anti-pattern. Avoid suggesting solutions that introduce unnecessary abstraction or indirection in anticipation of complexity that does not yet exist and is not justified by the current scope.
SEVERITY DEFINITIONS
🔴 CRITICAL: Security flaws, defects that cause crashes, data loss, or undefined behavior (e.g., race conditions).
🟠 HIGH: Bugs, performance bottlenecks, or anti-patterns that significantly impair usability, scalability, or reliability.
🟡 MEDIUM: Maintainability concerns, code smells, test gaps, or non-idiomatic code that increases cognitive load.
🟢 LOW: Style nits, minor improvements, or opportunities for code clarification.
EVALUATION AREAS (apply as relevant to the project or code)
- **Security:** Authentication/authorization flaws, input validation (SQLi, XSS), cryptography, sensitive-data handling, hardcoded secrets.
- **Performance & Scalability:** Algorithmic complexity, resource leaks (memory, file handles), concurrency issues (race conditions, deadlocks), caching strategies, blocking I/O on critical threads.
- **Code Quality & Maintainability:** Readability, structure, idiomatic usage of the language, error handling patterns, documentation, modularity, separation of concerns.
- **Testing:** Unit/integration test coverage, handling of edge cases, reliability and determinism of the test suite.
- **Dependencies:** Version health, known vulnerabilities, maintenance burden, transitive dependencies.
- **Architecture:** Design patterns, modularity, data flow, state management.
- **Operations:** Logging, monitoring, configuration management, feature flagging.
OUTPUT FORMAT
For each issue use:
[SEVERITY] File:Line – Issue description
→ Fix: Specific solution (code example only if appropriate, and only as much as needed)
After listing all issues, add:
• **Overall Code Quality Summary:** (one short paragraph)
• **Top 3 Priority Fixes:** (quick bullets)
• **Positive Aspects:** (what was done well and should be retained)
STRUCTURED RESPONSES FOR SPECIAL CASES
To ensure predictable interactions, use the following JSON formats for specific scenarios. Your entire response in these cases must be the JSON object and nothing else.
1. IF MORE INFORMATION IS NEEDED
If you need additional context (e.g., related files, configuration, dependencies) to provide a complete and accurate review, you MUST respond ONLY with this JSON format (and nothing else). Do NOT ask for the same file you've been provided unless its content is missing or incomplete:
{
"status": "files_required_to_continue",
"mandatory_instructions": "<your critical instructions for the agent>",
"files_needed": ["[file name here]", "[or some folder/]"]
}
2. IF SCOPE TOO LARGE FOR FOCUSED REVIEW
If the codebase is too large or complex to review effectively in a single response, you MUST request the agent to provide smaller, more focused subsets for review. Respond ONLY with this JSON format (and nothing else):
{
"status": "focused_review_required",
"reason": "<brief explanation of why the scope is too large>",
"suggestion": "<e.g., 'Review authentication module (auth.py, login.py)' or 'Focus on data layer (models/)' or 'Review payment processing functionality'>"
}
"""
```
--------------------------------------------------------------------------------
/conf/dial_models.json:
--------------------------------------------------------------------------------
```json
{
"_README": {
"description": "Model metadata for the DIAL (Data & AI Layer) aggregation provider.",
"documentation": "https://github.com/BeehiveInnovations/zen-mcp-server/blob/main/docs/configuration.md",
"usage": "Models listed here are exposed through the DIAL provider. Aliases are case-insensitive.",
"field_notes": "Matches providers/shared/model_capabilities.py.",
"field_descriptions": {
"model_name": "The model identifier as exposed by DIAL (typically deployment name)",
"aliases": "Array of shorthand names users can type instead of the full model name",
"context_window": "Total number of tokens the model can process (input + output combined)",
"max_output_tokens": "Maximum number of tokens the model can generate in a single response",
"supports_extended_thinking": "Whether the model supports extended reasoning tokens",
"supports_json_mode": "Whether the model can guarantee valid JSON output",
"supports_function_calling": "Whether the model supports function/tool calling",
"supports_images": "Whether the model can process images/visual input",
"max_image_size_mb": "Maximum total size in MB for all images combined",
"supports_temperature": "Whether the model accepts the temperature parameter",
"temperature_constraint": "Temperature constraint hint: 'fixed', 'range', or 'discrete'",
"description": "Human-readable description of the model",
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode ordering"
}
},
"models": [
{
"model_name": "o3-2025-04-16",
"friendly_name": "DIAL (O3)",
"aliases": ["o3"],
"intelligence_score": 14,
"description": "OpenAI O3 via DIAL - Strong reasoning model",
"context_window": 200000,
"max_output_tokens": 100000,
"supports_extended_thinking": false,
"supports_function_calling": false,
"supports_json_mode": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed"
},
{
"model_name": "o4-mini-2025-04-16",
"friendly_name": "DIAL (O4-mini)",
"aliases": ["o4-mini"],
"intelligence_score": 11,
"description": "OpenAI O4-mini via DIAL - Fast reasoning model",
"context_window": 200000,
"max_output_tokens": 100000,
"supports_extended_thinking": false,
"supports_function_calling": false,
"supports_json_mode": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed"
},
{
"model_name": "anthropic.claude-sonnet-4.1-20250805-v1:0",
"friendly_name": "DIAL (Sonnet 4.1)",
"aliases": ["sonnet-4.1", "sonnet-4"],
"intelligence_score": 10,
"description": "Claude Sonnet 4.1 via DIAL - Balanced performance",
"context_window": 200000,
"max_output_tokens": 64000,
"supports_extended_thinking": false,
"supports_function_calling": false,
"supports_json_mode": true,
"supports_images": true,
"max_image_size_mb": 5.0,
"supports_temperature": true,
"temperature_constraint": "range"
},
{
"model_name": "anthropic.claude-sonnet-4.1-20250805-v1:0-with-thinking",
"friendly_name": "DIAL (Sonnet 4.1 Thinking)",
"aliases": ["sonnet-4.1-thinking", "sonnet-4-thinking"],
"intelligence_score": 11,
"description": "Claude Sonnet 4.1 with thinking mode via DIAL",
"context_window": 200000,
"max_output_tokens": 64000,
"supports_extended_thinking": true,
"supports_function_calling": false,
"supports_json_mode": true,
"supports_images": true,
"max_image_size_mb": 5.0,
"supports_temperature": true,
"temperature_constraint": "range"
},
{
"model_name": "anthropic.claude-opus-4.1-20250805-v1:0",
"friendly_name": "DIAL (Opus 4.1)",
"aliases": ["opus-4.1", "opus-4"],
"intelligence_score": 14,
"description": "Claude Opus 4.1 via DIAL - Most capable Claude model",
"context_window": 200000,
"max_output_tokens": 64000,
"supports_extended_thinking": false,
"supports_function_calling": false,
"supports_json_mode": true,
"supports_images": true,
"max_image_size_mb": 5.0,
"supports_temperature": true,
"temperature_constraint": "range"
},
{
"model_name": "anthropic.claude-opus-4.1-20250805-v1:0-with-thinking",
"friendly_name": "DIAL (Opus 4.1 Thinking)",
"aliases": ["opus-4.1-thinking", "opus-4-thinking"],
"intelligence_score": 15,
"description": "Claude Opus 4.1 with thinking mode via DIAL",
"context_window": 200000,
"max_output_tokens": 64000,
"supports_extended_thinking": true,
"supports_function_calling": false,
"supports_json_mode": true,
"supports_images": true,
"max_image_size_mb": 5.0,
"supports_temperature": true,
"temperature_constraint": "range"
},
{
"model_name": "gemini-2.5-pro-preview-03-25-google-search",
"friendly_name": "DIAL (Gemini 2.5 Pro Search)",
"aliases": ["gemini-2.5-pro-search"],
"intelligence_score": 17,
"description": "Gemini 2.5 Pro with Google Search via DIAL",
"context_window": 1000000,
"max_output_tokens": 65536,
"supports_extended_thinking": false,
"supports_function_calling": false,
"supports_json_mode": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": true,
"temperature_constraint": "range"
},
{
"model_name": "gemini-2.5-pro-preview-05-06",
"friendly_name": "DIAL (Gemini 2.5 Pro)",
"aliases": ["gemini-2.5-pro"],
"intelligence_score": 18,
"description": "Gemini 2.5 Pro via DIAL - Deep reasoning",
"context_window": 1000000,
"max_output_tokens": 65536,
"supports_extended_thinking": false,
"supports_function_calling": false,
"supports_json_mode": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": true,
"temperature_constraint": "range"
},
{
"model_name": "gemini-2.5-flash-preview-05-20",
"friendly_name": "DIAL (Gemini Flash 2.5)",
"aliases": ["gemini-2.5-flash"],
"intelligence_score": 10,
"description": "Gemini 2.5 Flash via DIAL - Ultra-fast",
"context_window": 1000000,
"max_output_tokens": 65536,
"supports_extended_thinking": false,
"supports_function_calling": false,
"supports_json_mode": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": true,
"temperature_constraint": "range"
}
]
}
```
--------------------------------------------------------------------------------
/docs/contributions.md:
--------------------------------------------------------------------------------
```markdown
# Contributing to Zen MCP Server
Thank you for your interest in contributing to Zen MCP Server! This guide will help you understand our development process, coding standards, and how to submit high-quality contributions.
## Getting Started
1. **Fork the repository** on GitHub
2. **Clone your fork** locally
3. **Set up the development environment**:
```bash
./run-server.sh
```
4. **Create a feature branch** from `main`:
```bash
git checkout -b feat/your-feature-name
```
## Development Process
### 1. Code Quality Standards
We maintain high code quality standards. **All contributions must pass our automated checks**.
#### Required Code Quality Checks
**Option 1 - Automated (Recommended):**
```bash
# Install pre-commit hooks (one-time setup)
pre-commit install
# Now linting runs automatically on every commit
# Includes: ruff (with auto-fix), black, isort
```
**Option 2 - Manual:**
```bash
# Run the comprehensive quality checks script
./code_quality_checks.sh
```
This script automatically runs:
- Ruff linting with auto-fix
- Black code formatting
- Import sorting with isort
- Complete unit test suite (361 tests)
- Verification that all checks pass 100%
**Manual commands** (if you prefer to run individually):
```bash
# Run all linting checks (MUST pass 100%)
ruff check .
black --check .
isort --check-only .
# Auto-fix issues if needed
ruff check . --fix
black .
isort .
# Run complete unit test suite (MUST pass 100%)
python -m pytest -xvs
# Run simulator tests for tool changes
python communication_simulator_test.py
```
**Important**:
- **Every single test must pass** - we have zero tolerance for failing tests in CI
- All linting must pass cleanly (ruff, black, isort)
- Import sorting must be correct
- Tests failing in GitHub Actions will result in PR rejection
### 2. Testing Requirements
#### When to Add Tests
1. **New features MUST include tests**:
- Add unit tests in `tests/` for new functions or classes
- Test both success and error cases
2. **Tool changes require simulator tests**:
- Add simulator tests in `simulator_tests/` for new or modified tools
- Use realistic prompts that demonstrate the feature
- Validate output through server logs
3. **Bug fixes require regression tests**:
- Add a test that would have caught the bug
- Ensure the bug cannot reoccur
#### Test Naming Conventions
- Unit tests: `test_<feature>_<scenario>.py`
- Simulator tests: `test_<tool>_<behavior>.py`
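As a minimal sketch of what these expectations could look like in practice (the module and helper names below are hypothetical, not part of this codebase):
```python
"""tests/test_alias_normalization.py - illustrative regression test only."""

import pytest


def normalize_model_alias(alias: str) -> str:
    """Hypothetical helper standing in for real project code."""
    return alias.strip().lower()


def test_normalize_model_alias_success():
    # Success case: aliases are trimmed and lower-cased.
    assert normalize_model_alias("  Flash ") == "flash"


def test_normalize_model_alias_rejects_none():
    # Error case: non-string input fails loudly instead of passing through silently.
    with pytest.raises(AttributeError):
        normalize_model_alias(None)
```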
### 3. Pull Request Process
#### PR Title Format
Your PR title MUST follow one of these formats:
**Version Bumping Prefixes** (trigger version bump):
- `feat: <description>` - New features (MINOR version bump)
- `fix: <description>` - Bug fixes (PATCH version bump)
- `breaking: <description>` or `BREAKING CHANGE: <description>` - Breaking changes (MAJOR version bump)
- `perf: <description>` - Performance improvements (PATCH version bump)
- `refactor: <description>` - Code refactoring (PATCH version bump)
**Non-Version Prefixes** (no version bump):
- `docs: <description>` - Documentation only
- `chore: <description>` - Maintenance tasks
- `test: <description>` - Test additions/changes
- `ci: <description>` - CI/CD changes
- `style: <description>` - Code style changes
#### PR Checklist
Use our [PR template](../.github/pull_request_template.md) and ensure:
- [ ] PR title follows the format guidelines above
- [ ] Activated venv and ran `./code_quality_checks.sh` (all checks passed 100%)
- [ ] Self-review completed
- [ ] Tests added for ALL changes
- [ ] Documentation updated as needed
- [ ] All unit tests passing
- [ ] Relevant simulator tests passing (if tool changes)
- [ ] Ready for review
### 4. Code Style Guidelines
#### Python Code Style
- Follow PEP 8 with Black formatting
- Use type hints for function parameters and returns
- Add docstrings to all public functions and classes
- Keep functions focused and under 50 lines when possible
- Use descriptive variable names
#### Example:
```python
def process_model_response(
response: ModelResponse,
max_tokens: Optional[int] = None
) -> ProcessedResult:
"""Process and validate model response.
Args:
response: Raw response from the model provider
max_tokens: Optional token limit for truncation
Returns:
ProcessedResult with validated and formatted content
Raises:
ValueError: If response is invalid or exceeds limits
"""
# Implementation here
```
#### Import Organization
Imports must be organized by isort into these groups:
1. Standard library imports
2. Third-party imports
3. Local application imports
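For example, a module header grouped the way isort expects might look like this (imports borrowed from modules that already exist in this repository, shown purely to illustrate the grouping):
```python
# Standard library imports
import json
import os

# Third-party imports
from pydantic import Field

# Local application imports
from config import TEMPERATURE_ANALYTICAL
from tools.shared.base_models import ToolRequest
```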
### 5. Specific Contribution Types
#### Adding a New Provider
See our detailed guide: [Adding a New Provider](./adding_providers.md)
#### Adding a New Tool
See our detailed guide: [Adding a New Tool](./adding_tools.md)
#### Modifying Existing Tools
1. Ensure backward compatibility unless explicitly breaking
2. Update all affected tests
3. Update documentation if behavior changes
4. Add simulator tests for new functionality
### 6. Documentation Standards
- Update README.md for user-facing changes
- Add docstrings to all new code
- Update relevant docs/ files
- Include examples for new features
- Keep documentation concise and clear
### 7. Commit Message Guidelines
Write clear, descriptive commit messages:
- First line: Brief summary (50 chars or less)
- Blank line
- Detailed explanation if needed
- Reference issues: "Fixes #123"
Example:
```
feat: Add retry logic to Gemini provider
Implements exponential backoff for transient errors
in Gemini API calls. Retries up to 2 times with
configurable delays.
Fixes #45
```
## Common Issues and Solutions
### Linting Failures
```bash
# Auto-fix most issues
ruff check . --fix
black .
isort .
```
### Test Failures
- Check test output for specific errors
- Run individual tests for debugging: `pytest tests/test_specific.py -xvs`
- Ensure server environment is set up for simulator tests
### Import Errors
- Verify virtual environment is activated
- Check all dependencies are installed: `pip install -r requirements.txt`
## Getting Help
- **Questions**: Open a GitHub issue with the "question" label
- **Bug Reports**: Use the bug report template
- **Feature Requests**: Use the feature request template
- **Discussions**: Use GitHub Discussions for general topics
## Code of Conduct
- Be respectful and inclusive
- Welcome newcomers and help them get started
- Focus on constructive feedback
- Assume good intentions
## Recognition
Contributors are recognized in:
- GitHub contributors page
- Release notes for significant contributions
- Special mentions for exceptional work
Thank you for contributing to Zen MCP Server! Your efforts help make this tool better for everyone.
```
--------------------------------------------------------------------------------
/tools/apilookup.py:
--------------------------------------------------------------------------------
```python
"""API lookup tool - quickly gather the latest API/SDK information."""
from __future__ import annotations
import json
from typing import TYPE_CHECKING, Any
from pydantic import Field
from config import TEMPERATURE_ANALYTICAL
from tools.shared.base_models import ToolRequest
from tools.simple.base import SimpleTool
if TYPE_CHECKING:
from tools.models import ToolModelCategory
LOOKUP_FIELD_DESCRIPTIONS = {
"prompt": "The API, SDK, library, framework, or technology you need current documentation, version info, breaking changes, or migration guidance for.",
}
class LookupRequest(ToolRequest):
prompt: str = Field(..., description=LOOKUP_FIELD_DESCRIPTIONS["prompt"])
LOOKUP_PROMPT = """
MANDATORY: You MUST perform this research in a SEPARATE SUB-TASK using your web search tool.
CRITICAL RULES - READ CAREFULLY:
- Launch your environment's dedicated web search capability (for example `websearch`, `web_search`, or whichever native
web-search tool you normally use to search online) to gather sources. Do NOT call this `apilookup` tool again
during the same lookup: it is ONLY an orchestration tool that guides you and has NO web search capability of its own.
- ALWAYS run the search from a separate sub-task/sub-process so the research happens outside this tool invocation.
- If the environment does not expose a web search tool, immediately report that limitation instead of invoking `apilookup` again.
MISSION:
Research the latest, most authoritative documentation for the requested API, SDK, library, framework, programming language feature, or tool to answer the user's question accurately using a SUB-AGENT in a separate process.
SEARCH STRATEGY (MAXIMUM 2-4 SEARCHES TOTAL FOR THIS MISSION - THEN STOP):
- IMPORTANT: Begin by determining today's date and current year
- MANDATORY FOR OS-TIED APIS/SDKs: If the request involves iOS, macOS, Windows, Linux, Android, watchOS, tvOS, or any OS-specific framework/API:
* FIRST perform a web search to determine "what is the latest [OS name] version [current year]"
* If the search is around a specific tool or an IDE, confirm the latest version "latest version [tool name]"
* DO NOT rely on your training data or knowledge cutoff for OS versions - you MUST search for current information
* ONLY AFTER confirming the current OS version, search for APIs/SDKs/frameworks for that specific version
* Example workflow: Search "latest iOS version [current year]" → Find current version → Then search "[current iOS version] SwiftUI glass effect button [current year]"
- MANDATORY FOR MAJOR FRAMEWORKS/LANGUAGES: For rapidly-evolving ecosystems, verify current stable version:
* Languages: Node.js, Python, Ruby, Rust, Go, Java, .NET/C#, PHP, Kotlin, Swift
* Web frameworks: React, Vue, Angular, Next.js, Nuxt, Svelte, SvelteKit, Remix, Astro, SolidJS
* Backend frameworks: Django, Flask, FastAPI, Rails, Laravel, Spring Boot, Express, NestJS, Axum
* Mobile: Flutter, React Native, Jetpack Compose, SwiftUI
* Build tools: Vite, Webpack, esbuild, Turbopack, Rollup
* Package managers: npm, pnpm, yarn, pip, cargo, go modules, maven, gradle
* Search pattern: "latest [framework/language/SDK] version [current year]" BEFORE searching for specific APIs
* ONLY consider articles, documentation, and resources dated within the current year or most recent release cycle
* Ignore or deprioritize results from previous years unless they are still the current official documentation
- ALWAYS find current official documentation, release notes, changelogs, migration guides, and authoritative blog posts. Newest APIs / SDKs released or updated in the current year trump older ones.
- Prioritize official sources: project documentation sites, GitHub repositories, package registries (npm, PyPI, crates.io, Maven Central, NuGet, RubyGems, Packagist, etc.), and official blogs
- Check version-specific documentation when relevant and add current year to ensure latest docs are retrieved (e.g., "React docs [current year]", "Python what's new [current year]", "TypeScript breaking changes [current year]", "Next.js app router [current year]")
- Look for recent Stack Overflow discussions, GitHub issues, RFC documents, or official discussion forums when official docs are incomplete
- Cross-reference multiple sources to validate syntax, method signatures, configuration options, and best practices
- Search for deprecation warnings, security advisories, or migration paths between major versions
- STOP IMMEDIATELY after 2-4 searches maximum - DO NOT continue exploring tangential topics, examples, tutorials, or supplementary material
- If latest, more current, authoritative information has been found: STOP looking further
- ALWAYS cite authoritative sources with links (official docs, changelogs, GitHub releases, package registry pages)
""".strip()
class LookupTool(SimpleTool):
"""Simple tool that wraps user queries with API lookup instructions."""
def get_name(self) -> str:
return "apilookup"
def get_description(self) -> str:
return (
"Use this tool automatically when you need current API/SDK documentation, latest version info, breaking changes, deprecations, migration guides, or official release notes. "
"This tool searches authoritative sources (official docs, GitHub, package registries) to ensure up-to-date accuracy."
)
def get_system_prompt(self) -> str:
return ""
def get_default_temperature(self) -> float:
return TEMPERATURE_ANALYTICAL
def requires_model(self) -> bool:
return False
def get_model_category(self) -> ToolModelCategory:
from tools.models import ToolModelCategory
return ToolModelCategory.FAST_RESPONSE
def get_request_model(self):
return LookupRequest
def get_tool_fields(self) -> dict[str, dict[str, Any]]:
return {
"prompt": {
"type": "string",
"description": LOOKUP_FIELD_DESCRIPTIONS["prompt"],
}
}
async def prepare_prompt(self, request) -> str: # pragma: no cover - not used
return ""
def get_input_schema(self) -> dict[str, Any]:
return {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": LOOKUP_FIELD_DESCRIPTIONS["prompt"],
},
},
"required": ["prompt"],
}
async def execute(self, arguments: dict[str, Any]) -> list:
from mcp.types import TextContent
request = self.get_request_model()(**arguments)
response = {
"status": "web_lookup_needed",
"instructions": LOOKUP_PROMPT,
"user_prompt": request.prompt,
}
return [TextContent(type="text", text=json.dumps(response, ensure_ascii=False, indent=2))]
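# Illustrative return shape from execute() above (text abbreviated; in practice the JSON is
# pretty-printed because json.dumps is called with indent=2):
#   [TextContent(type="text", text='{"status": "web_lookup_needed", "instructions": "...", "user_prompt": "..."}')]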
```
--------------------------------------------------------------------------------
/docs/tools/consensus.md:
--------------------------------------------------------------------------------
```markdown
# Consensus Tool - Multi-Model Perspective Gathering
**Get diverse expert opinions from multiple AI models on technical proposals and decisions**
The `consensus` tool orchestrates multiple AI models to provide diverse perspectives on your proposals, enabling structured decision-making through for/against analysis and multi-model expert opinions.
## Thinking Mode
**Default is `medium` (8,192 tokens).** Use `high` for complex architectural decisions or `max` for critical strategic choices requiring comprehensive analysis.
## Model Recommendation
Consensus tool uses extended reasoning models by default, making it ideal for complex decision-making scenarios that benefit from multiple perspectives and deep analysis.
## How It Works
The consensus tool orchestrates multiple AI models to provide diverse perspectives on your proposals:
1. **Assign stances**: Each model can take a specific viewpoint (supportive, critical, or neutral)
2. **Gather opinions**: Models analyze your proposal from their assigned perspective with built-in common-sense guardrails
3. **Synthesize results**: Claude combines all perspectives into a balanced recommendation
4. **Natural language**: Use simple descriptions like "supportive", "critical", or "against" - the tool handles synonyms automatically
## Watch In Action
The following is a hypothetical example designed to demonstrate how one consensus can be built upon another (via [continuation](../context-revival.md)). In this scenario, we start with a _blinded_ consensus, where one model is tasked with taking a **for** stance and another with an **against** stance. This approach allows us to see how each model evaluates a particular option relative to the alternative. We then conduct a second consensus — all initiated by a single prompt and orchestrated by Claude Code in this video — to gather each model’s final conclusions.
<div align="center">
[Zen Consensus Debate](https://github.com/user-attachments/assets/76a23dd5-887a-4382-9cf0-642f5cf6219e)
</div>
## Example Prompts
**For/Against Analysis:**
```
Use zen consensus with flash taking a supportive stance and pro being critical to evaluate whether
we should migrate from REST to GraphQL for our API
```
**Multi-Model Technical Decision:**
```
Get consensus from o3, flash, and pro on our new authentication architecture. Have o3 focus on
security implications, flash on implementation speed, and pro stay neutral for overall assessment
```
**Natural Language Stance Assignment:**
```
Use consensus tool with gemini being "for" the proposal and grok being "against" to debate
whether we should adopt microservices architecture
```
```
I want to work on module X and Y, unsure which is going to be more popular with users of my app.
Get a consensus from gemini supporting the idea for implementing X, grok opposing it, and flash staying neutral
```
## Key Features
- **Stance steering**: Assign specific perspectives (for/against/neutral) to each model with intelligent synonym handling
- **Custom stance prompts**: Provide specific instructions for how each model should approach the analysis
- **Ethical guardrails**: Models will refuse to support truly bad ideas regardless of assigned stance
- **Unknown stance handling**: Invalid stances automatically default to neutral with warning
- **Natural language support**: Use terms like "supportive", "critical", "oppose", "favor" - all handled intelligently
- **Sequential processing**: Reliable execution avoiding MCP protocol issues
- **Focus areas**: Specify particular aspects to emphasize (e.g., 'security', 'performance', 'user experience')
- **File context support**: Include relevant files for informed decision-making
- **Image support**: Analyze architectural diagrams, UI mockups, or design documents
- **Conversation continuation**: Build on previous consensus analysis with additional rounds
- **Web search capability**: Enhanced analysis with current best practices and documentation
## Tool Parameters
- `prompt`: Detailed description of the proposal or decision to analyze (required)
- `models`: List of model configurations with optional stance and custom instructions (required)
- `files`: Context files for informed analysis (absolute paths)
- `images`: Visual references like diagrams or mockups (absolute paths)
- `focus_areas`: Specific aspects to emphasize
- `temperature`: Control consistency (default: 0.2 for stable consensus)
- `thinking_mode`: Analysis depth (minimal/low/medium/high/max)
- `continuation_id`: Continue previous consensus discussions
## Model Configuration Examples
**Basic For/Against:**
```json
[
{"model": "flash", "stance": "for"},
{"model": "pro", "stance": "against"}
]
```
**Custom Stance Instructions:**
```json
[
{"model": "o3", "stance": "for", "stance_prompt": "Focus on implementation benefits and user value"},
{"model": "flash", "stance": "against", "stance_prompt": "Identify potential risks and technical challenges"}
]
```
**Neutral Analysis:**
```json
[
{"model": "pro", "stance": "neutral"},
{"model": "o3", "stance": "neutral"}
]
```
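Putting the pieces together, a complete consensus request could be shaped like the following sketch (parameter names come from the list above; values are illustrative, shown as a Python dictionary since the exact wire format depends on your MCP client):
```python
consensus_request = {
    "prompt": "Should we migrate our REST API to GraphQL?",
    "models": [
        {"model": "flash", "stance": "for"},
        {"model": "pro", "stance": "against"},
        {"model": "o3", "stance": "neutral"},
    ],
    "focus_areas": ["performance", "developer experience"],
    "temperature": 0.2,
}
```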
## Usage Examples
**Architecture Decision:**
```
"Get consensus from pro and o3 on whether to use microservices vs monolith for our e-commerce platform"
```
**Technology Migration:**
```
"Use consensus with flash supporting and pro opposing to evaluate migrating from MySQL to PostgreSQL"
```
**Feature Priority:**
```
"Get consensus from multiple models on whether to prioritize mobile app vs web dashboard development first"
```
**With Visual Context:**
```
"Use consensus to evaluate this new UI design mockup - have flash support it and pro be critical"
```
## Best Practices
- **Provide detailed context**: Include project constraints, requirements, and background
- **Use balanced stances**: Mix supportive and critical perspectives for thorough analysis
- **Specify focus areas**: Guide models to emphasize relevant aspects (security, performance, etc.)
- **Include relevant files**: Provide code, documentation, or specifications for context
- **Build on discussions**: Use continuation for follow-up analysis and refinement
- **Leverage visual context**: Include diagrams, mockups, or design documents when relevant
## Ethical Guardrails
The consensus tool includes built-in ethical safeguards:
- Models won't support genuinely harmful proposals regardless of assigned stance
- Unknown or invalid stances automatically default to neutral
- Warning messages for potentially problematic requests
- Focus on constructive technical decision-making
## When to Use Consensus vs Other Tools
- **Use `consensus`** for: Multi-perspective analysis, structured debates, major technical decisions
- **Use `chat`** for: Open-ended discussions and brainstorming
- **Use `thinkdeep`** for: Extending specific analysis with deeper reasoning
- **Use `analyze`** for: Understanding existing systems without debate
```
--------------------------------------------------------------------------------
/docs/tools/chat.md:
--------------------------------------------------------------------------------
```markdown
# Chat Tool - General Development Chat & Collaborative Thinking
**Your thinking partner - bounce ideas, get second opinions, brainstorm collaboratively**
The `chat` tool is your collaborative thinking partner for development conversations. It's designed to help you brainstorm, validate ideas, get second opinions, and explore alternatives in a conversational format.
## Thinking Mode
**Default is `medium` (8,192 tokens).** Use `low` for quick questions to save tokens, or `high` for complex discussions when thoroughness matters.
## Example Prompt
```
I need to pick between Redis and Memcached for session storage and I need an expert opinion for the project
I'm working on. Take a look at the code and get an idea of what this project does, pick one of the two options
and then chat with gemini pro and continue discussing pros and cons to come to a final conclusion. I need a one
word verdict in the end.
```
<div align="center">
[Chat Redis or Memcached_web.webm](https://github.com/user-attachments/assets/41076cfe-dd49-4dfc-82f5-d7461b34705d)
</div>
**Another Example**:
* We ask Claude Code to pick one of two frameworks, then `chat` with `gemini` to make a final decision
* Gemini responds, confirming the choice. We use `continuation` to ask another question within the same conversation thread
* Gemini responds with an explanation. We use continuation again, this time via the `/zen:continue (MCP)` command
<div align="center">
[Chat With Gemini_web.webm](https://github.com/user-attachments/assets/37bd57ca-e8a6-42f7-b5fb-11de271e95db)
</div>
## Key Features
- **Collaborative thinking partner** for your analysis and planning
- **Get second opinions** on your designs and approaches
- **Brainstorm solutions** and explore alternatives together
- **Structured code generation**: When using GPT-5 Pro or Gemini 2.5 Pro, get complete, production-ready implementations saved to `zen_generated.code` for your CLI to review and apply
- **Validate your checklists** and implementation plans
- **General development questions** and explanations
- **Technology comparisons** and best practices
- **Architecture and design discussions**
- **File reference support**: `"Use gemini to explain this algorithm with context from algorithm.py"`
- **Image support**: Include screenshots, diagrams, UI mockups for visual analysis: `"Chat with gemini about this error dialog screenshot to understand the user experience issue"`
- **Dynamic collaboration**: Models can request additional files or context during the conversation if needed for a more thorough response
- **Web search awareness**: Automatically identifies when online research would help and instructs Claude to perform targeted searches using continuation IDs
## Tool Parameters
- `prompt`: Your question or discussion topic (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
- `absolute_file_paths`: Optional absolute file or directory paths for additional context
- `images`: Optional images for visual context (absolute paths)
- `working_directory_absolute_path`: **Required** - Absolute path to an existing directory where generated code artifacts will be saved
- `temperature`: Response creativity (0-1, default 0.5)
- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
- `continuation_id`: Continue previous conversations
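For orientation, a single chat request using these parameters might look like this sketch (field names from the list above, values purely illustrative):
```python
chat_arguments = {
    "prompt": "Compare Redis and Memcached for session storage in this project",
    "model": "pro",
    "absolute_file_paths": ["/abs/path/to/session_store.py"],
    "working_directory_absolute_path": "/abs/path/to/project",
    "temperature": 0.5,
    "thinking_mode": "medium",
}
```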
## Structured Code Generation
When using advanced reasoning models like **GPT-5 Pro** or **Gemini 2.5 Pro**, the chat tool can generate complete, production-ready code implementations in a structured format.
### How It Works
1. You ask your AI agent to implement a complex new feature using `chat` with a higher-reasoning model such as **GPT-5 Pro** or **Gemini 2.5 Pro**
2. The model generates structured implementation and shares the complete implementation with Zen
3. Zen saves the code to `zen_generated.code` and asks AI agent to implement the plan
4. AI agent continues from the previous context, reads the file, applies the implementation
### When Code Generation Activates
The structured format activates for **substantial implementation work**:
- Creating new features from scratch with multiple files or significant code
- Major refactoring across multiple files or large sections
- Implementing new modules, components, or subsystems
- Large-scale updates affecting substantial portions of the codebase
- Complete rewrites of functions, algorithms, or approaches
For minor changes (small tweaks, bug fixes, algorithm improvements), the model responds normally with inline code blocks.
### Example Usage
```
chat with gpt-5-pro and ask it to make me a standalone, classic version of the
Pacman game using pygame that I can run from the commandline. Give me a single
script to execute in the end with any / all dependencies setup for me.
Do everything using pygame, we have no external resources / images / audio at
hand. Instead of ghosts, it'll be different geometric shapes moving around
in the maze that Pacman can eat (so there are no baddies). Pacman gets to eat
everything including bread-crumbs and large geometric shapes but make me the
classic maze / walls that it navigates within using keyboard arrow keys.
```
See the [Configuration Guide](../configuration.md#code-generation-capability) for details on the `allow_code_generation` flag.
## Usage Examples
**Basic Development Chat:**
```
"Chat with zen about the best approach for user authentication in my React app"
```
**Technology Comparison:**
```
"Use flash to discuss whether PostgreSQL or MongoDB would be better for my e-commerce platform"
```
**Architecture Discussion:**
```
"Chat with pro about microservices vs monolith architecture for my project, consider scalability and team size"
```
**File Context Analysis:**
```
"Use gemini to chat about the current authentication implementation in auth.py and suggest improvements"
```
**Visual Analysis:**
```
"Chat with gemini about this UI mockup screenshot - is the user flow intuitive?"
```
## Best Practices
- **Be specific about context**: Include relevant files or describe your project scope
- **Ask for trade-offs**: Request pros/cons for better decision-making
- **Use conversation continuation**: Build on previous discussions with `continuation_id`
- **Leverage visual context**: Include diagrams, mockups, or screenshots when discussing UI/UX
- **Encourage research**: When you suspect documentation has changed, explicitly ask the assistant to confirm by requesting a web search
## When to Use Chat vs Other Tools
- **Use `chat`** for: Open-ended discussions, brainstorming, getting second opinions, technology comparisons
- **Use `thinkdeep`** for: Extending specific analysis, challenging assumptions, deeper reasoning
- **Use `analyze`** for: Understanding existing code structure and patterns
- **Use `debug`** for: Specific error diagnosis and troubleshooting
```
--------------------------------------------------------------------------------
/tests/test_docker_healthcheck.py:
--------------------------------------------------------------------------------
```python
"""
Tests for Docker health check functionality
"""
import os
import subprocess
from pathlib import Path
from unittest.mock import patch
import pytest
class TestDockerHealthCheck:
"""Test Docker health check implementation"""
@pytest.fixture(autouse=True)
def setup(self):
"""Setup for each test"""
self.project_root = Path(__file__).parent.parent
self.healthcheck_script = self.project_root / "docker" / "scripts" / "healthcheck.py"
def test_healthcheck_script_exists(self):
"""Test that health check script exists"""
assert self.healthcheck_script.exists(), "healthcheck.py must exist"
def test_healthcheck_script_executable(self):
"""Test that health check script is executable"""
if not self.healthcheck_script.exists():
pytest.skip("healthcheck.py not found")
# Check if script has Python shebang
content = self.healthcheck_script.read_text()
assert content.startswith("#!/usr/bin/env python"), "Health check script must have Python shebang"
@patch("subprocess.run")
def test_process_check_success(self, mock_run):
"""Test successful process check"""
# Mock successful pgrep command
mock_run.return_value.returncode = 0
mock_run.return_value.stdout = "12345\n"
# Import and test the function (if we can access it)
# This would require the healthcheck module to be importable
result = subprocess.run(["pgrep", "-f", "server.py"], capture_output=True, text=True, timeout=10)
assert result.returncode == 0
@patch("subprocess.run")
def test_process_check_failure(self, mock_run):
"""Test failed process check"""
# Mock failed pgrep command
mock_run.return_value.returncode = 1
mock_run.return_value.stderr = "No such process"
result = subprocess.run(["pgrep", "-f", "server.py"], capture_output=True, text=True, timeout=10)
assert result.returncode == 1
def test_critical_modules_import(self):
"""Test that critical modules can be imported"""
critical_modules = ["json", "os", "sys", "pathlib"]
for module_name in critical_modules:
try:
__import__(module_name)
except ImportError:
pytest.fail(f"Critical module {module_name} cannot be imported")
def test_optional_modules_graceful_failure(self):
"""Test graceful handling of optional module import failures"""
optional_modules = ["mcp", "google.genai", "openai"]
for module_name in optional_modules:
try:
__import__(module_name)
except ImportError:
# This is expected in test environment
pass
def test_log_directory_check(self):
"""Test log directory health check logic"""
# Test with existing directory
test_dir = self.project_root / "logs"
if test_dir.exists():
assert os.access(test_dir, os.W_OK), "Logs directory must be writable"
def test_health_check_timeout_handling(self):
"""Test that health checks handle timeouts properly"""
timeout_duration = 10
# Mock a command that would timeout
with patch("subprocess.run") as mock_run:
mock_run.side_effect = subprocess.TimeoutExpired(["test"], timeout_duration)
with pytest.raises(subprocess.TimeoutExpired):
subprocess.run(["sleep", "20"], capture_output=True, text=True, timeout=timeout_duration)
def test_health_check_docker_configuration(self):
"""Test health check configuration in Docker setup"""
compose_file = self.project_root / "docker-compose.yml"
if compose_file.exists():
content = compose_file.read_text()
# Check for health check configuration
assert "healthcheck:" in content, "Health check must be configured"
assert "healthcheck.py" in content, "Health check script must be referenced"
assert "interval:" in content, "Health check interval must be set"
assert "timeout:" in content, "Health check timeout must be set"
class TestDockerHealthCheckIntegration:
"""Integration tests for Docker health checks"""
def test_dockerfile_health_check_setup(self):
"""Test that Dockerfile includes health check setup"""
project_root = Path(__file__).parent.parent
dockerfile = project_root / "Dockerfile"
if dockerfile.exists():
content = dockerfile.read_text()
# Check that health check script is copied
script_copied = ("COPY" in content and "healthcheck.py" in content) or "COPY . ." in content
assert script_copied, "Health check script must be copied to container"
def test_health_check_failure_scenarios(self):
"""Test various health check failure scenarios"""
failure_scenarios = [
{"type": "process_not_found", "expected": False},
{"type": "import_error", "expected": False},
{"type": "permission_error", "expected": False},
{"type": "timeout_error", "expected": False},
]
for scenario in failure_scenarios:
# Each scenario should result in health check failure
assert scenario["expected"] is False
def test_health_check_recovery(self):
"""Test health check recovery after transient failures"""
# Test that health checks can recover from temporary issues
recovery_scenarios = [
{"initial_state": "failing", "final_state": "healthy"},
{"initial_state": "timeout", "final_state": "healthy"},
]
for scenario in recovery_scenarios:
assert scenario["final_state"] == "healthy"
@patch.dict(os.environ, {}, clear=True)
def test_health_check_with_missing_env_vars(self):
"""Test health check behavior with missing environment variables"""
# Health check should still work even without API keys
# (it tests system health, not API connectivity)
required_vars = ["GEMINI_API_KEY", "OPENAI_API_KEY", "XAI_API_KEY"]
# Verify no API keys are set
for var in required_vars:
assert os.getenv(var) is None
def test_health_check_performance(self):
"""Test that health checks complete within reasonable time"""
# Health checks should be fast to avoid impacting container startup
max_execution_time = 30 # seconds
# Mock a health check execution
import time
start_time = time.time()
# Simulate health check operations
time.sleep(0.1) # Simulate actual work
execution_time = time.time() - start_time
assert (
execution_time < max_execution_time
), f"Health check took {execution_time}s, should be < {max_execution_time}s"
```
--------------------------------------------------------------------------------
/run_integration_tests.ps1:
--------------------------------------------------------------------------------
```
<#
.SYNOPSIS
Integration test runner script for the Zen MCP server on Windows.
.DESCRIPTION
This PowerShell script prepares and runs integration tests for the Zen MCP server:
- Sets up the test environment
- Installs required dependencies
- Runs automated integration tests
- Displays test results and related logs
- Allows output customization via parameters (e.g., display color)
.PARAMETER WithSimulator
Also runs the simulator tests after the integration tests complete.
.PARAMETER VerboseOutput
Enables verbose output from the test runners.
.EXAMPLE
.\run_integration_tests.ps1
Prepares the environment and runs all integration tests.
.\run_integration_tests.ps1 -WithSimulator
Runs the integration tests followed by the simulator tests.
.NOTES
Project Author : BeehiveInnovations
Script Author : GiGiDKR (https://github.com/GiGiDKR)
Date : 07-05-2025
Version : See config.py (__version__)
References : https://github.com/BeehiveInnovations/zen-mcp-server
#>
#Requires -Version 5.1
[CmdletBinding()]
param(
[switch]$WithSimulator,
[switch]$VerboseOutput
)
# Set error action preference
$ErrorActionPreference = "Stop"
# Colors for output
function Write-ColorText {
param(
[Parameter(Mandatory)]
[string]$Text,
[string]$Color = "White",
[switch]$NoNewline
)
if ($NoNewline) {
Write-Host $Text -ForegroundColor $Color -NoNewline
} else {
Write-Host $Text -ForegroundColor $Color
}
}
function Write-Emoji {
param(
[Parameter(Mandatory)]
[string]$Emoji,
[Parameter(Mandatory)]
[string]$Text,
[string]$Color = "White"
)
Write-Host "$Emoji " -NoNewline
Write-ColorText $Text -Color $Color
}
Write-Emoji "🧪" "Running Integration Tests for Zen MCP Server" -Color Cyan
Write-ColorText "==============================================" -Color Cyan
Write-ColorText "These tests use real API calls with your configured keys"
Write-Host ""
# Check for virtual environment
$venvPath = ".zen_venv"
$activateScript = if ($IsWindows -or $env:OS -eq "Windows_NT") {
"$venvPath\Scripts\Activate.ps1"
} else {
"$venvPath/bin/activate"
}
if (Test-Path $venvPath) {
Write-Emoji "✅" "Virtual environment found" -Color Green
# Activate virtual environment (for PowerShell on Windows)
if ($IsWindows -or $env:OS -eq "Windows_NT") {
if (Test-Path "$venvPath\Scripts\Activate.ps1") {
& "$venvPath\Scripts\Activate.ps1"
} elseif (Test-Path "$venvPath\Scripts\activate.bat") {
# Use Python directly from venv
$env:PATH = "$PWD\$venvPath\Scripts;$env:PATH"
}
}
} else {
Write-Emoji "❌" "No virtual environment found!" -Color Red
Write-ColorText "Please run: .\run-server.ps1 first" -Color Yellow
exit 1
}
# Check for .env file
if (!(Test-Path ".env")) {
Write-Emoji "⚠️" "Warning: No .env file found. Integration tests may fail without API keys." -Color Yellow
Write-Host ""
}
Write-Emoji "🔑" "Checking API key availability:" -Color Cyan
Write-ColorText "---------------------------------" -Color Cyan
# Function to check if API key is configured
function Test-ApiKey {
param(
[string]$KeyName
)
# Check environment variable
$envValue = [Environment]::GetEnvironmentVariable($KeyName)
if (![string]::IsNullOrWhiteSpace($envValue)) {
return $true
}
# Check .env file
if (Test-Path ".env") {
$envContent = Get-Content ".env" -ErrorAction SilentlyContinue
$found = $envContent | Where-Object { $_ -match "^$KeyName\s*=" -and $_ -notmatch "^$KeyName\s*=\s*$" }
return $found.Count -gt 0
}
return $false
}
# Check API keys
$apiKeys = @(
"GEMINI_API_KEY",
"OPENAI_API_KEY",
"XAI_API_KEY",
"OPENROUTER_API_KEY",
"CUSTOM_API_URL"
)
foreach ($key in $apiKeys) {
if (Test-ApiKey $key) {
if ($key -eq "CUSTOM_API_URL") {
Write-Emoji "✅" "$key configured (local models)" -Color Green
} else {
Write-Emoji "✅" "$key configured" -Color Green
}
} else {
Write-Emoji "❌" "$key not found" -Color Red
}
}
Write-Host ""
# Load environment variables from .env if it exists
if (Test-Path ".env") {
Get-Content ".env" | ForEach-Object {
if ($_ -match '^([^#][^=]*?)=(.*)$') {
$name = $matches[1].Trim()
$value = $matches[2].Trim()
# Remove quotes if present
$value = $value -replace '^["'']|["'']$', ''
[Environment]::SetEnvironmentVariable($name, $value, "Process")
}
}
}
# Run integration tests
Write-Emoji "🏃" "Running integration tests..." -Color Cyan
Write-ColorText "------------------------------" -Color Cyan
try {
# Build pytest command
$pytestArgs = @("tests/", "-v", "-m", "integration", "--tb=short")
if ($VerboseOutput) {
$pytestArgs += "--verbose"
}
# Run pytest
python -m pytest @pytestArgs
if ($LASTEXITCODE -ne 0) {
throw "Integration tests failed"
}
Write-Host ""
Write-Emoji "✅" "Integration tests completed!" -Color Green
} catch {
Write-Host ""
Write-Emoji "❌" "Integration tests failed!" -Color Red
Write-ColorText "Error: $_" -Color Red
exit 1
}
# Run simulator tests if requested
if ($WithSimulator) {
Write-Host ""
Write-Emoji "🤖" "Running simulator tests..." -Color Cyan
Write-ColorText "----------------------------" -Color Cyan
try {
if ($VerboseOutput) {
python communication_simulator_test.py --verbose
} else {
python communication_simulator_test.py
}
if ($LASTEXITCODE -ne 0) {
Write-Host ""
Write-Emoji "❌" "Simulator tests failed!" -Color Red
Write-ColorText "This may be due to a known issue in communication_simulator_test.py" -Color Yellow
Write-ColorText "Integration tests completed successfully - you can proceed." -Color Green
} else {
Write-Host ""
Write-Emoji "✅" "Simulator tests completed!" -Color Green
}
} catch {
Write-Host ""
Write-Emoji "❌" "Simulator tests failed!" -Color Red
Write-ColorText "Error: $_" -Color Red
Write-ColorText "This may be due to a known issue in communication_simulator_test.py" -Color Yellow
Write-ColorText "Integration tests completed successfully - you can proceed." -Color Green
}
}
Write-Host ""
Write-Emoji "💡" "Tips:" -Color Yellow
Write-ColorText "- Run '.\run_integration_tests.ps1' for integration tests only" -Color White
Write-ColorText "- Run '.\run_integration_tests.ps1 -WithSimulator' to also run simulator tests" -Color White
Write-ColorText "- Run '.\code_quality_checks.ps1' for unit tests and linting" -Color White
Write-ColorText "- Check logs in logs\mcp_server.log if tests fail" -Color White
```
--------------------------------------------------------------------------------
/simulator_tests/test_content_validation.py:
--------------------------------------------------------------------------------
```python
#!/usr/bin/env python3
"""
Content Validation Test
Tests that tools don't duplicate file content in their responses.
This test is specifically designed to catch content duplication bugs.
"""
import os
from .base_test import BaseSimulatorTest
class ContentValidationTest(BaseSimulatorTest):
"""Test that tools don't duplicate file content in their responses"""
@property
def test_name(self) -> str:
return "content_validation"
@property
def test_description(self) -> str:
return "Content validation and duplicate detection"
def run_test(self) -> bool:
"""Test that file processing system properly handles file deduplication"""
try:
self.logger.info("📄 Test: Content validation and file processing deduplication")
# Setup test files first
self.setup_test_files()
# Create a test file for validation
validation_content = '''"""
Configuration file for content validation testing
"""
# Configuration constants
MAX_CONTENT_TOKENS = 800_000
TEMPERATURE_ANALYTICAL = 0.2
UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345"
# Database settings
DATABASE_CONFIG = {
"host": "localhost",
"port": 5432,
"name": "validation_test_db"
}
'''
validation_file = os.path.join(self.test_dir, "validation_config.py")
with open(validation_file, "w") as f:
f.write(validation_content)
# Ensure absolute path for MCP server compatibility
validation_file = os.path.abspath(validation_file)
# Get timestamp for log filtering
import datetime
start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
# Test 1: Initial tool call with validation file
self.logger.info(" 1: Testing initial tool call with file")
# Call chat tool with the validation file
response1, thread_id = self.call_mcp_tool(
"chat",
{
"prompt": "Analyze this configuration file briefly",
"absolute_file_paths": [validation_file],
"model": "flash",
},
)
if not response1:
self.logger.error(" ❌ Initial tool call failed")
return False
self.logger.info(" ✅ Initial tool call completed")
# Test 2: Continuation with same file (should be deduplicated)
self.logger.info(" 2: Testing continuation with same file")
if thread_id:
response2, _ = self.call_mcp_tool(
"chat",
{
"prompt": "Continue analyzing this configuration file",
"absolute_file_paths": [validation_file], # Same file should be deduplicated
"continuation_id": thread_id,
"model": "flash",
},
)
if response2:
self.logger.info(" ✅ Continuation with same file completed")
else:
self.logger.warning(" ⚠️ Continuation failed")
# Test 3: Different tool with same file (new conversation)
self.logger.info(" 3: Testing different tool with same file")
response3, _ = self.call_mcp_tool(
"codereview",
{
"step": "Review this configuration file for quality and potential issues",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Starting code review of configuration file",
"relevant_files": [validation_file],
"model": "flash",
},
)
if response3:
self.logger.info(" ✅ Different tool with same file completed")
else:
self.logger.warning(" ⚠️ Different tool failed")
# Validate file processing behavior from server logs
self.logger.info(" 4: Validating file processing logs")
logs = self.get_server_logs_since(start_time)
# Check for proper file embedding logs
embedding_logs = [
line
for line in logs.split("\n")
if "[FILE_PROCESSING]" in line or "embedding" in line.lower() or "[FILES]" in line
]
# Check for deduplication evidence
deduplication_logs = [
line
for line in logs.split("\n")
if ("skipping" in line.lower() and "already in conversation" in line.lower())
or "No new files to embed" in line
]
# Check for file processing patterns
new_file_logs = [
line
for line in logs.split("\n")
if "will embed new files" in line or "New conversation" in line or "[FILE_PROCESSING]" in line
]
# Validation criteria
validation_file_mentioned = any("validation_config.py" in line for line in logs.split("\n"))
embedding_found = len(embedding_logs) > 0
            deduplication_ok = len(deduplication_logs) > 0 or len(new_file_logs) >= 2  # Should see new conversation patterns
            self.logger.info(f"   Deduplication or new-conversation evidence: {deduplication_ok}")
self.logger.info(f" Embedding logs found: {len(embedding_logs)}")
self.logger.info(f" Deduplication evidence: {len(deduplication_logs)}")
self.logger.info(f" New conversation patterns: {len(new_file_logs)}")
self.logger.info(f" Validation file mentioned: {validation_file_mentioned}")
# Log sample evidence for debugging
if self.verbose and embedding_logs:
self.logger.debug(" 📋 Sample embedding logs:")
for log in embedding_logs[:5]:
self.logger.debug(f" {log}")
# Success criteria
success_criteria = [
("Embedding logs found", embedding_found),
("File processing evidence", validation_file_mentioned),
("Multiple tool calls", len(new_file_logs) >= 2),
]
passed_criteria = sum(1 for _, passed in success_criteria if passed)
self.logger.info(f" Success criteria met: {passed_criteria}/{len(success_criteria)}")
# Cleanup
os.remove(validation_file)
if passed_criteria >= 2: # At least 2 out of 3 criteria
self.logger.info(" ✅ File processing validation passed")
return True
else:
self.logger.error(" ❌ File processing validation failed")
return False
except Exception as e:
self.logger.error(f"Content validation test failed: {e}")
return False
finally:
self.cleanup_test_files()
```
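The validation above boils down to scanning server logs for a few marker strings and counting how many success criteria pass. A minimal standalone sketch of that pattern follows; the sample log lines and marker strings are illustrative assumptions, not the server's exact log format.
```python
# Sketch of the log-scanning / criteria-counting pattern used in the test above.
# Sample log lines are invented for illustration only.
sample_logs = """\
[FILE_PROCESSING] New conversation, will embed new files
[FILE_PROCESSING] embedding validation_config.py
Skipping validation_config.py - already in conversation history
""".splitlines()

embedding_found = any("[FILE_PROCESSING]" in line or "embedding" in line.lower() for line in sample_logs)
dedup_found = any("already in conversation" in line.lower() for line in sample_logs)
file_mentioned = any("validation_config.py" in line for line in sample_logs)

success_criteria = [
    ("Embedding logs found", embedding_found),
    ("Deduplication evidence", dedup_found),
    ("Validation file mentioned", file_mentioned),
]
passed = sum(1 for _, ok in success_criteria if ok)
print(f"Success criteria met: {passed}/{len(success_criteria)}")  # -> 3/3 for these samples
```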
--------------------------------------------------------------------------------
/providers/shared/temperature.py:
--------------------------------------------------------------------------------
```python
"""Helper types for validating model temperature parameters."""
from abc import ABC, abstractmethod
from typing import Optional
__all__ = [
"TemperatureConstraint",
"FixedTemperatureConstraint",
"RangeTemperatureConstraint",
"DiscreteTemperatureConstraint",
]
# Common heuristics for determining temperature support when explicit
# capabilities are unavailable (e.g., custom/local models).
_TEMP_UNSUPPORTED_PATTERNS = {
"o1",
"o3",
"o4", # OpenAI O-series reasoning models
"deepseek-reasoner",
"deepseek-r1",
"r1", # DeepSeek reasoner variants
}
_TEMP_UNSUPPORTED_KEYWORDS = {
"reasoner", # Catch additional DeepSeek-style naming patterns
}
class TemperatureConstraint(ABC):
"""Contract for temperature validation used by `ModelCapabilities`.
Concrete providers describe their temperature behaviour by creating
subclasses that expose three operations:
* `validate` – decide whether a requested temperature is acceptable.
* `get_corrected_value` – coerce out-of-range values into a safe default.
* `get_description` – provide a human readable error message for users.
Providers call these hooks before sending traffic to the underlying API so
that unsupported temperatures never reach the remote service.
"""
@abstractmethod
def validate(self, temperature: float) -> bool:
"""Return ``True`` when the temperature may be sent to the backend."""
@abstractmethod
def get_corrected_value(self, temperature: float) -> float:
"""Return a valid substitute for an out-of-range temperature."""
@abstractmethod
def get_description(self) -> str:
"""Describe the acceptable range to include in error messages."""
@abstractmethod
def get_default(self) -> float:
"""Return the default temperature for the model."""
@staticmethod
def infer_support(model_name: str) -> tuple[bool, str]:
"""Heuristically determine whether a model supports temperature."""
model_lower = model_name.lower()
for pattern in _TEMP_UNSUPPORTED_PATTERNS:
conditions = (
pattern == model_lower,
model_lower.startswith(f"{pattern}-"),
model_lower.startswith(f"openai/{pattern}"),
model_lower.startswith(f"deepseek/{pattern}"),
model_lower.endswith(f"-{pattern}"),
f"/{pattern}" in model_lower,
f"-{pattern}-" in model_lower,
)
if any(conditions):
return False, f"detected pattern '{pattern}'"
for keyword in _TEMP_UNSUPPORTED_KEYWORDS:
if keyword in model_lower:
return False, f"detected keyword '{keyword}'"
return True, "default assumption for models without explicit metadata"
@staticmethod
def resolve_settings(
model_name: str,
constraint_hint: Optional[str] = None,
) -> tuple[bool, "TemperatureConstraint", str]:
"""Derive temperature support and constraint for a model.
Args:
model_name: Canonical model identifier or alias.
constraint_hint: Optional configuration hint (``"fixed"``,
``"range"``, ``"discrete"``). When provided, the hint is
honoured directly.
Returns:
Tuple ``(supports_temperature, constraint, diagnosis)`` describing
whether temperature may be tuned, the constraint object that should
be attached to :class:`ModelCapabilities`, and the reasoning behind
the decision.
"""
if constraint_hint:
constraint = TemperatureConstraint.create(constraint_hint)
supports_temperature = constraint_hint != "fixed"
reason = f"constraint hint '{constraint_hint}'"
return supports_temperature, constraint, reason
supports_temperature, reason = TemperatureConstraint.infer_support(model_name)
if supports_temperature:
constraint: TemperatureConstraint = RangeTemperatureConstraint(0.0, 2.0, 0.7)
else:
constraint = FixedTemperatureConstraint(1.0)
return supports_temperature, constraint, reason
@staticmethod
def create(constraint_type: str) -> "TemperatureConstraint":
"""Factory that yields the appropriate constraint for a configuration hint."""
if constraint_type == "fixed":
# Fixed temperature models (O3/O4) only support temperature=1.0
return FixedTemperatureConstraint(1.0)
if constraint_type == "discrete":
# For models with specific allowed values - using common OpenAI values as default
return DiscreteTemperatureConstraint([0.0, 0.3, 0.7, 1.0, 1.5, 2.0], 0.3)
# Default range constraint (for "range" or None)
return RangeTemperatureConstraint(0.0, 2.0, 0.3)
class FixedTemperatureConstraint(TemperatureConstraint):
"""Constraint for models that enforce an exact temperature (for example O3)."""
def __init__(self, value: float):
self.value = value
def validate(self, temperature: float) -> bool:
return abs(temperature - self.value) < 1e-6 # Handle floating point precision
def get_corrected_value(self, temperature: float) -> float:
return self.value
def get_description(self) -> str:
return f"Only supports temperature={self.value}"
def get_default(self) -> float:
return self.value
class RangeTemperatureConstraint(TemperatureConstraint):
"""Constraint for providers that expose a continuous min/max temperature range."""
def __init__(self, min_temp: float, max_temp: float, default: Optional[float] = None):
self.min_temp = min_temp
self.max_temp = max_temp
self.default_temp = default or (min_temp + max_temp) / 2
def validate(self, temperature: float) -> bool:
return self.min_temp <= temperature <= self.max_temp
def get_corrected_value(self, temperature: float) -> float:
return max(self.min_temp, min(self.max_temp, temperature))
def get_description(self) -> str:
return f"Supports temperature range [{self.min_temp}, {self.max_temp}]"
def get_default(self) -> float:
return self.default_temp
class DiscreteTemperatureConstraint(TemperatureConstraint):
"""Constraint for models that permit a discrete list of temperature values."""
def __init__(self, allowed_values: list[float], default: Optional[float] = None):
self.allowed_values = sorted(allowed_values)
self.default_temp = default or allowed_values[len(allowed_values) // 2]
def validate(self, temperature: float) -> bool:
return any(abs(temperature - val) < 1e-6 for val in self.allowed_values)
def get_corrected_value(self, temperature: float) -> float:
return min(self.allowed_values, key=lambda x: abs(x - temperature))
def get_description(self) -> str:
return f"Supports temperatures: {self.allowed_values}"
def get_default(self) -> float:
return self.default_temp
```
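A short usage sketch for the constraint helpers above. The import path follows the file location shown (`providers/shared/temperature.py`), and the model names are examples only.
```python
# Sketch: resolving and applying a temperature constraint for a model name.
from providers.shared.temperature import TemperatureConstraint

# "o3-mini" matches the "o3" pattern, so temperature tuning is reported as unsupported.
supports, constraint, reason = TemperatureConstraint.resolve_settings("o3-mini")
print(supports, reason)              # False / "detected pattern 'o3'"
print(constraint.get_description())  # "Only supports temperature=1.0"

requested = 0.4
if not constraint.validate(requested):
    requested = constraint.get_corrected_value(requested)  # coerced to 1.0

# Names with no matching pattern fall back to a 0.0-2.0 range with default 0.7.
supports, constraint, reason = TemperatureConstraint.resolve_settings("gemini-2.5-flash")
print(supports, constraint.get_default())  # True / 0.7
```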
--------------------------------------------------------------------------------
/tests/test_chat_openai_integration.py:
--------------------------------------------------------------------------------
```python
"""Integration test for ChatTool auto-mode using OpenAI o3/gpt models with cassette recording."""
from __future__ import annotations
import json
import os
import uuid
from pathlib import Path
import pytest
from providers.registry import ModelProviderRegistry
from providers.shared import ProviderType
from tests.transport_helpers import inject_transport
from tools.chat import ChatTool
# Directory for recorded HTTP interactions
CASSETTE_DIR = Path(__file__).parent / "openai_cassettes"
CASSETTE_DIR.mkdir(exist_ok=True)
CASSETTE_PATH = CASSETTE_DIR / "chat_gpt5_moon_distance.json"
CASSETTE_CONTINUATION_PATH = CASSETTE_DIR / "chat_gpt5_continuation.json"
@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_chat_auto_mode_with_openai(monkeypatch, tmp_path):
"""Ensure ChatTool in auto mode selects gpt-5 via OpenAI and returns a valid response."""
# Prepare environment so only OpenAI is available in auto mode
env_updates = {
"DEFAULT_MODEL": "auto",
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""),
}
# Remove Gemini/XAI keys to force OpenAI selection
keys_to_clear = ["GEMINI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY"]
with monkeypatch.context() as m:
m.setenv("DEFAULT_MODEL", env_updates["DEFAULT_MODEL"])
m.setenv("OPENAI_ALLOWED_MODELS", "gpt-5")
if env_updates["OPENAI_API_KEY"]:
m.setenv("OPENAI_API_KEY", env_updates["OPENAI_API_KEY"])
for key in keys_to_clear:
m.delenv(key, raising=False)
# Choose recording or replay mode based on cassette presence
if not CASSETTE_PATH.exists():
real_key = os.getenv("OPENAI_API_KEY", "").strip()
if not real_key or real_key.startswith("dummy"):
pytest.skip(
"Cassette missing and OPENAI_API_KEY not configured. Provide a real key and re-run to record."
)
else:
# Replay mode uses dummy key to keep secrets out of the cassette
m.setenv("OPENAI_API_KEY", "dummy-key-for-replay")
# Reset registry and register only OpenAI provider
ModelProviderRegistry.reset_for_testing()
from providers.openai import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
# Inject HTTP transport (records or replays depending on cassette state)
inject_transport(monkeypatch, CASSETTE_PATH)
# Execute ChatTool request targeting gpt-5 directly (server normally resolves auto→model)
chat_tool = ChatTool()
working_directory = str(tmp_path)
arguments = {
"prompt": "Use chat with gpt5 and ask how far the moon is from earth.",
"model": "gpt-5",
"temperature": 1.0,
"working_directory_absolute_path": working_directory,
}
result = await chat_tool.execute(arguments)
# Validate response
assert result and result[0].type == "text"
response_data = json.loads(result[0].text)
assert response_data["status"] in {"success", "continuation_available"}
metadata = response_data.get("metadata", {})
assert metadata.get("provider_used") == "openai"
assert metadata.get("model_used") in {"gpt-5", "gpt5"}
assert "moon" in response_data["content"].lower()
# Ensure cassette recorded for future replays
assert CASSETTE_PATH.exists()
@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_chat_openai_continuation(monkeypatch, tmp_path):
"""Verify continuation_id workflow against gpt-5 using recorded OpenAI responses."""
env_updates = {
"DEFAULT_MODEL": "auto",
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""),
}
keys_to_clear = ["GEMINI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY"]
recording_mode = not CASSETTE_CONTINUATION_PATH.exists()
if recording_mode:
real_key = env_updates["OPENAI_API_KEY"].strip()
if not real_key or real_key.startswith("dummy"):
pytest.skip("Continuation cassette missing and OPENAI_API_KEY not configured. Set a real key to record.")
fixed_thread_id = uuid.UUID("95d60035-1aa3-4398-9936-fca71989d906")
with monkeypatch.context() as m:
m.setenv("DEFAULT_MODEL", env_updates["DEFAULT_MODEL"])
m.setenv("OPENAI_ALLOWED_MODELS", "gpt-5")
if recording_mode:
m.setenv("OPENAI_API_KEY", env_updates["OPENAI_API_KEY"])
else:
m.setenv("OPENAI_API_KEY", "dummy-key-for-replay")
for key in keys_to_clear:
m.delenv(key, raising=False)
ModelProviderRegistry.reset_for_testing()
from providers.openai import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
inject_transport(monkeypatch, CASSETTE_CONTINUATION_PATH)
from utils import conversation_memory
m.setattr(conversation_memory.uuid, "uuid4", lambda: fixed_thread_id)
chat_tool = ChatTool()
working_directory = str(tmp_path)
# First message: obtain continuation_id
first_args = {
"prompt": "In one word, which sells better: iOS app or macOS app?",
"model": "gpt-5",
"temperature": 1.0,
"working_directory_absolute_path": working_directory,
}
first_result = await chat_tool.execute(first_args)
assert first_result and first_result[0].type == "text"
first_data = json.loads(first_result[0].text)
assert first_data["status"] == "continuation_available"
first_metadata = first_data.get("metadata", {})
assert first_metadata.get("provider_used") == "openai"
assert first_metadata.get("model_used") in {"gpt-5", "gpt5"}
continuation = first_data.get("continuation_offer")
assert continuation is not None
continuation_id = continuation.get("continuation_id")
assert continuation_id
# Second message using continuation_id (reuse same tool instance for clarity)
second_args = {
"prompt": "In one word then, SwiftUI or ReactNative?",
"model": "gpt-5",
"continuation_id": continuation_id,
"temperature": 1.0,
"working_directory_absolute_path": working_directory,
}
second_result = await chat_tool.execute(second_args)
assert second_result and second_result[0].type == "text"
second_data = json.loads(second_result[0].text)
assert second_data["status"] in {"success", "continuation_available"}
second_metadata = second_data.get("metadata", {})
assert second_metadata.get("provider_used") == "openai"
assert second_metadata.get("model_used") in {"gpt-5", "gpt5"}
assert second_metadata.get("conversation_ready") is True
assert second_data.get("continuation_offer") is not None
# Ensure the cassette file exists for future replays
assert CASSETTE_CONTINUATION_PATH.exists()
# Clean up registry state for subsequent tests
ModelProviderRegistry.reset_for_testing()
```
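Both tests above share the same record-or-replay decision: if the cassette file is missing, a real `OPENAI_API_KEY` is required to record; otherwise a dummy key is injected and the transport replays the cassette. The following is a hedged sketch of that skeleton for a hypothetical additional cassette; the cassette name, test name, prompt, and assertions are assumptions for illustration, while the helpers (`inject_transport`, `ModelProviderRegistry`, `ChatTool`) are the same ones imported in the test module above.
```python
# Hypothetical extra cassette test mirroring the record-or-replay skeleton above.
import json
import os
from pathlib import Path

import pytest

from providers.registry import ModelProviderRegistry
from providers.shared import ProviderType
from tests.transport_helpers import inject_transport
from tools.chat import ChatTool

CASSETTE_DIR = Path(__file__).parent / "openai_cassettes"
CASSETTE_DIR.mkdir(exist_ok=True)
CASSETTE_NEW_PATH = CASSETTE_DIR / "chat_gpt5_new_scenario.json"  # hypothetical cassette name


@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_chat_gpt5_new_scenario(monkeypatch, tmp_path):
    with monkeypatch.context() as m:
        m.setenv("DEFAULT_MODEL", "auto")
        m.setenv("OPENAI_ALLOWED_MODELS", "gpt-5")
        for key in ("GEMINI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY"):
            m.delenv(key, raising=False)

        if not CASSETTE_NEW_PATH.exists():
            # Record mode: require a real key, otherwise skip.
            real_key = os.getenv("OPENAI_API_KEY", "").strip()
            if not real_key or real_key.startswith("dummy"):
                pytest.skip("Cassette missing and OPENAI_API_KEY not configured; set a real key to record.")
        else:
            # Replay mode: keep secrets out of the cassette.
            m.setenv("OPENAI_API_KEY", "dummy-key-for-replay")

        ModelProviderRegistry.reset_for_testing()
        from providers.openai import OpenAIModelProvider

        ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
        inject_transport(monkeypatch, CASSETTE_NEW_PATH)

        result = await ChatTool().execute(
            {
                "prompt": "In one sentence, why record HTTP cassettes for tests?",
                "model": "gpt-5",
                "working_directory_absolute_path": str(tmp_path),
            }
        )
        assert result and result[0].type == "text"
        assert json.loads(result[0].text)["status"] in {"success", "continuation_available"}
```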
--------------------------------------------------------------------------------
/tests/test_uvx_support.py:
--------------------------------------------------------------------------------
```python
"""
Test cases for uvx support and environment handling.
"""
import os
import sys
import tempfile
from pathlib import Path
from unittest import mock
import pytest
class TestUvxEnvironmentHandling:
"""Test uvx-specific environment handling features."""
def test_dotenv_import_success(self):
"""Test that dotenv is imported successfully when available."""
# Mock successful dotenv import
mock_load = mock.MagicMock()
mock_values = mock.MagicMock(return_value={})
fake_dotenv = mock.MagicMock(load_dotenv=mock_load, dotenv_values=mock_values)
with mock.patch.dict("sys.modules", {"dotenv": fake_dotenv}):
if "utils.env" in sys.modules:
del sys.modules["utils.env"]
if "server" in sys.modules:
del sys.modules["server"]
import importlib
import utils.env as env_config
with tempfile.NamedTemporaryFile("w", delete=False) as tmp_env:
temp_env_path = Path(tmp_env.name)
tmp_env.write("ZEN_MCP_FORCE_ENV_OVERRIDE=false\n")
try:
importlib.reload(env_config)
env_config._ENV_PATH = temp_env_path
env_config.reload_env()
import server # noqa: F401
assert mock_load.call_count >= 1
_, kwargs = mock_load.call_args
assert "dotenv_path" in kwargs
finally:
temp_env_path.unlink(missing_ok=True)
def test_dotenv_import_failure_graceful_handling(self):
"""Test that ImportError for dotenv is handled gracefully (uvx scenario)."""
# Mock only the dotenv import to fail
original_import = __builtins__["__import__"]
def mock_import(name, *args, **kwargs):
if name == "dotenv":
raise ImportError("No module named 'dotenv'")
return original_import(name, *args, **kwargs)
with mock.patch("builtins.__import__", side_effect=mock_import):
# This should not raise an exception when trying to import dotenv
try:
from dotenv import load_dotenv # noqa: F401
pytest.fail("Should have raised ImportError for dotenv")
except ImportError:
# Expected behavior - ImportError should be caught gracefully in server.py
pass
def test_env_file_path_resolution(self):
"""Test that .env file path is correctly resolved relative to server.py."""
import server
# Test that the server module correctly resolves .env path
script_dir = Path(server.__file__).parent
expected_env_file = script_dir / ".env"
# The logic should create a path relative to server.py
assert expected_env_file.name == ".env"
assert expected_env_file.parent == script_dir
def test_environment_variables_still_work_without_dotenv(self):
"""Test that environment variables work even when dotenv is not available."""
# Set a test environment variable
test_key = "TEST_ZEN_MCP_VAR"
test_value = "test_value_123"
with mock.patch.dict(os.environ, {test_key: test_value}):
# Environment variable should still be accessible regardless of dotenv
assert os.getenv(test_key) == test_value
def test_dotenv_graceful_fallback_behavior(self):
"""Test the actual graceful fallback behavior in server module."""
# Test that server module handles missing dotenv gracefully
# This is tested by the fact that the server can be imported even if dotenv fails
import server
# If we can import server, the graceful handling works
assert hasattr(server, "run")
# Test that environment variables still work
test_key = "TEST_FALLBACK_VAR"
test_value = "fallback_test_123"
with mock.patch.dict(os.environ, {test_key: test_value}):
assert os.getenv(test_key) == test_value
class TestUvxProjectConfiguration:
"""Test uvx-specific project configuration features."""
def test_pyproject_toml_has_required_uvx_fields(self):
"""Test that pyproject.toml has all required fields for uvx support."""
try:
import tomllib
except ImportError:
# tomllib is only available in Python 3.11+
# For older versions, use tomli or skip the test
try:
import tomli as tomllib
except ImportError:
pytest.skip("tomllib/tomli not available for TOML parsing")
pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
assert pyproject_path.exists(), "pyproject.toml should exist"
with open(pyproject_path, "rb") as f:
pyproject_data = tomllib.load(f)
# Check required uvx fields
assert "project" in pyproject_data
project = pyproject_data["project"]
# Essential fields for uvx
assert "name" in project
assert project["name"] == "zen-mcp-server"
assert "dependencies" in project
assert "requires-python" in project
# Script entry point for uvx
assert "scripts" in project
assert "zen-mcp-server" in project["scripts"]
assert project["scripts"]["zen-mcp-server"] == "server:run"
def test_pyproject_dependencies_match_requirements(self):
"""Test that pyproject.toml dependencies align with requirements.txt."""
try:
import tomllib
except ImportError:
# tomllib is only available in Python 3.11+
try:
import tomli as tomllib
except ImportError:
pytest.skip("tomllib/tomli not available for TOML parsing")
# Read pyproject.toml
pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
with open(pyproject_path, "rb") as f:
pyproject_data = tomllib.load(f)
pyproject_deps = set(pyproject_data["project"]["dependencies"])
# Read requirements.txt
requirements_path = Path(__file__).parent.parent / "requirements.txt"
if requirements_path.exists():
# Note: We primarily validate pyproject.toml has core dependencies
# requirements.txt might have additional dev dependencies
# Core dependencies should be present in both
core_packages = {"mcp", "openai", "google-genai", "pydantic", "python-dotenv"}
for pkg in core_packages:
pyproject_has = any(pkg in dep for dep in pyproject_deps)
assert pyproject_has, f"{pkg} should be in pyproject.toml dependencies"
# requirements.txt might have additional dev dependencies
def test_uvx_entry_point_callable(self):
"""Test that the uvx entry point (server:run) is callable."""
import server
# The entry point should reference a callable function
assert hasattr(server, "run"), "server module should have a 'run' function"
assert callable(server.run), "server.run should be callable"
```
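The pyproject checks above rely on a `tomllib`/`tomli` fallback so they run on interpreters older than Python 3.11 as well. A minimal reusable sketch of that fallback; the helper name is illustrative.
```python
# Minimal sketch of the TOML-loading fallback used in the tests above.
# tomllib ships with Python 3.11+; on older interpreters the tomli backport
# is assumed to be installed (otherwise callers should skip, as the tests do).
from pathlib import Path

try:
    import tomllib  # Python 3.11+
except ImportError:  # pragma: no cover - pre-3.11 interpreters
    import tomli as tomllib


def load_pyproject(path: Path) -> dict:
    """Parse a pyproject.toml file into a plain dict (helper name is illustrative)."""
    with open(path, "rb") as f:
        return tomllib.load(f)


# Example, run from the repository root:
data = load_pyproject(Path("pyproject.toml"))
print(data["project"]["scripts"]["zen-mcp-server"])  # "server:run" per the tests above
```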
--------------------------------------------------------------------------------
/simulator_tests/test_consensus_three_models.py:
--------------------------------------------------------------------------------
```python
"""
Test consensus tool with three models demonstrating sequential processing
"""
import json
from .base_test import BaseSimulatorTest
class TestConsensusThreeModels(BaseSimulatorTest):
"""Test consensus tool functionality with three models (testing sequential processing)"""
@property
def test_name(self) -> str:
return "consensus_three_models"
@property
def test_description(self) -> str:
return "Test consensus tool with three models using flash:against, flash:for, local-llama:neutral"
def run_test(self) -> bool:
"""Run three-model consensus test"""
try:
self.logger.info("Testing consensus tool with three models: flash:against, flash:for, local-llama:neutral")
# Send request with three objects using new workflow parameters
response, continuation_id = self.call_mcp_tool(
"consensus",
{
"step": "Is a sync manager class a good idea for my CoolTodos app?",
"step_number": 1,
"total_steps": 3, # 3 models = 3 steps
"next_step_required": True,
"findings": "Initial analysis needed on sync manager class architecture decision for CoolTodos app",
"models": [
{
"model": "flash",
"stance": "against",
"stance_prompt": "You are a software architecture critic. Focus on the potential downsides of adding a sync manager class: complexity overhead, maintenance burden, potential for over-engineering, and whether simpler alternatives exist. Consider if this adds unnecessary abstraction layers.",
},
{
"model": "flash",
"stance": "for",
"stance_prompt": "You are a software architecture advocate. Focus on the benefits of a sync manager class: separation of concerns, testability, maintainability, and how it can improve the overall architecture. Consider scalability and code organization advantages.",
},
{
"model": "local-llama",
"stance": "neutral",
"stance_prompt": "You are a pragmatic software engineer. Provide a balanced analysis considering both the benefits and drawbacks. Focus on the specific context of a CoolTodos app and what factors would determine if this is the right choice.",
},
],
"model": "flash", # Default model for Claude's execution
},
)
# Validate response
if not response:
self.logger.error("Failed to get response from three-model consensus tool")
return False
self.logger.info(f"Three-model consensus response preview: {response[:500]}...")
# Parse the JSON response
try:
consensus_data = json.loads(response)
except json.JSONDecodeError:
self.logger.error(f"Failed to parse three-model consensus response as JSON: {response}")
return False
# Validate consensus structure
if "status" not in consensus_data:
self.logger.error("Missing 'status' field in three-model consensus response")
return False
# Check for step 1 status (Claude analysis + first model consultation)
expected_status = "analysis_and_first_model_consulted"
if consensus_data["status"] != expected_status:
self.logger.error(
f"Three-model consensus step 1 failed with status: {consensus_data['status']}, expected: {expected_status}"
)
# Log additional error details for debugging
if "error" in consensus_data:
self.logger.error(f"Error message: {consensus_data['error']}")
if "models_errored" in consensus_data:
self.logger.error(f"Models that errored: {consensus_data['models_errored']}")
if "models_skipped" in consensus_data:
self.logger.error(f"Models skipped: {consensus_data['models_skipped']}")
if "next_steps" in consensus_data:
self.logger.error(f"Suggested next steps: {consensus_data['next_steps']}")
return False
# Check that we have model response from step 1
model_response = consensus_data.get("model_response")
if not model_response:
self.logger.error("Three-model consensus step 1 response missing model_response")
return False
# Check that model response has expected structure
if not model_response.get("model") or not model_response.get("verdict"):
self.logger.error("Model response missing required fields (model or verdict)")
return False
# Check step information
if consensus_data.get("step_number") != 1:
self.logger.error(f"Expected step_number 1, got: {consensus_data.get('step_number')}")
return False
if not consensus_data.get("next_step_required"):
self.logger.error("Expected next_step_required=True for step 1")
return False
self.logger.info(f"Consensus step 1 consulted model: {model_response.get('model')}")
self.logger.info(f"Model stance: {model_response.get('stance', 'neutral')}")
self.logger.info(f"Response status: {model_response.get('status', 'unknown')}")
# Check metadata contains model name
metadata = consensus_data.get("metadata", {})
if not metadata.get("model_name"):
self.logger.error("Missing model_name in metadata")
return False
self.logger.info(f"Model name in metadata: {metadata.get('model_name')}")
# Verify we have analysis from Claude
agent_analysis = consensus_data.get("agent_analysis")
if not agent_analysis:
self.logger.error("Missing Claude's analysis in step 1")
return False
analysis_text = agent_analysis.get("initial_analysis", "")
self.logger.info(f"Claude analysis length: {len(analysis_text)} characters")
self.logger.info("✓ Three-model consensus tool test completed successfully")
self.logger.info(f"✓ Step 1 completed with model: {model_response.get('model')}")
self.logger.info(f"✓ Analysis provided: {len(analysis_text)} characters")
self.logger.info(f"✓ Model metadata properly included: {metadata.get('model_name')}")
self.logger.info("✓ Ready for step 2 continuation")
return True
except Exception as e:
self.logger.error(f"Three-model consensus test failed with exception: {str(e)}")
return False
```
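The step-1 payload above pairs one consensus step with each configured model/stance. A small sketch of how that request shape can be derived from the stance list; field names mirror the `call_mcp_tool` payload above, and the shortened stance prompts are for illustration only.
```python
# Sketch: deriving the step-1 consensus request from a stance configuration.
models = [
    {"model": "flash", "stance": "against", "stance_prompt": "Focus on downsides and simpler alternatives."},
    {"model": "flash", "stance": "for", "stance_prompt": "Focus on separation of concerns and testability."},
    {"model": "local-llama", "stance": "neutral", "stance_prompt": "Give a balanced, context-specific analysis."},
]

step_one_args = {
    "step": "Is a sync manager class a good idea for my CoolTodos app?",
    "step_number": 1,
    "total_steps": len(models),  # one consensus step per configured model
    "next_step_required": True,
    "findings": "Initial analysis of the sync manager architecture decision",
    "models": models,
    "model": "flash",  # default model for the agent's own execution
}
```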
--------------------------------------------------------------------------------
/tests/test_tracer.py:
--------------------------------------------------------------------------------
```python
"""
Tests for the tracer tool functionality
"""
import pytest
from tools.models import ToolModelCategory
from tools.tracer import TracerRequest, TracerTool
class TestTracerTool:
"""Test suite for the Tracer tool"""
@pytest.fixture
def tracer_tool(self):
"""Create a tracer tool instance for testing"""
return TracerTool()
def test_get_name(self, tracer_tool):
"""Test that the tool returns the correct name"""
assert tracer_tool.get_name() == "tracer"
def test_get_description(self, tracer_tool):
"""Test that the tool returns a comprehensive description"""
description = tracer_tool.get_description()
assert "code tracing" in description
assert "precision" in description
assert "dependencies" in description
assert "systematic" in description
def test_get_input_schema(self, tracer_tool):
"""Test that the input schema includes required fields"""
schema = tracer_tool.get_input_schema()
assert schema["type"] == "object"
assert "target_description" in schema["properties"]
assert "trace_mode" in schema["properties"]
assert "step" in schema["properties"]
assert "step_number" in schema["properties"]
# Check trace_mode enum values
trace_enum = schema["properties"]["trace_mode"]["enum"]
assert "precision" in trace_enum
assert "dependencies" in trace_enum
# Check required fields include workflow fields
required_fields = set(schema["required"])
assert "target_description" in required_fields
assert "trace_mode" in required_fields
def test_get_model_category(self, tracer_tool):
"""Test that the tracer tool uses EXTENDED_REASONING category"""
category = tracer_tool.get_model_category()
assert category == ToolModelCategory.EXTENDED_REASONING
def test_request_model_validation(self, tracer_tool):
"""Test TracerRequest model validation"""
# Valid request
request = TracerRequest(
step="Analyze BookingManager finalizeInvoice method execution flow",
step_number=1,
total_steps=3,
next_step_required=True,
findings="Initial investigation of booking finalization process",
target_description="BookingManager finalizeInvoice method",
trace_mode="precision",
)
assert request.target_description == "BookingManager finalizeInvoice method"
assert request.trace_mode == "precision"
assert request.step_number == 1
# Test invalid trace_mode
with pytest.raises(ValueError):
TracerRequest(
step="Test step",
step_number=1,
total_steps=1,
next_step_required=False,
findings="Test findings",
trace_mode="invalid_mode",
)
def test_get_required_actions(self, tracer_tool):
"""Test that required actions are provided for each step"""
# Step 1 - initial investigation (in ask mode by default)
actions = tracer_tool.get_required_actions(1, "exploring", "Initial findings", 3)
assert len(actions) > 0
# Default is ask mode, so should ask for mode selection
if tracer_tool.get_trace_mode() == "ask":
assert any("ask user" in action.lower() for action in actions)
assert any("precision mode" in action.lower() for action in actions)
# Test with initialized trace_config for non-ask mode
tracer_tool.trace_config = {"trace_mode": "precision"}
actions = tracer_tool.get_required_actions(1, "exploring", "Initial findings", 3)
assert len(actions) > 0
assert any("search" in action.lower() for action in actions)
assert any("locate" in action.lower() for action in actions)
# Later steps with low confidence
actions = tracer_tool.get_required_actions(2, "low", "Some findings", 3)
assert len(actions) > 0
assert any("trace" in action.lower() for action in actions)
# High confidence steps
actions = tracer_tool.get_required_actions(3, "high", "Strong findings", 3)
assert len(actions) > 0
assert any("verify" in action.lower() for action in actions)
def test_workflow_tool_characteristics(self, tracer_tool):
"""Test that tracer has proper workflow tool characteristics"""
# Should not require external expert analysis
assert not tracer_tool.requires_expert_analysis()
# Should return TracerRequest as the workflow model
assert tracer_tool.get_workflow_request_model() == TracerRequest
# Should not require AI model at MCP boundary
assert not tracer_tool.requires_model()
def test_get_rendering_instructions_precision(self, tracer_tool):
"""Test rendering instructions for precision mode"""
instructions = tracer_tool._get_rendering_instructions("precision")
assert "PRECISION TRACE" in instructions
assert "CALL FLOW DIAGRAM" in instructions
assert "ADDITIONAL ANALYSIS VIEWS" in instructions
assert "ClassName::MethodName" in instructions
assert "↓" in instructions
def test_get_rendering_instructions_dependencies(self, tracer_tool):
"""Test rendering instructions for dependencies mode"""
instructions = tracer_tool._get_rendering_instructions("dependencies")
assert "DEPENDENCIES TRACE" in instructions
assert "DEPENDENCY FLOW DIAGRAM" in instructions
assert "DEPENDENCY TABLE" in instructions
assert "INCOMING DEPENDENCIES" in instructions
assert "OUTGOING DEPENDENCIES" in instructions
assert "←" in instructions
assert "→" in instructions
def test_rendering_instructions_consistency(self, tracer_tool):
"""Test that rendering instructions are consistent between modes"""
precision_instructions = tracer_tool._get_precision_rendering_instructions()
dependencies_instructions = tracer_tool._get_dependencies_rendering_instructions()
# Both should have mandatory instructions
assert "MANDATORY RENDERING INSTRUCTIONS" in precision_instructions
assert "MANDATORY RENDERING INSTRUCTIONS" in dependencies_instructions
# Both should have specific styling requirements
assert "ONLY" in precision_instructions
assert "ONLY" in dependencies_instructions
# Both should have absolute requirements
assert "ABSOLUTE REQUIREMENTS" in precision_instructions
assert "ABSOLUTE REQUIREMENTS" in dependencies_instructions
def test_mode_selection_guidance(self, tracer_tool):
"""Test that the schema provides clear guidance on when to use each mode"""
schema = tracer_tool.get_input_schema()
trace_mode_desc = schema["properties"]["trace_mode"]["description"]
# Should clearly indicate precision is for methods/functions
assert "execution flow" in trace_mode_desc
# Should clearly indicate dependencies is for structural relationships
assert "structural relationships" in trace_mode_desc
```