This is page 6 of 19. Use http://codebase.md/beehiveinnovations/gemini-mcp-server?lines=false&page={x} to view the full context.
# Directory Structure
```
├── .claude
│ ├── commands
│ │ └── fix-github-issue.md
│ └── settings.json
├── .coveragerc
├── .dockerignore
├── .env.example
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── ISSUE_TEMPLATE
│ │ ├── bug_report.yml
│ │ ├── config.yml
│ │ ├── documentation.yml
│ │ ├── feature_request.yml
│ │ └── tool_addition.yml
│ ├── pull_request_template.md
│ └── workflows
│ ├── docker-pr.yml
│ ├── docker-release.yml
│ ├── semantic-pr.yml
│ ├── semantic-release.yml
│ └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── AGENTS.md
├── CHANGELOG.md
├── claude_config_example.json
├── CLAUDE.md
├── clink
│ ├── __init__.py
│ ├── agents
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── claude.py
│ │ ├── codex.py
│ │ └── gemini.py
│ ├── constants.py
│ ├── models.py
│ ├── parsers
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── claude.py
│ │ ├── codex.py
│ │ └── gemini.py
│ └── registry.py
├── code_quality_checks.ps1
├── code_quality_checks.sh
├── communication_simulator_test.py
├── conf
│ ├── __init__.py
│ ├── azure_models.json
│ ├── cli_clients
│ │ ├── claude.json
│ │ ├── codex.json
│ │ └── gemini.json
│ ├── custom_models.json
│ ├── dial_models.json
│ ├── gemini_models.json
│ ├── openai_models.json
│ ├── openrouter_models.json
│ └── xai_models.json
├── config.py
├── docker
│ ├── README.md
│ └── scripts
│ ├── build.ps1
│ ├── build.sh
│ ├── deploy.ps1
│ ├── deploy.sh
│ └── healthcheck.py
├── docker-compose.yml
├── Dockerfile
├── docs
│ ├── adding_providers.md
│ ├── adding_tools.md
│ ├── advanced-usage.md
│ ├── ai_banter.md
│ ├── ai-collaboration.md
│ ├── azure_openai.md
│ ├── configuration.md
│ ├── context-revival.md
│ ├── contributions.md
│ ├── custom_models.md
│ ├── docker-deployment.md
│ ├── gemini-setup.md
│ ├── getting-started.md
│ ├── index.md
│ ├── locale-configuration.md
│ ├── logging.md
│ ├── model_ranking.md
│ ├── testing.md
│ ├── tools
│ │ ├── analyze.md
│ │ ├── apilookup.md
│ │ ├── challenge.md
│ │ ├── chat.md
│ │ ├── clink.md
│ │ ├── codereview.md
│ │ ├── consensus.md
│ │ ├── debug.md
│ │ ├── docgen.md
│ │ ├── listmodels.md
│ │ ├── planner.md
│ │ ├── precommit.md
│ │ ├── refactor.md
│ │ ├── secaudit.md
│ │ ├── testgen.md
│ │ ├── thinkdeep.md
│ │ ├── tracer.md
│ │ └── version.md
│ ├── troubleshooting.md
│ ├── vcr-testing.md
│ └── wsl-setup.md
├── examples
│ ├── claude_config_macos.json
│ └── claude_config_wsl.json
├── LICENSE
├── providers
│ ├── __init__.py
│ ├── azure_openai.py
│ ├── base.py
│ ├── custom.py
│ ├── dial.py
│ ├── gemini.py
│ ├── openai_compatible.py
│ ├── openai.py
│ ├── openrouter.py
│ ├── registries
│ │ ├── __init__.py
│ │ ├── azure.py
│ │ ├── base.py
│ │ ├── custom.py
│ │ ├── dial.py
│ │ ├── gemini.py
│ │ ├── openai.py
│ │ ├── openrouter.py
│ │ └── xai.py
│ ├── registry_provider_mixin.py
│ ├── registry.py
│ ├── shared
│ │ ├── __init__.py
│ │ ├── model_capabilities.py
│ │ ├── model_response.py
│ │ ├── provider_type.py
│ │ └── temperature.py
│ └── xai.py
├── pyproject.toml
├── pytest.ini
├── README.md
├── requirements-dev.txt
├── requirements.txt
├── run_integration_tests.ps1
├── run_integration_tests.sh
├── run-server.ps1
├── run-server.sh
├── scripts
│ └── sync_version.py
├── server.py
├── simulator_tests
│ ├── __init__.py
│ ├── base_test.py
│ ├── conversation_base_test.py
│ ├── log_utils.py
│ ├── test_analyze_validation.py
│ ├── test_basic_conversation.py
│ ├── test_chat_simple_validation.py
│ ├── test_codereview_validation.py
│ ├── test_consensus_conversation.py
│ ├── test_consensus_three_models.py
│ ├── test_consensus_workflow_accurate.py
│ ├── test_content_validation.py
│ ├── test_conversation_chain_validation.py
│ ├── test_cross_tool_comprehensive.py
│ ├── test_cross_tool_continuation.py
│ ├── test_debug_certain_confidence.py
│ ├── test_debug_validation.py
│ ├── test_line_number_validation.py
│ ├── test_logs_validation.py
│ ├── test_model_thinking_config.py
│ ├── test_o3_model_selection.py
│ ├── test_o3_pro_expensive.py
│ ├── test_ollama_custom_url.py
│ ├── test_openrouter_fallback.py
│ ├── test_openrouter_models.py
│ ├── test_per_tool_deduplication.py
│ ├── test_planner_continuation_history.py
│ ├── test_planner_validation_old.py
│ ├── test_planner_validation.py
│ ├── test_precommitworkflow_validation.py
│ ├── test_prompt_size_limit_bug.py
│ ├── test_refactor_validation.py
│ ├── test_secaudit_validation.py
│ ├── test_testgen_validation.py
│ ├── test_thinkdeep_validation.py
│ ├── test_token_allocation_validation.py
│ ├── test_vision_capability.py
│ └── test_xai_models.py
├── systemprompts
│ ├── __init__.py
│ ├── analyze_prompt.py
│ ├── chat_prompt.py
│ ├── clink
│ │ ├── codex_codereviewer.txt
│ │ ├── default_codereviewer.txt
│ │ ├── default_planner.txt
│ │ └── default.txt
│ ├── codereview_prompt.py
│ ├── consensus_prompt.py
│ ├── debug_prompt.py
│ ├── docgen_prompt.py
│ ├── generate_code_prompt.py
│ ├── planner_prompt.py
│ ├── precommit_prompt.py
│ ├── refactor_prompt.py
│ ├── secaudit_prompt.py
│ ├── testgen_prompt.py
│ ├── thinkdeep_prompt.py
│ └── tracer_prompt.py
├── tests
│ ├── __init__.py
│ ├── CASSETTE_MAINTENANCE.md
│ ├── conftest.py
│ ├── gemini_cassettes
│ │ ├── chat_codegen
│ │ │ └── gemini25_pro_calculator
│ │ │ └── mldev.json
│ │ ├── chat_cross
│ │ │ └── step1_gemini25_flash_number
│ │ │ └── mldev.json
│ │ └── consensus
│ │ └── step2_gemini25_flash_against
│ │ └── mldev.json
│ ├── http_transport_recorder.py
│ ├── mock_helpers.py
│ ├── openai_cassettes
│ │ ├── chat_cross_step2_gpt5_reminder.json
│ │ ├── chat_gpt5_continuation.json
│ │ ├── chat_gpt5_moon_distance.json
│ │ ├── consensus_step1_gpt5_for.json
│ │ └── o3_pro_basic_math.json
│ ├── pii_sanitizer.py
│ ├── sanitize_cassettes.py
│ ├── test_alias_target_restrictions.py
│ ├── test_auto_mode_comprehensive.py
│ ├── test_auto_mode_custom_provider_only.py
│ ├── test_auto_mode_model_listing.py
│ ├── test_auto_mode_provider_selection.py
│ ├── test_auto_mode.py
│ ├── test_auto_model_planner_fix.py
│ ├── test_azure_openai_provider.py
│ ├── test_buggy_behavior_prevention.py
│ ├── test_cassette_semantic_matching.py
│ ├── test_challenge.py
│ ├── test_chat_codegen_integration.py
│ ├── test_chat_cross_model_continuation.py
│ ├── test_chat_openai_integration.py
│ ├── test_chat_simple.py
│ ├── test_clink_claude_agent.py
│ ├── test_clink_claude_parser.py
│ ├── test_clink_codex_agent.py
│ ├── test_clink_gemini_agent.py
│ ├── test_clink_gemini_parser.py
│ ├── test_clink_integration.py
│ ├── test_clink_parsers.py
│ ├── test_clink_tool.py
│ ├── test_collaboration.py
│ ├── test_config.py
│ ├── test_consensus_integration.py
│ ├── test_consensus_schema.py
│ ├── test_consensus.py
│ ├── test_conversation_continuation_integration.py
│ ├── test_conversation_field_mapping.py
│ ├── test_conversation_file_features.py
│ ├── test_conversation_memory.py
│ ├── test_conversation_missing_files.py
│ ├── test_custom_openai_temperature_fix.py
│ ├── test_custom_provider.py
│ ├── test_debug.py
│ ├── test_deploy_scripts.py
│ ├── test_dial_provider.py
│ ├── test_directory_expansion_tracking.py
│ ├── test_disabled_tools.py
│ ├── test_docker_claude_desktop_integration.py
│ ├── test_docker_config_complete.py
│ ├── test_docker_healthcheck.py
│ ├── test_docker_implementation.py
│ ├── test_docker_mcp_validation.py
│ ├── test_docker_security.py
│ ├── test_docker_volume_persistence.py
│ ├── test_file_protection.py
│ ├── test_gemini_token_usage.py
│ ├── test_image_support_integration.py
│ ├── test_image_validation.py
│ ├── test_integration_utf8.py
│ ├── test_intelligent_fallback.py
│ ├── test_issue_245_simple.py
│ ├── test_large_prompt_handling.py
│ ├── test_line_numbers_integration.py
│ ├── test_listmodels_restrictions.py
│ ├── test_listmodels.py
│ ├── test_mcp_error_handling.py
│ ├── test_model_enumeration.py
│ ├── test_model_metadata_continuation.py
│ ├── test_model_resolution_bug.py
│ ├── test_model_restrictions.py
│ ├── test_o3_pro_output_text_fix.py
│ ├── test_o3_temperature_fix_simple.py
│ ├── test_openai_compatible_token_usage.py
│ ├── test_openai_provider.py
│ ├── test_openrouter_provider.py
│ ├── test_openrouter_registry.py
│ ├── test_parse_model_option.py
│ ├── test_per_tool_model_defaults.py
│ ├── test_pii_sanitizer.py
│ ├── test_pip_detection_fix.py
│ ├── test_planner.py
│ ├── test_precommit_workflow.py
│ ├── test_prompt_regression.py
│ ├── test_prompt_size_limit_bug_fix.py
│ ├── test_provider_retry_logic.py
│ ├── test_provider_routing_bugs.py
│ ├── test_provider_utf8.py
│ ├── test_providers.py
│ ├── test_rate_limit_patterns.py
│ ├── test_refactor.py
│ ├── test_secaudit.py
│ ├── test_server.py
│ ├── test_supported_models_aliases.py
│ ├── test_thinking_modes.py
│ ├── test_tools.py
│ ├── test_tracer.py
│ ├── test_utf8_localization.py
│ ├── test_utils.py
│ ├── test_uvx_resource_packaging.py
│ ├── test_uvx_support.py
│ ├── test_workflow_file_embedding.py
│ ├── test_workflow_metadata.py
│ ├── test_workflow_prompt_size_validation_simple.py
│ ├── test_workflow_utf8.py
│ ├── test_xai_provider.py
│ ├── transport_helpers.py
│ └── triangle.png
├── tools
│ ├── __init__.py
│ ├── analyze.py
│ ├── apilookup.py
│ ├── challenge.py
│ ├── chat.py
│ ├── clink.py
│ ├── codereview.py
│ ├── consensus.py
│ ├── debug.py
│ ├── docgen.py
│ ├── listmodels.py
│ ├── models.py
│ ├── planner.py
│ ├── precommit.py
│ ├── refactor.py
│ ├── secaudit.py
│ ├── shared
│ │ ├── __init__.py
│ │ ├── base_models.py
│ │ ├── base_tool.py
│ │ ├── exceptions.py
│ │ └── schema_builders.py
│ ├── simple
│ │ ├── __init__.py
│ │ └── base.py
│ ├── testgen.py
│ ├── thinkdeep.py
│ ├── tracer.py
│ ├── version.py
│ └── workflow
│ ├── __init__.py
│ ├── base.py
│ ├── schema_builders.py
│ └── workflow_mixin.py
├── utils
│ ├── __init__.py
│ ├── client_info.py
│ ├── conversation_memory.py
│ ├── env.py
│ ├── file_types.py
│ ├── file_utils.py
│ ├── image_utils.py
│ ├── model_context.py
│ ├── model_restrictions.py
│ ├── security_config.py
│ ├── storage_backend.py
│ └── token_utils.py
└── zen-mcp-server
```
# Files
--------------------------------------------------------------------------------
/docs/docker-deployment.md:
--------------------------------------------------------------------------------
```markdown
# Docker Deployment Guide
This guide covers deploying Zen MCP Server using Docker and Docker Compose for production environments.
## Quick Start
1. **Clone the repository**:
```bash
git clone https://github.com/BeehiveInnovations/zen-mcp-server.git
cd zen-mcp-server
```
2. **Configure environment variables**:
```bash
cp .env.example .env
# Edit .env with your API keys
```
3. **Deploy with Docker Compose**:
```bash
# Linux/macOS
./docker/scripts/deploy.sh
# Windows PowerShell
.\docker\scripts\deploy.ps1
```
## Environment Configuration
### Required API Keys
At least one API key must be configured in your `.env` file:
```env
# Google Gemini (Recommended)
GEMINI_API_KEY=your_gemini_api_key_here
# OpenAI
OPENAI_API_KEY=your_openai_api_key_here
# X.AI GROK
XAI_API_KEY=your_xai_api_key_here
# OpenRouter (unified access)
OPENROUTER_API_KEY=your_openrouter_api_key_here
# Additional providers
DIAL_API_KEY=your_dial_api_key_here
DIAL_API_HOST=your_dial_host
```
### Optional Configuration
```env
# Default model selection
DEFAULT_MODEL=auto
# Logging
LOG_LEVEL=INFO
LOG_MAX_SIZE=10MB
LOG_BACKUP_COUNT=5
# Advanced settings
DEFAULT_THINKING_MODE_THINKDEEP=high
DISABLED_TOOLS=
MAX_MCP_OUTPUT_TOKENS=
# Timezone
TZ=UTC
```
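To confirm the container actually picked these values up, the environment can be inspected in the running container (container name `zen-mcp-server`, as used throughout this guide):
```bash
# Show selected configuration values inside the running container
docker exec zen-mcp-server env | grep -E "DEFAULT_MODEL|LOG_LEVEL|TZ"
```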
## Deployment Scripts
### Linux/macOS Deployment
Use the provided bash script for robust deployment:
```bash
./docker/scripts/deploy.sh
```
**Features:**
- ✅ Environment validation
- ✅ Exponential backoff health checks
- ✅ Automatic log management
- ✅ Service status monitoring
### Windows PowerShell Deployment
Use the PowerShell script for Windows environments:
```powershell
.\docker\scripts\deploy.ps1
```
**Additional Options:**
```powershell
# Skip health check
.\docker\scripts\deploy.ps1 -SkipHealthCheck
# Custom timeout
.\docker\scripts\deploy.ps1 -HealthCheckTimeout 120
```
## Docker Architecture
### Multi-Stage Build
The Dockerfile uses a multi-stage build for optimal image size:
1. **Builder Stage**: Installs dependencies and creates virtual environment
2. **Runtime Stage**: Copies only necessary files for minimal footprint
### Security Features
- **Non-root user**: Runs as `zenuser` (UID/GID 1000)
- **Read-only filesystem**: Container filesystem is immutable
- **No new privileges**: Prevents privilege escalation
- **Secure tmpfs**: Temporary directories with strict permissions
### Resource Management
Default resource limits:
```yaml
deploy:
  resources:
    limits:
      memory: 512M
      cpus: '0.5'
    reservations:
      memory: 256M
      cpus: '0.25'
```
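A quick way to sanity-check the limits and current usage from the host:
```bash
# One-off snapshot of CPU/memory usage for the container
docker stats --no-stream zen-mcp-server

# Memory limit (in bytes) as recorded by the container runtime; 0 means no limit was applied
docker inspect --format '{{.HostConfig.Memory}}' zen-mcp-server
```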
## Service Management
### Starting the Service
```bash
# Start in background
docker-compose up -d
# Start with logs
docker-compose up
```
### Monitoring
```bash
# View service status
docker-compose ps
# Follow logs
docker-compose logs -f zen-mcp
# View health status
docker inspect zen-mcp-server --format='{{.State.Health.Status}}'
```
### Stopping the Service
```bash
# Graceful stop
docker-compose down
# Stop with a shorter shutdown timeout (seconds before containers are killed)
docker-compose down --timeout 10
```
## Health Checks
The container includes comprehensive health checks:
- **Process check**: Verifies server.py is running
- **Import check**: Validates critical Python modules
- **Directory check**: Ensures log directory is writable
- **API check**: Tests provider connectivity
Health check configuration:
```yaml
healthcheck:
  test: ["CMD", "python", "/usr/local/bin/healthcheck.py"]
  interval: 30s
  timeout: 10s
  retries: 3
  start_period: 40s
```
## Persistent Data
### Volumes
- **Logs**: `./logs:/app/logs` - Application logs
- **Config**: `zen-mcp-config:/app/conf` - Configuration persistence
- **Time sync**: `/etc/localtime:/etc/localtime:ro` - Host timezone sync
**Note:** `zen-mcp-config` is a named Docker volume that persists configuration data between container restarts. Anything written to `/app/conf` inside the container is preserved in this volume, whether the container is started with `docker-compose up` or `docker-compose run`.
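To verify the volume exists and see where Docker keeps it on the host:
```bash
# List volumes and show driver/mountpoint details for the config volume
docker volume ls
docker volume inspect zen-mcp-config
```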
### Log Management
Logs are automatically rotated with configurable retention:
```env
LOG_MAX_SIZE=10MB # Maximum log file size
LOG_BACKUP_COUNT=5 # Number of backup files to keep
```
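Rotated files end up in the host-mounted `./logs` directory (see the volume mapping above), so they can be checked directly from the host:
```bash
# Inspect current and rotated log files on the host
ls -lh ./logs
```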
## Networking
### Default Configuration
- **Network**: `zen-network` (bridge)
- **Subnet**: `172.20.0.0/16`
- **Isolation**: Container runs in isolated network
### Port Exposure
By default, no ports are exposed. The MCP server communicates via stdio when used with Claude Desktop or other MCP clients.
For external access (advanced users):
```yaml
ports:
  - "3000:3000"  # Add to service configuration if needed
```
## Troubleshooting
### Common Issues
**1. Health check failures:**
```bash
# Check logs
docker-compose logs zen-mcp
# Manual health check
docker exec zen-mcp-server python /usr/local/bin/healthcheck.py
```
**2. Permission errors:**
```bash
# Fix log directory permissions
sudo chown -R 1000:1000 ./logs
```
**3. Environment variables not loaded:**
```bash
# Verify .env file exists and is readable
ls -la .env
cat .env
```
**4. API key validation errors:**
```bash
# Check environment variables in container
docker exec zen-mcp-server env | grep -E "(GEMINI|OPENAI|XAI)"
```
### Debug Mode
Enable verbose logging for troubleshooting:
```env
LOG_LEVEL=DEBUG
```
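Changing `.env` does not affect a container that is already running; recreate the service so the new level takes effect (service name `zen-mcp`, as in docker-compose.yml):
```bash
# Recreate the service so the updated environment is applied
docker-compose up -d --force-recreate zen-mcp
```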
## Production Considerations
### Security
1. **Use Docker secrets** for API keys in production (a service-level wiring sketch follows this list):
```yaml
secrets:
  gemini_api_key:
    external: true
```
2. **Enable AppArmor/SELinux** if available
3. **Regular security updates**:
```bash
docker-compose pull
docker-compose up -d
```
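A minimal sketch of how the secret from step 1 could be consumed by the service. The secret is mounted as a file at `/run/secrets/gemini_api_key`, while the server reads `GEMINI_API_KEY` from the environment, so this assumes an entrypoint wrapper (not shown) that exports the file's contents as that variable:
```yaml
services:
  zen-mcp:
    secrets:
      - gemini_api_key          # mounted at /run/secrets/gemini_api_key

secrets:
  gemini_api_key:
    external: true              # Swarm secret; with plain Compose, use `file:` instead
```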
### Monitoring
Consider integrating with monitoring solutions:
- **Prometheus**: Health check metrics
- **Grafana**: Log visualization
- **AlertManager**: Health status alerts
### Backup
Backup persistent volumes:
```bash
# Backup configuration
docker run --rm -v zen-mcp-config:/data -v $(pwd):/backup alpine tar czf /backup/config-backup.tar.gz -C /data .
# Restore configuration
docker run --rm -v zen-mcp-config:/data -v $(pwd):/backup alpine tar xzf /backup/config-backup.tar.gz -C /data
```
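Before relying on a backup, list the archive contents to confirm the expected files were captured:
```bash
# Inspect the backup archive without extracting it
tar tzf config-backup.tar.gz
```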
## Performance Tuning
### Resource Optimization
Adjust limits based on your workload:
```yaml
deploy:
  resources:
    limits:
      memory: 1G    # Increase for heavy workloads
      cpus: '1.0'   # More CPU for concurrent requests
```
### Memory Management
Monitor memory usage:
```bash
docker stats zen-mcp-server
```
Adjust Python memory settings if needed:
```env
PYTHONMALLOC=pymalloc
MALLOC_ARENA_MAX=2
```
## Integration with Claude Desktop
Configure Claude Desktop to use the containerized server. **Choose one of the configurations below based on your needs:**
### Option 1: Direct Docker Run (Recommended)
**The simplest and most reliable option for most users.**
```json
{
  "mcpServers": {
    "zen-mcp": {
      "command": "docker",
      "args": [
        "run",
        "--rm",
        "-i",
        "--env-file",
        "/absolute/path/to/zen-mcp-server/.env",
        "-v",
        "/absolute/path/to/zen-mcp-server/logs:/app/logs",
        "zen-mcp-server:latest"
      ]
    }
  }
}
```
**Windows example:**
```json
{
  "mcpServers": {
    "zen-mcp": {
      "command": "docker",
      "args": [
        "run",
        "--rm",
        "-i",
        "--env-file",
        "C:/path/to/zen-mcp-server/.env",
        "-v",
        "C:/path/to/zen-mcp-server/logs:/app/logs",
        "zen-mcp-server:latest"
      ]
    }
  }
}
```
### Option 2: Docker Compose Run (one-shot, uses docker-compose.yml)
**Uses the advanced configuration from docker-compose.yml without keeping a persistent container running.**
```json
{
  "mcpServers": {
    "zen-mcp": {
      "command": "docker-compose",
      "args": [
        "-f", "/absolute/path/to/zen-mcp-server/docker-compose.yml",
        "run", "--rm", "zen-mcp"
      ]
    }
  }
}
```
### Option 3: Inline Environment Variables (Advanced)
**For highly customized needs.**
```json
{
  "mcpServers": {
    "zen-mcp": {
      "command": "docker",
      "args": [
        "run",
        "--rm",
        "-i",
        "-e", "GEMINI_API_KEY=your_key_here",
        "-e", "LOG_LEVEL=INFO",
        "-e", "DEFAULT_MODEL=auto",
        "-v", "/path/to/logs:/app/logs",
        "zen-mcp-server:latest"
      ]
    }
  }
}
```
### Configuration Notes
**Important notes:**
- Replace `/absolute/path/to/zen-mcp-server` with the actual path to your project.
- Always use forward slashes `/` for Docker volumes, even on Windows.
- Ensure the `.env` file exists and contains your API keys.
- **Persistent volumes**: The Docker Compose option (Option 2) automatically uses the `zen-mcp-config` named volume for persistent configuration storage.
**Environment file requirements:**
```env
# At least one API key is required
GEMINI_API_KEY=your_gemini_key
OPENAI_API_KEY=your_openai_key
# ... other keys
```
**Troubleshooting:**
- If Option 1 fails: check that the Docker image exists (`docker images zen-mcp-server`); if it does not, build it as shown below.
- If Option 2 fails: verify the compose file path and make sure another instance of the service is not already running.
- Permission issues: make sure the `logs` folder is writable.
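If the image is missing, it can be built from the repository root with the stock Dockerfile, or via the compose service definition:
```bash
# Build the image referenced by Option 1
docker build -t zen-mcp-server:latest .

# Or build through Docker Compose (service name `zen-mcp`)
docker-compose build zen-mcp
```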
## Advanced Configuration
### Custom Networks
For complex deployments:
```yaml
networks:
  zen-network:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16
          gateway: 172.20.0.1
```
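To place the service on this network with a fixed address, a sketch of the matching service entry (the address is illustrative and must lie within the subnet defined above):
```yaml
services:
  zen-mcp:
    networks:
      zen-network:
        ipv4_address: 172.20.0.10   # example static address inside 172.20.0.0/16
```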
### Multiple Instances
Run multiple instances with different configurations:
```bash
# Copy compose file
cp docker-compose.yml docker-compose.dev.yml
# Modify service names and ports
# Deploy with custom compose file
docker-compose -f docker-compose.dev.yml up -d
```
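The kinds of adjustments to make in `docker-compose.dev.yml` are sketched below; the names shown are examples and simply need to differ from the primary deployment:
```yaml
services:
  zen-mcp:
    container_name: zen-mcp-server-dev   # must not clash with the main container
    env_file: .env.dev                   # separate key set for the dev instance
    volumes:
      - ./logs-dev:/app/logs             # keep dev logs separate
```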
## Migration and Updates
### Updating the Server
```bash
# Pull latest changes
git pull origin main
# Rebuild and restart
docker-compose down
docker-compose build --no-cache
./docker/scripts/deploy.sh
```
### Data Migration
When upgrading, configuration is preserved in the named volume `zen-mcp-config`.
For major version upgrades, check the [CHANGELOG](../CHANGELOG.md) for breaking changes.
## Support
For any questions, open an issue on GitHub or consult the official documentation.
---
**Next Steps:**
- Review the [Configuration Guide](configuration.md) for detailed environment variable options
- Check [Advanced Usage](advanced-usage.md) for custom model configurations
- See [Troubleshooting](troubleshooting.md) for common issues and solutions
```
--------------------------------------------------------------------------------
/simulator_tests/test_consensus_conversation.py:
--------------------------------------------------------------------------------
```python
#!/usr/bin/env python3
"""
Consensus Conversation Continuation Test
Tests that the consensus tool properly handles conversation continuation
and builds conversation context correctly when using continuation_id.
"""
import json
from .conversation_base_test import ConversationBaseTest
class TestConsensusConversation(ConversationBaseTest):
"""Test consensus tool conversation continuation functionality"""
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple:
"""Call an MCP tool in-process"""
response_text, continuation_id = self.call_mcp_tool_direct(tool_name, params)
return response_text, continuation_id
@property
def test_name(self) -> str:
return "consensus_conversation"
@property
def test_description(self) -> str:
return "Test consensus tool conversation building and continuation"
def get_server_logs(self):
"""Get server logs from local log file"""
try:
log_file_path = "logs/mcp_server.log"
with open(log_file_path) as f:
lines = f.readlines()
# Return last 100 lines
return [line.strip() for line in lines[-100:]]
except Exception as e:
self.logger.warning(f"Exception getting server logs: {e}")
return []
def run_test(self) -> bool:
"""Test consensus conversation continuation"""
try:
self.logger.info("Testing consensus tool conversation continuation")
# Initialize for in-process tool calling
self.setUp()
# Setup test files for context
self.setup_test_files()
# Phase 1: Start conversation with chat tool (which properly creates continuation_id)
self.logger.info("Phase 1: Starting conversation with chat tool")
initial_response, continuation_id = self.call_mcp_tool(
"chat",
{
"prompt": "Please use low thinking mode. I'm working on a web application and need advice on authentication. Can you look at this code?",
"absolute_file_paths": [self.test_files["python"]],
"model": "flash",
},
)
# Validate initial response
if not initial_response:
self.logger.error("Failed to get initial chat response")
return False
if not continuation_id:
self.logger.error("Failed to get continuation_id from initial chat")
return False
self.logger.info(f"Initial chat response preview: {initial_response[:200]}...")
self.logger.info(f"Got continuation_id: {continuation_id}")
# Phase 2: Use consensus with continuation_id to test conversation building
self.logger.info("Phase 2: Using consensus with continuation_id to test conversation building")
consensus_response, _ = self.call_mcp_tool(
"consensus",
{
"step": "Based on our previous discussion about authentication, I need expert consensus: Should we implement OAuth2 or stick with simple session-based auth?",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Initial analysis needed on OAuth2 vs session-based authentication approaches for our web application",
"models": [
{
"model": "flash",
"stance": "for",
"stance_prompt": "Focus on OAuth2 benefits: security, scalability, and industry standards.",
},
{
"model": "flash",
"stance": "against",
"stance_prompt": "Focus on OAuth2 complexity: implementation challenges and simpler alternatives.",
},
],
"continuation_id": continuation_id,
"model": "flash",
},
)
# Validate consensus response
if not consensus_response:
self.logger.error("Failed to get consensus response with continuation_id")
return False
self.logger.info(f"Consensus response preview: {consensus_response[:300]}...")
# Log the full response for debugging if it's not JSON
if not consensus_response.startswith("{"):
self.logger.error(f"Consensus response is not JSON. Full response: {consensus_response}")
return False
# Parse consensus response
try:
consensus_data = json.loads(consensus_response)
except json.JSONDecodeError:
self.logger.error(f"Failed to parse consensus response as JSON. Full response: {consensus_response}")
return False
# Check for step 1 status (Claude analysis + first model consultation)
expected_status = "analysis_and_first_model_consulted"
if consensus_data.get("status") != expected_status:
self.logger.error(
f"Consensus step 1 failed with status: {consensus_data.get('status')}, expected: {expected_status}"
)
if "error" in consensus_data:
self.logger.error(f"Error: {consensus_data['error']}")
return False
# Phase 3: Check server logs for conversation building
self.logger.info("Phase 3: Checking server logs for conversation building")
# Check for conversation-related log entries
logs = self.get_server_logs()
if not logs:
self.logger.warning("Could not retrieve server logs for verification")
else:
# Look for conversation building indicators
conversation_logs = [
line
for line in logs
if any(
keyword in line
for keyword in [
"CONVERSATION HISTORY",
"continuation_id",
"build_conversation_history",
"ThreadContext",
f"thread:{continuation_id}",
]
)
]
if conversation_logs:
self.logger.info(f"Found {len(conversation_logs)} conversation-related log entries")
# Show a few examples (truncated)
for i, log in enumerate(conversation_logs[:3]):
self.logger.info(f" Conversation log {i+1}: {log[:100]}...")
else:
self.logger.warning(
"No conversation-related logs found (may indicate conversation not properly built)"
)
# Check for any ERROR entries related to consensus
error_logs = [
line
for line in logs
if "ERROR" in line
and any(keyword in line for keyword in ["consensus", "conversation", continuation_id])
]
if error_logs:
self.logger.error(f"Found {len(error_logs)} error logs related to consensus conversation:")
for error in error_logs:
self.logger.error(f" ERROR: {error}")
return False
# Phase 4: Verify response structure
self.logger.info("Phase 4: Verifying consensus response structure")
# Check that we have model response from step 1
model_response = consensus_data.get("model_response")
if not model_response:
self.logger.error("Consensus step 1 response missing model_response")
return False
# Check that model response has expected structure
if not model_response.get("model") or not model_response.get("verdict"):
self.logger.error("Model response missing required fields (model or verdict)")
return False
# Check step information
if consensus_data.get("step_number") != 1:
self.logger.error(f"Expected step_number 1, got: {consensus_data.get('step_number')}")
return False
if not consensus_data.get("next_step_required"):
self.logger.error("Expected next_step_required=True for step 1")
return False
self.logger.info(f"Consensus step 1 consulted model: {model_response.get('model')}")
self.logger.info(f"Model stance: {model_response.get('stance', 'neutral')}")
self.logger.info(f"Response status: {model_response.get('status', 'unknown')}")
# Phase 5: Cross-tool continuation test
self.logger.info("Phase 5: Testing cross-tool continuation from consensus")
# Try to continue the conversation with a different tool
chat_response, _ = self.call_mcp_tool(
"chat",
{
"prompt": "Based on our consensus discussion about authentication, can you summarize the key points?",
"continuation_id": continuation_id,
"model": "flash",
},
)
if not chat_response:
self.logger.warning("Cross-tool continuation from consensus failed")
# Don't fail the test for this - it's a bonus check
else:
self.logger.info("✓ Cross-tool continuation from consensus working")
self.logger.info(f"Chat continuation preview: {chat_response[:200]}...")
self.logger.info("✓ Consensus conversation continuation test completed successfully")
return True
except Exception as e:
self.logger.error(f"Consensus conversation test failed with exception: {str(e)}")
import traceback
self.logger.error(f"Traceback: {traceback.format_exc()}")
return False
finally:
self.cleanup_test_files()
```
--------------------------------------------------------------------------------
/simulator_tests/conversation_base_test.py:
--------------------------------------------------------------------------------
```python
#!/usr/bin/env python3
"""
Conversation Base Test Class for In-Process MCP Tool Testing
This class enables testing MCP tools within the same process to maintain conversation
memory state across tool calls. Unlike BaseSimulatorTest which runs each tool call
as a separate subprocess (losing memory state), this class calls tools directly
in-process, allowing conversation functionality to work correctly.
USAGE:
- Inherit from ConversationBaseTest instead of BaseSimulatorTest for conversation tests
- Use call_mcp_tool_direct() to call tools in-process
- Conversation memory persists across tool calls within the same test
- setUp() clears memory between test methods for proper isolation
EXAMPLE:
class TestConversationFeature(ConversationBaseTest):
def test_cross_tool_continuation(self):
# Step 1: Call precommit tool
result1, continuation_id = self.call_mcp_tool_direct("precommit", {
"path": "/path/to/repo",
"prompt": "Review these changes"
})
# Step 2: Continue with codereview tool - memory is preserved!
result2, _ = self.call_mcp_tool_direct("codereview", {
"step": "Focus on security issues in this code",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Starting security-focused code review",
"relevant_files": ["/path/to/file.py"],
"continuation_id": continuation_id
})
"""
import asyncio
import json
from typing import Optional
from tools.shared.exceptions import ToolExecutionError
from .base_test import BaseSimulatorTest
class ConversationBaseTest(BaseSimulatorTest):
"""Base class for conversation tests that require in-process tool calling"""
def __init__(self, verbose: bool = False):
super().__init__(verbose)
self._tools = None
self._loop = None
def setUp(self):
"""Set up test environment - clears conversation memory between tests"""
super().setup_test_files()
# Clear conversation memory for test isolation
self._clear_conversation_memory()
# Import tools from server.py for in-process calling
if self._tools is None:
self._import_tools()
def _clear_conversation_memory(self):
"""Clear all conversation memory to ensure test isolation"""
try:
from utils.storage_backend import get_storage_backend
storage = get_storage_backend()
# Clear all stored conversation threads
with storage._lock:
storage._store.clear()
self.logger.debug("Cleared conversation memory for test isolation")
except Exception as e:
self.logger.warning(f"Could not clear conversation memory: {e}")
def _import_tools(self):
"""Import tools from server.py for direct calling"""
try:
import os
import sys
# Add project root to Python path if not already there
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if project_root not in sys.path:
sys.path.insert(0, project_root)
# Import and configure providers first (this is what main() does)
from server import TOOLS, configure_providers
configure_providers()
self._tools = TOOLS
self.logger.debug(f"Imported {len(self._tools)} tools for in-process testing")
except ImportError as e:
raise RuntimeError(f"Could not import tools from server.py: {e}")
def _get_event_loop(self):
"""Get or create event loop for async tool execution"""
if self._loop is None:
try:
self._loop = asyncio.get_event_loop()
except RuntimeError:
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
return self._loop
def call_mcp_tool_direct(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""
Call an MCP tool directly in-process without subprocess isolation.
This method maintains conversation memory across calls, enabling proper
testing of conversation functionality.
Args:
tool_name: Name of the tool to call (e.g., "precommit", "codereview")
params: Parameters to pass to the tool
Returns:
tuple: (response_content, continuation_id) where continuation_id
can be used for follow-up calls
"""
if self._tools is None:
raise RuntimeError("Tools not imported. Call setUp() first.")
if tool_name not in self._tools:
raise ValueError(f"Tool '{tool_name}' not found. Available: {list(self._tools.keys())}")
try:
tool = self._tools[tool_name]
self.logger.debug(f"Calling tool '{tool_name}' directly in-process")
# Set up minimal model context if not provided
if "model" not in params:
params["model"] = "flash" # Use fast model for testing
# Execute tool directly using asyncio
loop = self._get_event_loop()
# Import required modules for model resolution (similar to server.py)
from config import DEFAULT_MODEL
from providers.registry import ModelProviderRegistry
from utils.model_context import ModelContext
# Resolve model (simplified version of server.py logic)
model_name = params.get("model", DEFAULT_MODEL)
provider = ModelProviderRegistry.get_provider_for_model(model_name)
if not provider:
# Fallback to available model for testing
available_models = list(ModelProviderRegistry.get_available_models(respect_restrictions=True).keys())
if available_models:
model_name = available_models[0]
params["model"] = model_name
self.logger.debug(f"Using fallback model for testing: {model_name}")
# Create model context
model_context = ModelContext(model_name)
params["_model_context"] = model_context
params["_resolved_model_name"] = model_name
# Execute tool asynchronously
try:
result = loop.run_until_complete(tool.execute(params))
except ToolExecutionError as exc:
response_text = exc.payload
continuation_id = self._extract_continuation_id_from_response(response_text)
self.logger.debug(f"Tool '{tool_name}' returned error payload in-process")
if self.verbose and response_text:
self.logger.debug(f"Error response preview: {response_text[:500]}...")
return response_text, continuation_id
if not result or len(result) == 0:
return None, None
# Extract response content
response_text = result[0].text if hasattr(result[0], "text") else str(result[0])
# Parse response to extract continuation_id
continuation_id = self._extract_continuation_id_from_response(response_text)
self.logger.debug(f"Tool '{tool_name}' completed successfully in-process")
if self.verbose and response_text:
self.logger.debug(f"Response preview: {response_text[:500]}...")
return response_text, continuation_id
except Exception as e:
self.logger.error(f"Direct tool call failed for '{tool_name}': {e}")
return None, None
def _extract_continuation_id_from_response(self, response_text: str) -> Optional[str]:
"""Extract continuation_id from tool response"""
try:
# Parse the response as JSON to look for continuation metadata
response_data = json.loads(response_text)
# Look for continuation_id in various places
if isinstance(response_data, dict):
# Check top-level continuation_id (workflow tools)
if "continuation_id" in response_data:
return response_data["continuation_id"]
# Check metadata
metadata = response_data.get("metadata", {})
if "thread_id" in metadata:
return metadata["thread_id"]
# Check continuation_offer
continuation_offer = response_data.get("continuation_offer", {})
if continuation_offer and "continuation_id" in continuation_offer:
return continuation_offer["continuation_id"]
# Check follow_up_request
follow_up = response_data.get("follow_up_request", {})
if follow_up and "continuation_id" in follow_up:
return follow_up["continuation_id"]
# Special case: files_required_to_continue may have nested content
if response_data.get("status") == "files_required_to_continue":
content = response_data.get("content", "")
if isinstance(content, str):
try:
# Try to parse nested JSON
nested_data = json.loads(content)
if isinstance(nested_data, dict):
# Check for continuation in nested data
follow_up = nested_data.get("follow_up_request", {})
if follow_up and "continuation_id" in follow_up:
return follow_up["continuation_id"]
except json.JSONDecodeError:
pass
return None
except (json.JSONDecodeError, AttributeError):
# If response is not JSON or doesn't have expected structure, return None
return None
def tearDown(self):
"""Clean up after test"""
super().cleanup_test_files()
# Clear memory again for good measure
self._clear_conversation_memory()
@property
def test_name(self) -> str:
"""Get the test name"""
return self.__class__.__name__
@property
def test_description(self) -> str:
"""Get the test description"""
return "In-process conversation test"
```
--------------------------------------------------------------------------------
/tests/test_image_validation.py:
--------------------------------------------------------------------------------
```python
"""Tests for image validation utility helpers."""
import base64
import os
import tempfile
from unittest.mock import Mock, patch
import pytest
from utils.image_utils import DEFAULT_MAX_IMAGE_SIZE_MB, validate_image
class TestImageValidation:
"""Test suite for image validation functionality."""
def test_validate_data_url_valid(self) -> None:
"""Test validation of valid data URL."""
# Create a small test image (1x1 PNG)
test_image_data = base64.b64decode(
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
)
data_url = f"data:image/png;base64,{base64.b64encode(test_image_data).decode()}"
image_bytes, mime_type = validate_image(data_url)
assert image_bytes == test_image_data
assert mime_type == "image/png"
@pytest.mark.parametrize(
"invalid_url,expected_error",
[
("data:image/png", "Invalid data URL format"), # Missing base64 part
("data:image/png;base64", "Invalid data URL format"), # Missing data
("data:text/plain;base64,dGVzdA==", "Unsupported image type"), # Not an image
],
)
def test_validate_data_url_invalid_format(self, invalid_url: str, expected_error: str) -> None:
"""Test validation of malformed data URL."""
with pytest.raises(ValueError) as excinfo:
validate_image(invalid_url)
assert expected_error in str(excinfo.value)
def test_non_data_url_treated_as_file_path(self) -> None:
"""Test that non-data URLs are treated as file paths."""
# Test case that's not a data URL at all
with pytest.raises(ValueError) as excinfo:
validate_image("image/png;base64,abc123")
assert "Image file not found" in str(excinfo.value) # Treated as file path
def test_validate_data_url_unsupported_type(self) -> None:
"""Test validation of unsupported image type in data URL."""
data_url = "data:image/bmp;base64,Qk0=" # BMP format
with pytest.raises(ValueError) as excinfo:
validate_image(data_url)
assert "Unsupported image type: image/bmp" in str(excinfo.value)
def test_validate_data_url_invalid_base64(self) -> None:
"""Test validation of data URL with invalid base64."""
data_url = "data:image/png;base64,@@@invalid@@@"
with pytest.raises(ValueError) as excinfo:
validate_image(data_url)
assert "Invalid base64 data" in str(excinfo.value)
def test_validate_large_data_url(self) -> None:
"""Test validation of large data URL to ensure size limits work."""
# Create a large image (21MB)
large_data = b"x" * (21 * 1024 * 1024) # 21MB
# Encode as base64 and create data URL
import base64
encoded_data = base64.b64encode(large_data).decode()
data_url = f"data:image/png;base64,{encoded_data}"
# Should fail with default 20MB limit
with pytest.raises(ValueError) as excinfo:
validate_image(data_url)
assert f"Image too large: 21.0MB (max: {DEFAULT_MAX_IMAGE_SIZE_MB:.1f}MB)" in str(excinfo.value)
# Should succeed with higher limit
image_bytes, mime_type = validate_image(data_url, max_size_mb=25.0)
assert len(image_bytes) == len(large_data)
assert mime_type == "image/png"
def test_validate_file_path_valid(self) -> None:
"""Test validation of valid image file."""
# Create a temporary image file
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
# Write a small test PNG
test_image_data = base64.b64decode(
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
)
tmp_file.write(test_image_data)
tmp_file_path = tmp_file.name
try:
image_bytes, mime_type = validate_image(tmp_file_path)
assert image_bytes == test_image_data
assert mime_type == "image/png"
finally:
os.unlink(tmp_file_path)
def test_validate_file_path_not_found(self) -> None:
"""Test validation of non-existent file."""
with pytest.raises(ValueError) as excinfo:
validate_image("/path/to/nonexistent/image.png")
assert "Image file not found" in str(excinfo.value)
def test_validate_file_path_unsupported_extension(self) -> None:
"""Test validation of file with unsupported extension."""
with tempfile.NamedTemporaryFile(suffix=".bmp", delete=False) as tmp_file:
tmp_file.write(b"dummy data")
tmp_file_path = tmp_file.name
try:
with pytest.raises(ValueError) as excinfo:
validate_image(tmp_file_path)
assert "Unsupported image format: .bmp" in str(excinfo.value)
finally:
os.unlink(tmp_file_path)
def test_validate_file_path_read_error(self) -> None:
"""Test validation when file cannot be read."""
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
tmp_file_path = tmp_file.name
# Remove the file but keep the path
os.unlink(tmp_file_path)
with pytest.raises(ValueError) as excinfo:
validate_image(tmp_file_path)
assert "Image file not found" in str(excinfo.value)
def test_validate_image_size_limit(self) -> None:
"""Test validation of image size limits."""
# Create a large "image" (just random data)
large_data = b"x" * (21 * 1024 * 1024) # 21MB
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
tmp_file.write(large_data)
tmp_file_path = tmp_file.name
try:
with pytest.raises(ValueError) as excinfo:
validate_image(tmp_file_path, max_size_mb=20.0)
assert "Image too large: 21.0MB (max: 20.0MB)" in str(excinfo.value)
finally:
os.unlink(tmp_file_path)
def test_validate_image_custom_size_limit(self) -> None:
"""Test validation with custom size limit."""
# Create a 2MB "image"
data = b"x" * (2 * 1024 * 1024)
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
tmp_file.write(data)
tmp_file_path = tmp_file.name
try:
# Should fail with 1MB limit
with pytest.raises(ValueError) as excinfo:
validate_image(tmp_file_path, max_size_mb=1.0)
assert "Image too large: 2.0MB (max: 1.0MB)" in str(excinfo.value)
# Should succeed with 3MB limit
image_bytes, mime_type = validate_image(tmp_file_path, max_size_mb=3.0)
assert len(image_bytes) == len(data)
assert mime_type == "image/png"
finally:
os.unlink(tmp_file_path)
def test_validate_image_default_size_limit(self) -> None:
"""Test validation with default size limit (None)."""
# Create a small image that's under the default limit
data = b"x" * (1024 * 1024) # 1MB
with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
tmp_file.write(data)
tmp_file_path = tmp_file.name
try:
# Should succeed with default limit (20MB)
image_bytes, mime_type = validate_image(tmp_file_path)
assert len(image_bytes) == len(data)
assert mime_type == "image/jpeg"
# Should also succeed when explicitly passing None
image_bytes, mime_type = validate_image(tmp_file_path, max_size_mb=None)
assert len(image_bytes) == len(data)
assert mime_type == "image/jpeg"
finally:
os.unlink(tmp_file_path)
def test_validate_all_supported_formats(self) -> None:
"""Test validation of all supported image formats."""
supported_formats = {
".png": "image/png",
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".gif": "image/gif",
".webp": "image/webp",
}
for ext, expected_mime in supported_formats.items():
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp_file:
tmp_file.write(b"dummy image data")
tmp_file_path = tmp_file.name
try:
image_bytes, mime_type = validate_image(tmp_file_path)
assert mime_type == expected_mime
assert image_bytes == b"dummy image data"
finally:
os.unlink(tmp_file_path)
class TestProviderIntegration:
"""Test image validation integration with different providers."""
@patch("providers.gemini.logger")
def test_gemini_provider_uses_validation(self, mock_logger: Mock) -> None:
"""Test that Gemini provider uses the base validation."""
from providers.gemini import GeminiModelProvider
# Create a provider instance
provider = GeminiModelProvider(api_key="test-key")
# Test with non-existent file
result = provider._process_image("/nonexistent/image.png")
assert result is None
mock_logger.warning.assert_called_with("Image file not found: /nonexistent/image.png")
@patch("providers.openai_compatible.logging")
def test_openai_compatible_provider_uses_validation(self, mock_logging: Mock) -> None:
"""Test that OpenAI-compatible providers use the base validation."""
from providers.xai import XAIModelProvider
# Create a provider instance (XAI inherits from OpenAICompatibleProvider)
provider = XAIModelProvider(api_key="test-key")
# Test with non-existent file
result = provider._process_image("/nonexistent/image.png")
assert result is None
mock_logging.warning.assert_called_with("Image file not found: /nonexistent/image.png")
def test_data_url_preservation(self) -> None:
"""Test that data URLs are properly preserved through validation."""
from providers.xai import XAIModelProvider
provider = XAIModelProvider(api_key="test-key")
# Valid data URL
data_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
result = provider._process_image(data_url)
assert result is not None
assert result["type"] == "image_url"
assert result["image_url"]["url"] == data_url
```
--------------------------------------------------------------------------------
/tests/test_consensus_integration.py:
--------------------------------------------------------------------------------
```python
"""Integration test for ConsensusTool using OpenAI and Gemini recordings."""
from __future__ import annotations
import json
import os
from pathlib import Path
import pytest
from providers.registry import ModelProviderRegistry
from providers.shared import ProviderType
from tests.transport_helpers import inject_transport
from tools.consensus import ConsensusTool
# Directories for recorded HTTP interactions
CASSETTE_DIR = Path(__file__).parent / "openai_cassettes"
CASSETTE_DIR.mkdir(exist_ok=True)
CONSENSUS_CASSETTE_PATH = CASSETTE_DIR / "consensus_step1_gpt5_for.json"
GEMINI_REPLAY_DIR = Path(__file__).parent / "gemini_cassettes"
GEMINI_REPLAY_DIR.mkdir(exist_ok=True)
GEMINI_REPLAY_ID = "consensus/step2_gemini25_flash_against/mldev"
GEMINI_REPLAY_PATH = GEMINI_REPLAY_DIR / "consensus" / "step2_gemini25_flash_against" / "mldev.json"
@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_consensus_multi_model_consultations(monkeypatch):
"""Exercise ConsensusTool against gpt-5 (supporting) and gemini-2.0-flash (critical)."""
env_updates = {
"DEFAULT_MODEL": "auto",
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""),
"GEMINI_API_KEY": os.getenv("GEMINI_API_KEY", ""),
}
keys_to_clear = [
"XAI_API_KEY",
"OPENROUTER_API_KEY",
"ANTHROPIC_API_KEY",
"MISTRAL_API_KEY",
"CUSTOM_API_KEY",
"CUSTOM_API_URL",
]
recording_mode = not CONSENSUS_CASSETTE_PATH.exists() or not GEMINI_REPLAY_PATH.exists()
if recording_mode:
openai_key = env_updates["OPENAI_API_KEY"].strip()
gemini_key = env_updates["GEMINI_API_KEY"].strip()
if (not openai_key or openai_key.startswith("dummy")) or (not gemini_key or gemini_key.startswith("dummy")):
pytest.skip(
"Consensus cassette missing and OPENAI_API_KEY/GEMINI_API_KEY not configured. Provide real keys to record."
)
GEMINI_REPLAY_PATH.parent.mkdir(parents=True, exist_ok=True)
with monkeypatch.context() as m:
m.setenv("DEFAULT_MODEL", env_updates["DEFAULT_MODEL"])
if recording_mode:
m.setenv("OPENAI_API_KEY", env_updates["OPENAI_API_KEY"])
m.setenv("GEMINI_API_KEY", env_updates["GEMINI_API_KEY"])
m.setenv("GOOGLE_GENAI_CLIENT_MODE", "record")
else:
m.setenv("OPENAI_API_KEY", "dummy-key-for-replay")
m.setenv("GEMINI_API_KEY", "dummy-key-for-replay")
m.setenv("GOOGLE_GENAI_CLIENT_MODE", "replay")
m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(GEMINI_REPLAY_DIR))
m.setenv("GOOGLE_GENAI_REPLAY_ID", GEMINI_REPLAY_ID)
for key in keys_to_clear:
m.delenv(key, raising=False)
# Reset providers and register only OpenAI & Gemini for deterministic behavior
ModelProviderRegistry.reset_for_testing()
from providers.gemini import GeminiModelProvider
from providers.openai import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
# Inject HTTP transport for OpenAI interactions
inject_transport(monkeypatch, CONSENSUS_CASSETTE_PATH)
tool = ConsensusTool()
models_to_consult = [
{"model": "gpt-5", "stance": "for"},
{"model": "gemini-2.5-flash", "stance": "against"},
]
# Step 1: CLI agent analysis followed by first model consultation
step1_arguments = {
"step": "Evaluate SwiftUI vs UIKit adoption and recommend ONE word (SwiftUI or UIKit).",
"step_number": 1,
"total_steps": len(models_to_consult),
"next_step_required": True,
"findings": "SwiftUI momentum is strong but UIKit remains battle-tested.",
"models": models_to_consult,
}
step1_response = await tool.execute(step1_arguments)
assert step1_response and step1_response[0].type == "text"
step1_data = json.loads(step1_response[0].text)
assert step1_data["status"] == "analysis_and_first_model_consulted"
assert step1_data["model_consulted"] == "gpt-5"
assert step1_data["model_response"]["status"] == "success"
assert step1_data["model_response"]["metadata"]["provider"] == "openai"
assert step1_data["model_response"]["verdict"]
continuation_offer = step1_data.get("continuation_offer")
assert continuation_offer is not None
continuation_id = continuation_offer["continuation_id"]
# Prepare step 2 inputs using the first model's response summary
summary_for_step2 = step1_data["model_response"]["verdict"][:200]
step2_arguments = {
"step": f"Incorporated gpt-5 perspective: {summary_for_step2}",
"step_number": 2,
"total_steps": len(models_to_consult),
"next_step_required": False,
"findings": "Ready to gather opposing stance before synthesis.",
"continuation_id": continuation_id,
"current_model_index": step1_data.get("current_model_index", 1),
"model_responses": step1_data.get("model_responses", []),
}
step2_response = await tool.execute(step2_arguments)
assert step2_response and step2_response[0].type == "text"
step2_data = json.loads(step2_response[0].text)
assert step2_data["status"] == "consensus_workflow_complete"
assert step2_data["model_consulted"] == "gemini-2.5-flash"
assert step2_data["model_response"]["metadata"]["provider"] == "google"
assert step2_data["model_response"]["verdict"]
assert step2_data["complete_consensus"]["models_consulted"] == [
"gpt-5:for",
"gemini-2.5-flash:against",
]
assert step2_data["consensus_complete"] is True
continuation_offer_final = step2_data.get("continuation_offer")
assert continuation_offer_final is not None
assert continuation_offer_final["continuation_id"] == continuation_id
# Ensure Gemini replay session is flushed to disk before verification
gemini_provider = ModelProviderRegistry.get_provider_for_model("gemini-2.5-flash")
if gemini_provider is not None:
try:
client = gemini_provider.client
if hasattr(client, "close"):
client.close()
finally:
if hasattr(gemini_provider, "_client"):
gemini_provider._client = None
# Ensure cassettes exist for future replays
assert CONSENSUS_CASSETTE_PATH.exists()
assert GEMINI_REPLAY_PATH.exists()
# Clean up provider registry state after test
ModelProviderRegistry.reset_for_testing()
@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_consensus_auto_mode_with_openrouter_and_gemini(monkeypatch):
"""Ensure continuation flow resolves to real models instead of leaking 'auto'."""
gemini_key = os.getenv("GEMINI_API_KEY", "").strip() or "dummy-key-for-replay"
openrouter_key = os.getenv("OPENROUTER_API_KEY", "").strip() or "dummy-key-for-replay"
with monkeypatch.context() as m:
m.setenv("DEFAULT_MODEL", "auto")
m.setenv("GEMINI_API_KEY", gemini_key)
m.setenv("OPENROUTER_API_KEY", openrouter_key)
for key in [
"OPENAI_API_KEY",
"XAI_API_KEY",
"DIAL_API_KEY",
"CUSTOM_API_KEY",
"CUSTOM_API_URL",
]:
m.delenv(key, raising=False)
import importlib
import config
m.setattr(config, "DEFAULT_MODEL", "auto")
import server as server_module
server = importlib.reload(server_module)
m.setattr(server, "DEFAULT_MODEL", "auto", raising=False)
ModelProviderRegistry.reset_for_testing()
from providers.gemini import GeminiModelProvider
from providers.openrouter import OpenRouterProvider
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
ModelProviderRegistry.register_provider(ProviderType.OPENROUTER, OpenRouterProvider)
from utils.storage_backend import get_storage_backend
# Clear conversation storage to avoid cross-test leakage
storage = get_storage_backend()
storage._store.clear()
models_to_consult = [
{"model": "claude-3-5-flash-20241022", "stance": "neutral"},
{"model": "gpt-5-mini", "stance": "neutral"},
]
step1_args = {
"step": "Evaluate framework options.",
"step_number": 1,
"total_steps": len(models_to_consult),
"next_step_required": True,
"findings": "Initial analysis requested.",
"models": models_to_consult,
}
step1_output = await server.handle_call_tool("consensus", step1_args)
assert step1_output and step1_output[0].type == "text"
step1_payload = json.loads(step1_output[0].text)
assert step1_payload["status"] == "analysis_and_first_model_consulted"
assert step1_payload["model_consulted"] == "claude-3-5-flash-20241022"
assert step1_payload["model_response"]["status"] == "error"
assert "claude-3-5-flash-20241022" in step1_payload["model_response"]["error"]
continuation_offer = step1_payload.get("continuation_offer")
assert continuation_offer is not None
continuation_id = continuation_offer["continuation_id"]
step2_args = {
"step": "Continue consultation sequence.",
"step_number": 2,
"total_steps": len(models_to_consult),
"next_step_required": False,
"findings": "Ready for next model.",
"continuation_id": continuation_id,
"models": models_to_consult,
}
try:
step2_output = await server.handle_call_tool("consensus", step2_args)
finally:
# Reset provider registry regardless of outcome to avoid cross-test bleed
ModelProviderRegistry.reset_for_testing()
assert step2_output and step2_output[0].type == "text"
step2_payload = json.loads(step2_output[0].text)
serialized = json.dumps(step2_payload)
assert "auto" not in serialized.lower(), "Auto model leakage should be resolved"
assert "gpt-5-mini" in serialized or "claude-3-5-flash-20241022" in serialized
# Restore server module to reflect original configuration for other tests
import importlib
import server as server_module
importlib.reload(server_module)
```
--------------------------------------------------------------------------------
/tests/test_o3_temperature_fix_simple.py:
--------------------------------------------------------------------------------
```python
"""
Simple integration test for the O3 model temperature parameter fix.
This test confirms that the fix properly excludes temperature parameters
for O3 models while maintaining them for regular models.
"""
from unittest.mock import Mock, patch
from providers.openai import OpenAIModelProvider
class TestO3TemperatureParameterFixSimple:
"""Simple test for O3 model parameter filtering."""
@patch("utils.model_restrictions.get_restriction_service")
@patch("providers.openai_compatible.OpenAI")
def test_o3_models_exclude_temperature_from_api_call(self, mock_openai_class, mock_restriction_service):
"""Test that O3 models don't send temperature to the API."""
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
# Setup mock client
mock_client = Mock()
mock_openai_class.return_value = mock_client
# Setup mock response
mock_response = Mock()
mock_response.choices = [Mock()]
mock_response.choices[0].message.content = "Test response"
mock_response.choices[0].finish_reason = "stop"
mock_response.model = "o3-mini"
mock_response.id = "test-id"
mock_response.created = 1234567890
mock_response.usage = Mock()
mock_response.usage.prompt_tokens = 10
mock_response.usage.completion_tokens = 5
mock_response.usage.total_tokens = 15
mock_client.chat.completions.create.return_value = mock_response
# Create provider
provider = OpenAIModelProvider(api_key="test-key")
# Override _resolve_model_name to return the resolved model name
provider._resolve_model_name = lambda name: name
# Override model validation to bypass restrictions
provider.validate_model_name = lambda name: True
# Call generate_content with O3 model
provider.generate_content(prompt="Test prompt", model_name="o3-mini", temperature=0.5, max_output_tokens=100)
# Verify the API call was made without temperature or max_tokens
mock_client.chat.completions.create.assert_called_once()
call_kwargs = mock_client.chat.completions.create.call_args[1]
assert "temperature" not in call_kwargs, "O3 models should not include temperature parameter"
assert "max_tokens" not in call_kwargs, "O3 models should not include max_tokens parameter"
assert call_kwargs["model"] == "o3-mini"
assert "messages" in call_kwargs
@patch("utils.model_restrictions.get_restriction_service")
@patch("providers.openai_compatible.OpenAI")
def test_regular_models_include_temperature_in_api_call(self, mock_openai_class, mock_restriction_service):
"""Test that regular models still send temperature to the API."""
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
# Setup mock client
mock_client = Mock()
mock_openai_class.return_value = mock_client
# Setup mock response
mock_response = Mock()
mock_response.choices = [Mock()]
mock_response.choices[0].message.content = "Test response"
mock_response.choices[0].finish_reason = "stop"
mock_response.model = "gpt-4.1-2025-04-14"
mock_response.id = "test-id"
mock_response.created = 1234567890
mock_response.usage = Mock()
mock_response.usage.prompt_tokens = 10
mock_response.usage.completion_tokens = 5
mock_response.usage.total_tokens = 15
mock_client.chat.completions.create.return_value = mock_response
# Create provider
provider = OpenAIModelProvider(api_key="test-key")
# Override _resolve_model_name to return the resolved model name
provider._resolve_model_name = lambda name: name
# Override model validation to bypass restrictions
provider.validate_model_name = lambda name: True
# Call generate_content with regular model (use supported model)
provider.generate_content(
prompt="Test prompt", model_name="gpt-4.1-2025-04-14", temperature=0.5, max_output_tokens=100
)
# Verify the API call was made WITH temperature and max_tokens
mock_client.chat.completions.create.assert_called_once()
call_kwargs = mock_client.chat.completions.create.call_args[1]
assert call_kwargs["temperature"] == 0.5, "Regular models should include temperature parameter"
assert call_kwargs["max_tokens"] == 100, "Regular models should include max_tokens parameter"
assert call_kwargs["model"] == "gpt-4.1-2025-04-14"
@patch("utils.model_restrictions.get_restriction_service")
@patch("providers.openai_compatible.OpenAI")
def test_o3_models_filter_unsupported_parameters(self, mock_openai_class, mock_restriction_service):
"""Test that O3 models filter out top_p, frequency_penalty, etc."""
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
# Setup mock client
mock_client = Mock()
mock_openai_class.return_value = mock_client
# Setup mock response
mock_response = Mock()
mock_response.choices = [Mock()]
mock_response.choices[0].message.content = "Test response"
mock_response.choices[0].finish_reason = "stop"
mock_response.model = "o3"
mock_response.id = "test-id"
mock_response.created = 1234567890
mock_response.usage = Mock()
mock_response.usage.prompt_tokens = 10
mock_response.usage.completion_tokens = 5
mock_response.usage.total_tokens = 15
mock_client.chat.completions.create.return_value = mock_response
# Create provider
provider = OpenAIModelProvider(api_key="test-key")
# Override _resolve_model_name to return the resolved model name
provider._resolve_model_name = lambda name: name
# Override model validation to bypass restrictions
provider.validate_model_name = lambda name: True
# Call generate_content with O3 model and unsupported parameters
provider.generate_content(
prompt="Test prompt",
model_name="o3",
temperature=0.5,
top_p=0.9,
frequency_penalty=0.1,
presence_penalty=0.1,
seed=42,
stop=["END"],
)
# Verify the API call filters out unsupported parameters
mock_client.chat.completions.create.assert_called_once()
call_kwargs = mock_client.chat.completions.create.call_args[1]
# Should be excluded for O3 models
assert "temperature" not in call_kwargs, "O3 should not include temperature"
assert "top_p" not in call_kwargs, "O3 should not include top_p"
assert "frequency_penalty" not in call_kwargs, "O3 should not include frequency_penalty"
assert "presence_penalty" not in call_kwargs, "O3 should not include presence_penalty"
# Should be included (supported parameters)
assert call_kwargs["seed"] == 42, "O3 should include seed parameter"
assert call_kwargs["stop"] == ["END"], "O3 should include stop parameter"
@patch("utils.model_restrictions.get_restriction_service")
def test_all_o3_models_have_correct_temperature_capability(self, mock_restriction_service):
"""Test that all O3/O4 models have supports_temperature=False in their capabilities."""
from providers.openai import OpenAIModelProvider
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
provider = OpenAIModelProvider(api_key="test-key")
# Test O3/O4 models that should NOT support temperature parameter
o3_o4_models = ["o3", "o3-mini", "o3-pro", "o4-mini"]
for model in o3_o4_models:
capabilities = provider.get_capabilities(model)
assert hasattr(
capabilities, "supports_temperature"
), f"Model {model} capabilities should have supports_temperature field"
assert capabilities.supports_temperature is False, f"Model {model} should have supports_temperature=False"
# Test that regular models DO support temperature parameter
regular_models = ["gpt-4.1-2025-04-14"]
for model in regular_models:
try:
capabilities = provider.get_capabilities(model)
assert hasattr(
capabilities, "supports_temperature"
), f"Model {model} capabilities should have supports_temperature field"
assert capabilities.supports_temperature is True, f"Model {model} should have supports_temperature=True"
except ValueError:
# Skip if model not in MODEL_CAPABILITIES (that's okay for this test)
pass
@patch("utils.model_restrictions.get_restriction_service")
def test_openai_provider_temperature_constraints(self, mock_restriction_service):
"""Test that OpenAI provider has correct temperature constraints for O3 models."""
from providers.openai import OpenAIModelProvider
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
provider = OpenAIModelProvider(api_key="test-key")
# Test O3 model constraints
o3_capabilities = provider.get_capabilities("o3-mini")
assert o3_capabilities.temperature_constraint is not None
# O3 models should have fixed temperature constraint
temp_constraint = o3_capabilities.temperature_constraint
assert temp_constraint.validate(1.0) is True
assert temp_constraint.validate(0.5) is False
# Test regular model constraints - use gpt-4.1 which is supported
gpt41_capabilities = provider.get_capabilities("gpt-4.1")
assert gpt41_capabilities.temperature_constraint is not None
# Regular models should allow a range
temp_constraint = gpt41_capabilities.temperature_constraint
assert temp_constraint.validate(0.5) is True
assert temp_constraint.validate(1.0) is True
```
--------------------------------------------------------------------------------
/tests/test_supported_models_aliases.py:
--------------------------------------------------------------------------------
```python
"""Test the MODEL_CAPABILITIES aliases structure across all providers."""
from providers.dial import DIALModelProvider
from providers.gemini import GeminiModelProvider
from providers.openai import OpenAIModelProvider
from providers.xai import XAIModelProvider
class TestSupportedModelsAliases:
"""Test that all providers have correctly structured MODEL_CAPABILITIES with aliases."""
def test_gemini_provider_aliases(self):
"""Test Gemini provider's alias structure."""
provider = GeminiModelProvider("test-key")
# Check that all models have ModelCapabilities with aliases
for model_name, config in provider.MODEL_CAPABILITIES.items():
assert hasattr(config, "aliases"), f"{model_name} must have aliases attribute"
assert isinstance(config.aliases, list), f"{model_name} aliases must be a list"
# Test specific aliases
assert "flash" in provider.MODEL_CAPABILITIES["gemini-2.5-flash"].aliases
assert "pro" in provider.MODEL_CAPABILITIES["gemini-2.5-pro"].aliases
assert "flash-2.0" in provider.MODEL_CAPABILITIES["gemini-2.0-flash"].aliases
assert "flash2" in provider.MODEL_CAPABILITIES["gemini-2.0-flash"].aliases
assert "flashlite" in provider.MODEL_CAPABILITIES["gemini-2.0-flash-lite"].aliases
assert "flash-lite" in provider.MODEL_CAPABILITIES["gemini-2.0-flash-lite"].aliases
# Test alias resolution
assert provider._resolve_model_name("flash") == "gemini-2.5-flash"
assert provider._resolve_model_name("pro") == "gemini-2.5-pro"
assert provider._resolve_model_name("flash-2.0") == "gemini-2.0-flash"
assert provider._resolve_model_name("flash2") == "gemini-2.0-flash"
assert provider._resolve_model_name("flashlite") == "gemini-2.0-flash-lite"
# Test case insensitive resolution
assert provider._resolve_model_name("Flash") == "gemini-2.5-flash"
assert provider._resolve_model_name("PRO") == "gemini-2.5-pro"
def test_openai_provider_aliases(self):
"""Test OpenAI provider's alias structure."""
provider = OpenAIModelProvider("test-key")
# Check that all models have ModelCapabilities with aliases
for model_name, config in provider.MODEL_CAPABILITIES.items():
assert hasattr(config, "aliases"), f"{model_name} must have aliases attribute"
assert isinstance(config.aliases, list), f"{model_name} aliases must be a list"
# Test specific aliases
# "mini" is now an alias for gpt-5-mini, not o4-mini
assert "mini" in provider.MODEL_CAPABILITIES["gpt-5-mini"].aliases
assert "o4mini" in provider.MODEL_CAPABILITIES["o4-mini"].aliases
# o4-mini is no longer in its own aliases (removed self-reference)
assert "o3mini" in provider.MODEL_CAPABILITIES["o3-mini"].aliases
assert "o3pro" in provider.MODEL_CAPABILITIES["o3-pro"].aliases
assert "gpt4.1" in provider.MODEL_CAPABILITIES["gpt-4.1"].aliases
# Test alias resolution
assert provider._resolve_model_name("mini") == "gpt-5-mini" # mini -> gpt-5-mini now
assert provider._resolve_model_name("o3mini") == "o3-mini"
assert provider._resolve_model_name("o3pro") == "o3-pro" # o3pro resolves to o3-pro
assert provider._resolve_model_name("o4mini") == "o4-mini"
assert provider._resolve_model_name("gpt4.1") == "gpt-4.1" # gpt4.1 resolves to gpt-4.1
# Test case insensitive resolution
assert provider._resolve_model_name("Mini") == "gpt-5-mini" # mini -> gpt-5-mini now
assert provider._resolve_model_name("O3MINI") == "o3-mini"
def test_xai_provider_aliases(self):
"""Test XAI provider's alias structure."""
provider = XAIModelProvider("test-key")
# Check that all models have ModelCapabilities with aliases
for model_name, config in provider.MODEL_CAPABILITIES.items():
assert hasattr(config, "aliases"), f"{model_name} must have aliases attribute"
assert isinstance(config.aliases, list), f"{model_name} aliases must be a list"
# Test specific aliases
assert "grok" in provider.MODEL_CAPABILITIES["grok-4"].aliases
assert "grok4" in provider.MODEL_CAPABILITIES["grok-4"].aliases
assert "grok3" in provider.MODEL_CAPABILITIES["grok-3"].aliases
assert "grok3fast" in provider.MODEL_CAPABILITIES["grok-3-fast"].aliases
assert "grokfast" in provider.MODEL_CAPABILITIES["grok-3-fast"].aliases
# Test alias resolution
assert provider._resolve_model_name("grok") == "grok-4"
assert provider._resolve_model_name("grok4") == "grok-4"
assert provider._resolve_model_name("grok3") == "grok-3"
assert provider._resolve_model_name("grok3fast") == "grok-3-fast"
assert provider._resolve_model_name("grokfast") == "grok-3-fast"
# Test case insensitive resolution
assert provider._resolve_model_name("Grok") == "grok-4"
assert provider._resolve_model_name("GROKFAST") == "grok-3-fast"
def test_dial_provider_aliases(self):
"""Test DIAL provider's alias structure."""
provider = DIALModelProvider("test-key")
# Check that all models have ModelCapabilities with aliases
for model_name, config in provider.MODEL_CAPABILITIES.items():
assert hasattr(config, "aliases"), f"{model_name} must have aliases attribute"
assert isinstance(config.aliases, list), f"{model_name} aliases must be a list"
# Test specific aliases
assert "o3" in provider.MODEL_CAPABILITIES["o3-2025-04-16"].aliases
assert "o4-mini" in provider.MODEL_CAPABILITIES["o4-mini-2025-04-16"].aliases
assert "sonnet-4.1" in provider.MODEL_CAPABILITIES["anthropic.claude-sonnet-4.1-20250805-v1:0"].aliases
assert "opus-4.1" in provider.MODEL_CAPABILITIES["anthropic.claude-opus-4.1-20250805-v1:0"].aliases
assert "gemini-2.5-pro" in provider.MODEL_CAPABILITIES["gemini-2.5-pro-preview-05-06"].aliases
# Test alias resolution
assert provider._resolve_model_name("o3") == "o3-2025-04-16"
assert provider._resolve_model_name("o4-mini") == "o4-mini-2025-04-16"
assert provider._resolve_model_name("sonnet-4.1") == "anthropic.claude-sonnet-4.1-20250805-v1:0"
assert provider._resolve_model_name("opus-4.1") == "anthropic.claude-opus-4.1-20250805-v1:0"
# Test case insensitive resolution
assert provider._resolve_model_name("O3") == "o3-2025-04-16"
assert provider._resolve_model_name("SONNET-4.1") == "anthropic.claude-sonnet-4.1-20250805-v1:0"
def test_list_models_includes_aliases(self):
"""Test that list_models returns both base models and aliases."""
# Test Gemini
gemini_provider = GeminiModelProvider("test-key")
gemini_models = gemini_provider.list_models(respect_restrictions=False)
assert "gemini-2.5-flash" in gemini_models
assert "flash" in gemini_models
assert "gemini-2.5-pro" in gemini_models
assert "pro" in gemini_models
# Test OpenAI
openai_provider = OpenAIModelProvider("test-key")
openai_models = openai_provider.list_models(respect_restrictions=False)
assert "o4-mini" in openai_models
assert "mini" in openai_models
assert "o3-mini" in openai_models
assert "o3mini" in openai_models
# Test XAI
xai_provider = XAIModelProvider("test-key")
xai_models = xai_provider.list_models(respect_restrictions=False)
assert "grok-3" in xai_models
assert "grok" in xai_models
assert "grok-3-fast" in xai_models
assert "grokfast" in xai_models
# Test DIAL
dial_provider = DIALModelProvider("test-key")
dial_models = dial_provider.list_models(respect_restrictions=False)
assert "o3-2025-04-16" in dial_models
assert "o3" in dial_models
def test_list_models_all_known_variant_includes_aliases(self):
"""Unified list_models should support lowercase, alias-inclusive listings."""
# Test Gemini
gemini_provider = GeminiModelProvider("test-key")
gemini_all = gemini_provider.list_models(
respect_restrictions=False,
include_aliases=True,
lowercase=True,
unique=True,
)
assert "gemini-2.5-flash" in gemini_all
assert "flash" in gemini_all
assert "gemini-2.5-pro" in gemini_all
assert "pro" in gemini_all
# All should be lowercase
assert all(model == model.lower() for model in gemini_all)
# Test OpenAI
openai_provider = OpenAIModelProvider("test-key")
openai_all = openai_provider.list_models(
respect_restrictions=False,
include_aliases=True,
lowercase=True,
unique=True,
)
assert "o4-mini" in openai_all
assert "mini" in openai_all
assert "o3-mini" in openai_all
assert "o3mini" in openai_all
# All should be lowercase
assert all(model == model.lower() for model in openai_all)
def test_no_string_shorthand_in_supported_models(self):
"""Test that no provider has string-based shorthands anymore."""
providers = [
GeminiModelProvider("test-key"),
OpenAIModelProvider("test-key"),
XAIModelProvider("test-key"),
DIALModelProvider("test-key"),
]
for provider in providers:
for model_name, config in provider.MODEL_CAPABILITIES.items():
# All values must be ModelCapabilities objects, not strings or dicts
from providers.shared import ModelCapabilities
assert isinstance(config, ModelCapabilities), (
f"{provider.__class__.__name__}.MODEL_CAPABILITIES['{model_name}'] "
f"must be a ModelCapabilities object, not {type(config).__name__}"
)
def test_resolve_returns_original_if_not_found(self):
"""Test that _resolve_model_name returns original name if alias not found."""
providers = [
GeminiModelProvider("test-key"),
OpenAIModelProvider("test-key"),
XAIModelProvider("test-key"),
DIALModelProvider("test-key"),
]
for provider in providers:
# Test with unknown model name
assert provider._resolve_model_name("unknown-model") == "unknown-model"
assert provider._resolve_model_name("gpt-4") == "gpt-4"
assert provider._resolve_model_name("claude-3") == "claude-3"
```
--------------------------------------------------------------------------------
/tests/test_docker_claude_desktop_integration.py:
--------------------------------------------------------------------------------
```python
"""
Tests for Docker integration with Claude Desktop MCP
"""
import json
import os
import tempfile
from pathlib import Path
import pytest
class TestDockerClaudeDesktopIntegration:
"""Test Docker integration with Claude Desktop"""
@pytest.fixture(autouse=True)
def setup(self):
"""Setup for each test"""
self.project_root = Path(__file__).parent.parent
def test_mcp_config_docker_run_format(self):
"""Test MCP configuration for direct docker run"""
config = {
"mcpServers": {
"zen-mcp": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"--env-file",
"/path/to/.env",
"-v",
"/path/to/logs:/app/logs",
"zen-mcp-server:latest",
],
}
}
}
# Validate configuration structure
assert "mcpServers" in config
assert "zen-mcp" in config["mcpServers"]
assert config["mcpServers"]["zen-mcp"]["command"] == "docker"
args = config["mcpServers"]["zen-mcp"]["args"]
assert "run" in args
assert "--rm" in args
assert "-i" in args
assert "--env-file" in args
def test_mcp_config_docker_compose_format(self):
"""Test MCP configuration for docker-compose run"""
config = {
"mcpServers": {
"zen-mcp": {
"command": "docker-compose",
"args": ["-f", "/path/to/docker-compose.yml", "run", "--rm", "zen-mcp"],
}
}
}
# Validate configuration structure
assert config["mcpServers"]["zen-mcp"]["command"] == "docker-compose"
args = config["mcpServers"]["zen-mcp"]["args"]
assert "-f" in args
assert "run" in args
assert "--rm" in args
assert "zen-mcp" in args
def test_mcp_config_environment_variables(self):
"""Test MCP configuration with inline environment variables"""
config = {
"mcpServers": {
"zen-mcp": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"-e",
"GEMINI_API_KEY=test_key",
"-e",
"LOG_LEVEL=INFO",
"zen-mcp-server:latest",
],
}
}
}
args = config["mcpServers"]["zen-mcp"]["args"]
# Check that environment variables are properly formatted
env_args = [arg for arg in args if arg.startswith("-e")]
assert len(env_args) > 0, "Environment variables should be present"
# Check for API key environment variable
api_key_present = any("GEMINI_API_KEY=" in args[i + 1] for i, arg in enumerate(args[:-1]) if arg == "-e")
assert api_key_present, "API key environment variable should be set"
def test_windows_path_format(self):
"""Test Windows-specific path formatting"""
windows_config = {
"mcpServers": {
"zen-mcp": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"--env-file",
"C:/Users/User/zen-mcp-server/.env",
"-v",
"C:/Users/User/zen-mcp-server/logs:/app/logs",
"zen-mcp-server:latest",
],
}
}
}
args = windows_config["mcpServers"]["zen-mcp"]["args"]
# Check Windows path format
windows_paths = [arg for arg in args if arg.startswith("C:/")]
assert len(windows_paths) > 0, "Windows paths should use forward slashes"
for path in windows_paths:
assert "\\" not in path, "Windows paths should use forward slashes"
def test_mcp_config_validation(self):
"""Test validation of MCP configuration"""
# Valid configuration
valid_config = {
"mcpServers": {"zen-mcp": {"command": "docker", "args": ["run", "--rm", "-i", "zen-mcp-server:latest"]}}
}
# Validate JSON serialization
config_json = json.dumps(valid_config)
loaded_config = json.loads(config_json)
assert loaded_config == valid_config
def test_mcp_stdio_communication(self):
"""Test that MCP configuration supports stdio communication"""
config = {
"mcpServers": {
"zen-mcp": {
"command": "docker",
"args": [
"run",
"--rm",
"-i", # Interactive mode for stdio
"zen-mcp-server:latest",
],
}
}
}
args = config["mcpServers"]["zen-mcp"]["args"]
# Check for interactive mode
assert "-i" in args, "Interactive mode required for stdio communication"
# Should not expose network ports for stdio communication
port_args = [arg for arg in args if arg.startswith("-p")]
assert len(port_args) == 0, "No ports should be exposed for stdio mode"
def test_docker_image_reference(self):
"""Test that Docker image is properly referenced"""
configs = [
{"image": "zen-mcp-server:latest"},
{"image": "zen-mcp-server:v1.0.0"},
{"image": "registry/zen-mcp-server:latest"},
]
for config in configs:
image = config["image"]
# Basic image format validation
assert ":" in image, "Image should have a tag"
assert len(image.split(":")) == 2, "Image should have exactly one tag"
@pytest.fixture
def temp_mcp_config(self):
"""Create temporary MCP configuration file"""
config = {
"mcpServers": {
"zen-mcp": {
"command": "docker",
"args": ["run", "--rm", "-i", "--env-file", "/tmp/.env", "zen-mcp-server:latest"],
}
}
}
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
json.dump(config, f, indent=2)
temp_file_path = f.name
yield temp_file_path
os.unlink(temp_file_path)
def test_mcp_config_file_parsing(self, temp_mcp_config):
"""Test parsing of MCP configuration file"""
# Read and parse the temporary config file
with open(temp_mcp_config, encoding="utf-8") as f:
config = json.load(f)
assert "mcpServers" in config
assert "zen-mcp" in config["mcpServers"]
def test_environment_file_integration(self):
"""Test integration with .env file"""
# Test .env file format expected by Docker
env_content = """GEMINI_API_KEY=test_key
OPENAI_API_KEY=test_key_2
LOG_LEVEL=INFO
DEFAULT_MODEL=auto
"""
# Parse environment content
env_vars = {}
for line in env_content.strip().split("\n"):
if "=" in line and not line.startswith("#"):
key, value = line.split("=", 1)
env_vars[key] = value
# Validate required environment variables
assert "GEMINI_API_KEY" in env_vars
assert len(env_vars["GEMINI_API_KEY"]) > 0
def test_docker_volume_mount_paths(self):
"""Test Docker volume mount path configurations"""
mount_configs = [
{"host": "./logs", "container": "/app/logs"},
{"host": "/absolute/path/logs", "container": "/app/logs"},
{"host": "C:/Windows/path/logs", "container": "/app/logs"},
]
for config in mount_configs:
mount_arg = f"{config['host']}:{config['container']}"
# Validate mount format
assert ":" in mount_arg
parts = mount_arg.split(":")
assert len(parts) >= 2
assert parts[-1].startswith("/"), "Container path should be absolute"
class TestDockerMCPErrorHandling:
"""Test error handling for Docker MCP integration"""
def test_missing_docker_image_handling(self):
"""Test handling of missing Docker image"""
# This would test what happens when the image doesn't exist
# In practice, Claude Desktop would show an error
nonexistent_config = {
"mcpServers": {"zen-mcp": {"command": "docker", "args": ["run", "--rm", "-i", "nonexistent:latest"]}}
}
# Configuration should be valid even if image doesn't exist
assert "zen-mcp" in nonexistent_config["mcpServers"]
def test_invalid_env_file_path(self):
"""Test handling of invalid .env file path"""
config_with_invalid_env = {
"mcpServers": {
"zen-mcp": {
"command": "docker",
"args": ["run", "--rm", "-i", "--env-file", "/nonexistent/.env", "zen-mcp-server:latest"],
}
}
}
# Configuration structure should still be valid
args = config_with_invalid_env["mcpServers"]["zen-mcp"]["args"]
assert "--env-file" in args
def test_docker_permission_issues(self):
"""Test configuration for potential Docker permission issues"""
# On some systems, Docker requires specific permissions
# The configuration should work with both cases
configs = [
# Regular Docker command
{"command": "docker"},
# Sudo Docker command (if needed)
{"command": "sudo", "extra_args": ["docker"]},
]
for config in configs:
assert len(config["command"]) > 0
def test_resource_limit_configurations(self):
"""Test Docker resource limit configurations"""
config_with_limits = {
"mcpServers": {
"zen-mcp": {
"command": "docker",
"args": ["run", "--rm", "-i", "--memory=512m", "--cpus=1.0", "zen-mcp-server:latest"],
}
}
}
args = config_with_limits["mcpServers"]["zen-mcp"]["args"]
# Check for resource limits
memory_limit = any("--memory" in arg for arg in args)
cpu_limit = any("--cpus" in arg for arg in args)
assert memory_limit or cpu_limit, "Resource limits should be configurable"
```
--------------------------------------------------------------------------------
/simulator_tests/test_consensus_workflow_accurate.py:
--------------------------------------------------------------------------------
```python
"""
Accurate Consensus Workflow Test
This test validates the complete consensus workflow step-by-step to ensure:
1. Step 1: Claude provides its own analysis
2. Step 2: Tool consults first model and returns response to Claude
3. Step 3: Tool consults second model and returns response to Claude
4. Step 4: Claude synthesizes all perspectives
This replaces the old faulty test that used non-workflow parameters.
"""
import json
from .conversation_base_test import ConversationBaseTest
class TestConsensusWorkflowAccurate(ConversationBaseTest):
"""Test complete consensus workflow with accurate step-by-step behavior"""
@property
def test_name(self) -> str:
return "consensus_workflow_accurate"
@property
def test_description(self) -> str:
return "Test NEW efficient consensus workflow: 2 models = 2 steps (Claude+model1, model2+synthesis)"
def run_test(self) -> bool:
"""Run complete consensus workflow test"""
# Set up the test environment
self.setUp()
try:
self.logger.info("Testing complete consensus workflow step-by-step")
self.logger.info("Expected NEW flow: Step1(Claude+Model1) -> Step2(Model2+Synthesis)")
# ============================================================================
# STEP 1: Claude analysis + first model consultation
# ============================================================================
self.logger.info("=== STEP 1: Claude analysis + flash:for consultation ===")
step1_response, continuation_id = self.call_mcp_tool_direct(
"consensus",
{
"step": "Should we add a new AI-powered search feature to our application? Please analyze the technical feasibility, user value, and implementation complexity.",
"step_number": 1,
"total_steps": 2, # 2 models (each step includes consultation + analysis)
"next_step_required": True,
"findings": "Initial assessment of AI search feature proposal considering user needs, technical constraints, and business value.",
"models": [
{
"model": "flash",
"stance": "for",
"stance_prompt": "Focus on innovation benefits and competitive advantages.",
},
{
"model": "flash",
"stance": "against",
"stance_prompt": "Focus on implementation complexity and resource requirements.",
},
],
"model": "flash", # Claude's execution model
},
)
if not step1_response:
self.logger.error("Step 1 failed - no response")
return False
step1_data = json.loads(step1_response)
self.logger.info(f"Step 1 status: {step1_data.get('status')}")
# Validate step 1 response (should include Claude's analysis + first model consultation)
if step1_data.get("status") != "analysis_and_first_model_consulted":
self.logger.error(
f"Expected status 'analysis_and_first_model_consulted', got: {step1_data.get('status')}"
)
return False
if step1_data.get("step_number") != 1:
self.logger.error(f"Expected step_number 1, got: {step1_data.get('step_number')}")
return False
if not step1_data.get("next_step_required"):
self.logger.error("Expected next_step_required=True for step 1")
return False
# Verify Claude's analysis is included
if "agent_analysis" not in step1_data:
self.logger.error("Expected agent_analysis in step 1 response")
return False
# Verify first model response is included
if "model_response" not in step1_data:
self.logger.error("Expected model_response in step 1 response")
return False
model1_response = step1_data["model_response"]
if model1_response.get("model") != "flash" or model1_response.get("stance") != "for":
self.logger.error(
f"Expected flash:for model response in step 1, got: {model1_response.get('model')}:{model1_response.get('stance')}"
)
return False
self.logger.info("✓ Step 1 completed - Claude analysis + first model (flash:for) consulted")
# ============================================================================
# STEP 2: Final step - second model consultation + synthesis
# ============================================================================
self.logger.info("=== STEP 2: Final step - second model (flash:against) + synthesis ===")
step2_response, _ = self.call_mcp_tool_direct(
"consensus",
{
"step": "I need to review the second model's perspective and provide final synthesis.",
"step_number": 2,
"total_steps": 2,
"next_step_required": False, # Final step
"findings": "Analyzed first model's 'for' perspective. Now ready for second model's 'against' stance and final synthesis.",
"continuation_id": continuation_id,
"model": "flash",
},
)
if not step2_response:
self.logger.error("Step 2 failed - no response")
return False
self.logger.info(f"Step 2 raw response: {step2_response[:500]}...")
step2_data = json.loads(step2_response)
self.logger.info(f"Step 2 status: {step2_data.get('status')}")
# Validate step 2 - should show consensus completion
if step2_data.get("status") != "consensus_workflow_complete":
self.logger.error(f"Expected status 'consensus_workflow_complete', got: {step2_data.get('status')}")
return False
if step2_data.get("model_consulted") != "flash":
self.logger.error(f"Expected model_consulted 'flash', got: {step2_data.get('model_consulted')}")
return False
if step2_data.get("model_stance") != "against":
self.logger.error(f"Expected model_stance 'against', got: {step2_data.get('model_stance')}")
return False
# Verify model response is included
if "model_response" not in step2_data:
self.logger.error("Expected model_response in step 2")
return False
model2_response = step2_data["model_response"]
if model2_response.get("model") != "flash":
self.logger.error(f"Expected model_response.model 'flash', got: {model2_response.get('model')}")
return False
# Verify consensus completion data
if not step2_data.get("consensus_complete"):
self.logger.error("Expected consensus_complete=True in final step")
return False
if "complete_consensus" not in step2_data:
self.logger.error("Expected complete_consensus data in final step")
return False
self.logger.info("✓ Step 2 completed - Second model (flash:against) consulted and consensus complete")
self.logger.info(f"Model 2 verdict preview: {model2_response.get('verdict', 'No verdict')[:100]}...")
# Validate final consensus completion data
complete_consensus = step2_data["complete_consensus"]
if complete_consensus.get("total_responses") != 2:
self.logger.error(f"Expected 2 model responses, got: {complete_consensus.get('total_responses')}")
return False
models_consulted = complete_consensus.get("models_consulted", [])
expected_models = ["flash:for", "flash:against"]
if models_consulted != expected_models:
self.logger.error(f"Expected models {expected_models}, got: {models_consulted}")
return False
# ============================================================================
# VALIDATION: Check accumulated responses are available
# ============================================================================
self.logger.info("=== VALIDATION: Checking accumulated responses ===")
if "accumulated_responses" not in step2_data:
self.logger.error("Expected accumulated_responses in final step")
return False
accumulated = step2_data["accumulated_responses"]
if len(accumulated) != 2:
self.logger.error(f"Expected 2 accumulated responses, got: {len(accumulated)}")
return False
# Verify first response (flash:for)
response1 = accumulated[0]
if response1.get("model") != "flash" or response1.get("stance") != "for":
self.logger.error(f"First response incorrect: {response1}")
return False
# Verify second response (flash:against)
response2 = accumulated[1]
if response2.get("model") != "flash" or response2.get("stance") != "against":
self.logger.error(f"Second response incorrect: {response2}")
return False
self.logger.info("✓ All accumulated responses validated")
# ============================================================================
# SUCCESS
# ============================================================================
self.logger.info("🎉 CONSENSUS WORKFLOW TEST PASSED")
self.logger.info("✓ Step 1: Claude analysis + first model (flash:for) consulted")
self.logger.info("✓ Step 2: Second model (flash:against) consulted + synthesis completed")
self.logger.info("✓ All model responses accumulated correctly")
self.logger.info("✓ New efficient workflow: 2 models = 2 steps (not 4)")
self.logger.info("✓ Workflow progression validated at each step")
return True
except Exception as e:
self.logger.error(f"Consensus workflow test failed with exception: {str(e)}")
import traceback
self.logger.error(f"Traceback: {traceback.format_exc()}")
return False
```
--------------------------------------------------------------------------------
/docs/configuration.md:
--------------------------------------------------------------------------------
```markdown
# Configuration Guide
This guide covers all configuration options for the Zen MCP Server. The server is configured through environment variables defined in your `.env` file.
## Quick Start Configuration
**Auto Mode (Recommended):** Set `DEFAULT_MODEL=auto` and let Claude intelligently select the best model for each task:
```env
# Basic configuration
DEFAULT_MODEL=auto
GEMINI_API_KEY=your-gemini-key
OPENAI_API_KEY=your-openai-key
```
## Complete Configuration Reference
### API Keys (At least one required)
**Important:** Use EITHER OpenRouter OR native APIs, not both! Having both creates ambiguity about which provider serves each model.
**Option 1: Native APIs (Recommended for direct access)**
```env
# Google Gemini API
GEMINI_API_KEY=your_gemini_api_key_here
# Get from: https://makersuite.google.com/app/apikey
# OpenAI API
OPENAI_API_KEY=your_openai_api_key_here
# Get from: https://platform.openai.com/api-keys
# X.AI GROK API
XAI_API_KEY=your_xai_api_key_here
# Get from: https://console.x.ai/
```
**Option 2: OpenRouter (Access multiple models through one API)**
```env
# OpenRouter for unified model access
OPENROUTER_API_KEY=your_openrouter_api_key_here
# Get from: https://openrouter.ai/
# If using OpenRouter, comment out native API keys above
```
**Option 3: Custom API Endpoints (Local models)**
```env
# For Ollama, vLLM, LM Studio, etc.
CUSTOM_API_URL=http://localhost:11434/v1 # Ollama example
CUSTOM_API_KEY= # Empty for Ollama
CUSTOM_MODEL_NAME=llama3.2 # Default model
```
**Local Model Connection:**
- Use standard localhost URLs since the server runs natively
- Example: `http://localhost:11434/v1` for Ollama
### Model Configuration
**Default Model Selection:**
```env
# Options: 'auto', 'pro', 'flash', 'o3', 'o3-mini', 'o4-mini', etc.
DEFAULT_MODEL=auto # Claude picks best model for each task (recommended)
```
- **Available Models:** The canonical capability data for native providers lives in JSON manifests under `conf/`:
- `conf/openai_models.json` – OpenAI catalogue (can be overridden with `OPENAI_MODELS_CONFIG_PATH`)
- `conf/gemini_models.json` – Gemini catalogue (`GEMINI_MODELS_CONFIG_PATH`)
- `conf/xai_models.json` – X.AI / GROK catalogue (`XAI_MODELS_CONFIG_PATH`)
- `conf/openrouter_models.json` – OpenRouter catalogue (`OPENROUTER_MODELS_CONFIG_PATH`)
- `conf/dial_models.json` – DIAL aggregation catalogue (`DIAL_MODELS_CONFIG_PATH`)
- `conf/custom_models.json` – Custom/OpenAI-compatible endpoints (`CUSTOM_MODELS_CONFIG_PATH`)
Each JSON file documents the allowed fields via its `_README` block and controls model aliases, capability limits, and feature flags (including `allow_code_generation`). Edit these files (or point the matching `*_MODELS_CONFIG_PATH` variable to your own copy) when you want to adjust context windows, enable JSON mode, enable structured code generation, or expose additional aliases without touching Python code.
The shipped defaults cover:
| Provider | Canonical Models | Notable Aliases |
|----------|-----------------|-----------------|
| OpenAI | `gpt-5`, `gpt-5-pro`, `gpt-5-mini`, `gpt-5-nano`, `gpt-5-codex`, `gpt-4.1`, `o3`, `o3-mini`, `o3-pro`, `o4-mini` | `gpt5`, `gpt5pro`, `mini`, `nano`, `codex`, `o3mini`, `o3pro`, `o4mini` |
| Gemini | `gemini-2.5-pro`, `gemini-2.5-flash`, `gemini-2.0-flash`, `gemini-2.0-flash-lite` | `pro`, `gemini-pro`, `flash`, `flash-2.0`, `flashlite` |
| X.AI | `grok-4`, `grok-3`, `grok-3-fast` | `grok`, `grok4`, `grok3`, `grok3fast`, `grokfast` |
| OpenRouter | See `conf/openrouter_models.json` for the continually evolving catalogue | e.g., `opus`, `sonnet`, `flash`, `pro`, `mistral` |
| Custom | User-managed entries such as `llama3.2` | Define your own aliases per entry |
> **Tip:** Copy the JSON file you need, customise it, and point the corresponding `*_MODELS_CONFIG_PATH` environment variable to your version. This lets you enable or disable capabilities (JSON mode, function calling, temperature support, code generation) without editing Python.
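For orientation, a trimmed manifest entry might look like the sketch below. `model_name`, `intelligence_score`, and `allow_code_generation` appear elsewhere in this guide; the `aliases` and `context_window` field names and all values shown are illustrative assumptions, so treat the `_README` block in the shipped JSON as the authoritative schema:
```json
// Illustrative sketch only; not the full schema
{
  "models": [
    {
      "model_name": "gpt-5-mini",
      "aliases": ["mini"],
      "intelligence_score": 15,
      "context_window": 400000,
      "allow_code_generation": false
    }
  ]
}
```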
### Code Generation Capability
**`allow_code_generation` Flag:**
The `allow_code_generation` capability enables models to generate complete, production-ready implementations in a structured format. When enabled, the `chat` tool will inject special instructions for substantial code generation tasks.
```json
{
"model_name": "gpt-5",
"allow_code_generation": true,
...
}
```
**When to Enable:**
- **Enable for**: Models MORE capable than your primary CLI's model (e.g., GPT-5, GPT-5 Pro when using Claude Code with Sonnet 4.5)
- **Purpose**: Get complete implementations from a more powerful reasoning model that your primary CLI can then review and apply
- **Use case**: Large-scale implementations, major refactoring, complete module creation
**Important Guidelines:**
1. Only enable for models significantly more capable than your primary CLI to ensure high-quality generated code
2. The capability triggers structured code output (`<GENERATED-CODE>` blocks) for substantial implementation requests
3. Minor code changes still use inline code blocks regardless of this setting
4. Generated code is saved to `zen_generated.code` in the user's working directory
5. Your CLI receives instructions to review and apply the generated code systematically
**Example Configuration:**
```json
// OpenAI models configuration (conf/openai_models.json)
{
"models": [
{
"model_name": "gpt-5",
"allow_code_generation": true,
"intelligence_score": 18,
...
},
{
"model_name": "gpt-5-pro",
"allow_code_generation": true,
"intelligence_score": 19,
...
}
]
}
```
**Typical Workflow:**
1. You ask your AI agent to implement a complex new feature using `chat` with a higher-reasoning model such as **gpt-5-pro**
2. GPT-5 Pro generates a complete, structured implementation and returns it to Zen
3. Zen saves the code to `zen_generated.code` and asks AI agent to implement the plan
4. AI agent continues from the previous context, reads the file, applies the implementation
### Thinking Mode Configuration
**Default Thinking Mode for ThinkDeep:**
```env
# Only applies to models supporting extended thinking (e.g., Gemini 2.5 Pro)
DEFAULT_THINKING_MODE_THINKDEEP=high
# Available modes and token consumption:
# minimal: 128 tokens - Quick analysis, fastest response
# low: 2,048 tokens - Light reasoning tasks
# medium: 8,192 tokens - Balanced reasoning
# high: 16,384 tokens - Complex analysis (recommended for thinkdeep)
# max: 32,768 tokens - Maximum reasoning depth
```
### Model Usage Restrictions
Control which models can be used from each provider for cost control, compliance, or standardization:
```env
# Format: Comma-separated list (case-insensitive, whitespace tolerant)
# Empty or unset = all models allowed (default)
# OpenAI model restrictions
OPENAI_ALLOWED_MODELS=o3-mini,o4-mini,mini
# Gemini model restrictions
GOOGLE_ALLOWED_MODELS=flash,pro
# X.AI GROK model restrictions
XAI_ALLOWED_MODELS=grok-3,grok-3-fast,grok-4
# OpenRouter model restrictions (affects models via custom provider)
OPENROUTER_ALLOWED_MODELS=opus,sonnet,mistral
```
**Supported Model Names:** The names/aliases listed in the JSON manifests above are the authoritative source. Keep in mind:
- Aliases are case-insensitive and defined per entry (for example, `mini` maps to `gpt-5-mini` by default, while `flash` maps to `gemini-2.5-flash`).
- When you override the manifest files you can add or remove aliases as needed; restriction policies (`*_ALLOWED_MODELS`) automatically pick up those changes.
- Models omitted from a manifest fall back to generic capability detection (where supported) and may have limited feature metadata.
**Example Configurations:**
```env
# Cost control - only cheap models
OPENAI_ALLOWED_MODELS=o4-mini
GOOGLE_ALLOWED_MODELS=flash
# Single model standardization
OPENAI_ALLOWED_MODELS=o4-mini
GOOGLE_ALLOWED_MODELS=pro
# Balanced selection
GOOGLE_ALLOWED_MODELS=flash,pro
XAI_ALLOWED_MODELS=grok,grok-3-fast
```
### Advanced Configuration
**Custom Model Configuration & Manifest Overrides:**
```env
# Override default location of built-in catalogues
OPENAI_MODELS_CONFIG_PATH=/path/to/openai_models.json
GEMINI_MODELS_CONFIG_PATH=/path/to/gemini_models.json
XAI_MODELS_CONFIG_PATH=/path/to/xai_models.json
OPENROUTER_MODELS_CONFIG_PATH=/path/to/openrouter_models.json
DIAL_MODELS_CONFIG_PATH=/path/to/dial_models.json
CUSTOM_MODELS_CONFIG_PATH=/path/to/custom_models.json
```
**Conversation Settings:**
```env
# How long AI-to-AI conversation threads persist in memory (hours)
# Conversations are auto-purged when Claude closes its MCP connection or
# when a session is quit / re-launched
CONVERSATION_TIMEOUT_HOURS=5
# Maximum conversation turns (each exchange = 2 turns)
MAX_CONVERSATION_TURNS=20
```
**Logging Configuration:**
```env
# Logging level: DEBUG, INFO, WARNING, ERROR
LOG_LEVEL=DEBUG # Default: shows detailed operational messages
```
## Configuration Examples
### Development Setup
```env
# Development with multiple providers
DEFAULT_MODEL=auto
GEMINI_API_KEY=your-gemini-key
OPENAI_API_KEY=your-openai-key
XAI_API_KEY=your-xai-key
LOG_LEVEL=DEBUG
CONVERSATION_TIMEOUT_HOURS=1
```
### Production Setup
```env
# Production with cost controls
DEFAULT_MODEL=auto
GEMINI_API_KEY=your-gemini-key
OPENAI_API_KEY=your-openai-key
GOOGLE_ALLOWED_MODELS=flash
OPENAI_ALLOWED_MODELS=o4-mini
LOG_LEVEL=INFO
CONVERSATION_TIMEOUT_HOURS=3
```
### Local Development
```env
# Local models only
DEFAULT_MODEL=llama3.2
CUSTOM_API_URL=http://localhost:11434/v1
CUSTOM_API_KEY=
CUSTOM_MODEL_NAME=llama3.2
LOG_LEVEL=DEBUG
```
### OpenRouter Only
```env
# Single API for multiple models
DEFAULT_MODEL=auto
OPENROUTER_API_KEY=your-openrouter-key
OPENROUTER_ALLOWED_MODELS=opus,sonnet,gpt-4
LOG_LEVEL=INFO
```
## Important Notes
**Local Networking:**
- Use standard localhost URLs for local models
- The server runs as a native Python process
**API Key Priority:**
- Native APIs take priority over OpenRouter when both are configured
- Avoid configuring both native and OpenRouter for the same models
**Model Restrictions:**
- Apply to all usage including auto mode
- Empty/unset = all models allowed
- Invalid model names are warned about at startup
**Configuration Changes:**
- Restart the server with `./run-server.sh` after changing `.env`
- Configuration is loaded once at startup
## Related Documentation
- **[Advanced Usage Guide](advanced-usage.md)** - Advanced model usage patterns, thinking modes, and power user workflows
- **[Context Revival Guide](context-revival.md)** - Conversation persistence and context revival across sessions
- **[AI-to-AI Collaboration Guide](ai-collaboration.md)** - Multi-model coordination and conversation threading
```
--------------------------------------------------------------------------------
/tests/test_intelligent_fallback.py:
--------------------------------------------------------------------------------
```python
"""
Test suite for intelligent auto mode fallback logic
Tests the new dynamic model selection based on available API keys
"""
import os
from unittest.mock import Mock, patch
import pytest
from providers.registry import ModelProviderRegistry
from providers.shared import ProviderType
class TestIntelligentFallback:
"""Test intelligent model fallback logic"""
def setup_method(self):
"""Setup for each test - clear registry and reset providers"""
# Store original providers for restoration
registry = ModelProviderRegistry()
self._original_providers = registry._providers.copy()
self._original_initialized = registry._initialized_providers.copy()
# Clear registry completely
ModelProviderRegistry._instance = None
def teardown_method(self):
"""Cleanup after each test - restore original providers"""
# Restore original registry state
registry = ModelProviderRegistry()
registry._providers.clear()
registry._initialized_providers.clear()
registry._providers.update(self._original_providers)
registry._initialized_providers.update(self._original_initialized)
@patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test-key", "GEMINI_API_KEY": ""}, clear=False)
def test_prefers_openai_o3_mini_when_available(self):
"""Test that gpt-5 is preferred when OpenAI API key is available (based on new preference order)"""
# Register only OpenAI provider for this test
from providers.openai import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
assert fallback_model == "gpt-5" # Based on new preference order: gpt-5 before o4-mini
@patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": "test-gemini-key"}, clear=False)
def test_prefers_gemini_flash_when_openai_unavailable(self):
"""Test that gemini-2.5-flash is used when only Gemini API key is available"""
# Register only Gemini provider for this test
from providers.gemini import GeminiModelProvider
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
assert fallback_model == "gemini-2.5-flash"
@patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test-key", "GEMINI_API_KEY": "test-gemini-key"}, clear=False)
def test_prefers_openai_when_both_available(self):
"""Test that OpenAI is preferred when both API keys are available"""
# Register both OpenAI and Gemini providers
from providers.gemini import GeminiModelProvider
from providers.openai import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
assert fallback_model == "gemini-2.5-flash" # Gemini has priority now (based on new PROVIDER_PRIORITY_ORDER)
@patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": ""}, clear=False)
def test_fallback_when_no_keys_available(self):
"""Test fallback behavior when no API keys are available"""
# Register providers but with no API keys available
from providers.gemini import GeminiModelProvider
from providers.openai import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
assert fallback_model == "gemini-2.5-flash" # Default fallback
def test_available_providers_with_keys(self):
"""Test the get_available_providers_with_keys method"""
from providers.gemini import GeminiModelProvider
from providers.openai import OpenAIModelProvider
with patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test-key", "GEMINI_API_KEY": ""}, clear=False):
# Clear and register providers
ModelProviderRegistry._instance = None
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
available = ModelProviderRegistry.get_available_providers_with_keys()
assert ProviderType.OPENAI in available
assert ProviderType.GOOGLE not in available
with patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": "test-key"}, clear=False):
# Clear and register providers
ModelProviderRegistry._instance = None
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
available = ModelProviderRegistry.get_available_providers_with_keys()
assert ProviderType.GOOGLE in available
assert ProviderType.OPENAI not in available
def test_auto_mode_conversation_memory_integration(self):
"""Test that conversation memory uses intelligent fallback in auto mode"""
from utils.conversation_memory import ThreadContext, build_conversation_history
# Mock auto mode - patch the config module where these values are defined
with (
patch("config.IS_AUTO_MODE", True),
patch("config.DEFAULT_MODEL", "auto"),
patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test-key", "GEMINI_API_KEY": ""}, clear=False),
):
# Register only OpenAI provider for this test
from providers.openai import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
# Create a context with at least one turn so it doesn't exit early
from utils.conversation_memory import ConversationTurn
context = ThreadContext(
thread_id="test-123",
created_at="2023-01-01T00:00:00Z",
last_updated_at="2023-01-01T00:00:00Z",
tool_name="chat",
turns=[ConversationTurn(role="user", content="Test message", timestamp="2023-01-01T00:00:30Z")],
initial_context={},
)
# This should use gpt-5 for token calculations since OpenAI is available (new preference order)
with patch("utils.model_context.ModelContext") as mock_context_class:
mock_context_instance = Mock()
mock_context_class.return_value = mock_context_instance
mock_context_instance.calculate_token_allocation.return_value = Mock(
file_tokens=10000, history_tokens=5000
)
# Mock estimate_tokens to return integers for proper summing
mock_context_instance.estimate_tokens.return_value = 100
history, tokens = build_conversation_history(context, model_context=None)
# Verify that ModelContext was called with gpt-5 (the intelligent fallback based on new preference order)
mock_context_class.assert_called_once_with("gpt-5")
def test_auto_mode_with_gemini_only(self):
"""Test auto mode behavior when only Gemini API key is available"""
from utils.conversation_memory import ThreadContext, build_conversation_history
with (
patch("config.IS_AUTO_MODE", True),
patch("config.DEFAULT_MODEL", "auto"),
patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": "test-key"}, clear=False),
):
# Register only Gemini provider for this test
from providers.gemini import GeminiModelProvider
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
from utils.conversation_memory import ConversationTurn
context = ThreadContext(
thread_id="test-456",
created_at="2023-01-01T00:00:00Z",
last_updated_at="2023-01-01T00:00:00Z",
tool_name="analyze",
turns=[ConversationTurn(role="assistant", content="Test response", timestamp="2023-01-01T00:00:30Z")],
initial_context={},
)
with patch("utils.model_context.ModelContext") as mock_context_class:
mock_context_instance = Mock()
mock_context_class.return_value = mock_context_instance
mock_context_instance.calculate_token_allocation.return_value = Mock(
file_tokens=10000, history_tokens=5000
)
# Mock estimate_tokens to return integers for proper summing
mock_context_instance.estimate_tokens.return_value = 100
history, tokens = build_conversation_history(context, model_context=None)
# Should use gemini-2.5-flash when only Gemini is available
mock_context_class.assert_called_once_with("gemini-2.5-flash")
def test_non_auto_mode_unchanged(self):
"""Test that non-auto mode behavior is unchanged"""
from utils.conversation_memory import ThreadContext, build_conversation_history
with patch("config.IS_AUTO_MODE", False), patch("config.DEFAULT_MODEL", "gemini-2.5-pro"):
from utils.conversation_memory import ConversationTurn
context = ThreadContext(
thread_id="test-789",
created_at="2023-01-01T00:00:00Z",
last_updated_at="2023-01-01T00:00:00Z",
tool_name="thinkdeep",
turns=[
ConversationTurn(role="user", content="Test in non-auto mode", timestamp="2023-01-01T00:00:30Z")
],
initial_context={},
)
with patch("utils.model_context.ModelContext") as mock_context_class:
mock_context_instance = Mock()
mock_context_class.return_value = mock_context_instance
mock_context_instance.calculate_token_allocation.return_value = Mock(
file_tokens=10000, history_tokens=5000
)
# Mock estimate_tokens to return integers for proper summing
mock_context_instance.estimate_tokens.return_value = 100
history, tokens = build_conversation_history(context, model_context=None)
# Should use the configured DEFAULT_MODEL, not the intelligent fallback
mock_context_class.assert_called_once_with("gemini-2.5-pro")
if __name__ == "__main__":
pytest.main([__file__])
```
--------------------------------------------------------------------------------
/docs/tools/refactor.md:
--------------------------------------------------------------------------------
```markdown
# Refactor Tool - Intelligent Code Refactoring
**Comprehensive refactoring analysis with top-down decomposition strategy through workflow-driven investigation**
The `refactor` tool provides intelligent code refactoring recommendations with a focus on top-down decomposition and systematic code improvement. This workflow tool enforces systematic investigation of code smells, decomposition opportunities, and modernization possibilities across multiple steps, ensuring thorough analysis before providing expert refactoring recommendations with precise implementation guidance.
## Thinking Mode
**Default is `medium` (8,192 tokens).** Use `high` for complex legacy systems (worth the investment for thorough refactoring plans) or `max` for extremely complex codebases requiring deep analysis.
## How the Workflow Works
The refactor tool implements a **structured workflow** for systematic refactoring analysis:
**Investigation Phase (Claude-Led):**
1. **Step 1**: Claude describes the refactoring plan and begins analyzing code structure
2. **Step 2+**: Claude examines code smells, decomposition opportunities, and modernization possibilities
3. **Throughout**: Claude tracks findings, relevant files, refactoring opportunities, and confidence levels
4. **Completion**: Once investigation is thorough, Claude signals completion
**Expert Analysis Phase:**
After Claude completes the investigation (unless confidence is **complete**):
- Complete refactoring opportunity summary
- Prioritized recommendations by impact
- Precise implementation guidance with line numbers
- Final expert assessment for refactoring strategy
This workflow ensures methodical investigation before expert recommendations, resulting in more targeted and valuable refactoring plans.
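As a rough sketch (parameter names mirror the Tool Parameters section below; the file path, step count, and wording are illustrative), the arguments for a first investigation step might look like:
```json
{
  "step": "Map the structure of billing_service.py and identify decomposition candidates",
  "step_number": 1,
  "total_steps": 3,
  "next_step_required": true,
  "findings": "Initial pass: one very large class mixes invoicing, retries, and reporting concerns.",
  "relevant_files": ["/abs/path/to/billing_service.py"],
  "refactor_type": "decompose",
  "prompt": "Break the billing service into maintainable modules without changing behavior",
  "model": "pro"
}
```
Later steps increment `step_number`, accumulate `findings`, and set `next_step_required` to `false` on the final step, after which the expert analysis phase runs unless `confidence` is `complete`.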
## Model Recommendation
The refactor tool excels with models that have large context windows like Gemini Pro (1M tokens), which can analyze entire files and complex codebases simultaneously. This comprehensive view enables detection of cross-file dependencies, architectural patterns, and refactoring opportunities that might be missed when reviewing code in smaller chunks due to context constraints.
## Example Prompts
```
"Use gemini pro to decompose my_crazy_big_class.m into smaller extensions"
"Using zen's refactor decompose the all_in_one_sync_code.swift into maintainable extensions"
```
💡 **Example of a powerful prompt** to get the best out of both Claude + Flash's 1M Context:
```
"First, think about how the authentication module works, find related classes and find
any code smells, then using zen's refactor ask flash to confirm your findings but ask
it to find additional code smells and any other quick-wins and then fix these issues"
```
This results in Claude first performing its own expert analysis, encouraging it to think critically and identify links within the project code. It then prompts `flash` to review the same code with a hint—preventing it from duplicating Claude's findings and encouraging it to explore other areas that Claude did *not* discover.
## Key Features
- **Intelligent prioritization** - Refuses to work on low-priority issues when the code is unwieldy and requires decomposition first; helps identify poorly managed classes and files that need structural improvements before detail work
- **Top-down decomposition strategy** - Analyzes file → class → function levels systematically
- **Four refactor types**: `codesmells` (detect anti-patterns), `decompose` (break down large components), `modernize` (update language features), `organization` (improve structure)
- **Precise line-number references** - Provides exact line numbers for Claude to implement changes
- **Language-specific guidance** - Tailored suggestions for Python, JavaScript, Java, C#, Swift, and more
- **Style guide integration** - Uses existing project files as pattern references
- **Conservative approach** - Careful dependency analysis to prevent breaking changes
- **Multi-file analysis** - Understands cross-file relationships and dependencies
- **Priority sequencing** - Recommends implementation order for refactoring changes
- **Image support**: Analyze code architecture diagrams and legacy system charts: `"Refactor this legacy module using gemini pro with the current architecture diagram"`
## Refactor Types (Progressive Priority System)
**1. `decompose` (CRITICAL PRIORITY)** - Context-aware decomposition with adaptive thresholds:
**AUTOMATIC decomposition** (CRITICAL severity - blocks all other refactoring):
- Files >15,000 LOC, Classes >3,000 LOC, Functions >500 LOC
**EVALUATE decomposition** (contextual severity - intelligent assessment):
- Files >5,000 LOC, Classes >1,000 LOC, Functions >150 LOC
- Only recommends if genuinely improves maintainability
- Respects legacy stability, domain complexity, performance constraints
- Considers legitimate cases where size is justified (algorithms, state machines, generated code)
**2. `codesmells`** - Applied only after decomposition is complete:
- Detect long methods, complex conditionals, duplicate code, magic numbers, poor naming
**3. `modernize`** - Applied only after decomposition is complete:
- Update to modern language features (f-strings, async/await, etc.)
**4. `organization`** - Applied only after decomposition is complete:
- Improve logical grouping, separation of concerns, module structure
**Progressive Analysis:** The tool performs a top-down check (worst → bad → better) and refuses to work on lower-priority issues if critical decomposition is needed first. It understands that massive files and classes create cognitive overload that must be addressed before detail work can be effective. Legacy code that cannot be safely decomposed is handled with higher tolerance thresholds and context-sensitive exemptions.
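As a rough mental model, the severity gating above can be sketched like this (illustrative only, using the LOC thresholds listed above; the tool's real assessment is context-aware and not this literal):
```python
def decomposition_severity(file_loc: int, class_loc: int, function_loc: int) -> str:
    """Illustrative gate: CRITICAL decomposition blocks all other refactor types."""
    if file_loc > 15_000 or class_loc > 3_000 or function_loc > 500:
        return "critical"   # automatic decomposition required before anything else
    if file_loc > 5_000 or class_loc > 1_000 or function_loc > 150:
        return "evaluate"   # decompose only if it genuinely improves maintainability
    return "ok"             # codesmells / modernize / organization may proceed
```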
## Tool Parameters
**Workflow Investigation Parameters (used during step-by-step process):**
- `step`: Current investigation step description (required for each step)
- `step_number`: Current step number in refactoring sequence (required)
- `total_steps`: Estimated total investigation steps (adjustable)
- `next_step_required`: Whether another investigation step is needed
- `findings`: Discoveries and refactoring opportunities in this step (required)
- `files_checked`: All files examined during investigation
- `relevant_files`: Files directly needing refactoring (required in step 1)
- `relevant_context`: Methods/functions/classes requiring refactoring
- `issues_found`: Refactoring opportunities with severity and type
- `confidence`: Confidence level in analysis completeness (exploring/incomplete/partial/complete)
- `hypothesis`: Current assessment of refactoring priorities
**Initial Configuration (used in step 1):**
- `prompt`: Description of refactoring goals, context, and specific areas of focus (required)
- `refactor_type`: codesmells|decompose|modernize|organization (default: codesmells)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
- `focus_areas`: Specific areas to focus on (e.g., 'performance', 'readability', 'maintainability', 'security')
- `style_guide_examples`: Optional existing code files to use as style/pattern reference (absolute paths)
- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
- `use_assistant_model`: Whether to use expert analysis phase (default: true, set to false to use Claude only)
- `continuation_id`: Thread continuation ID for multi-turn conversations
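For illustration, a first workflow step might combine these parameters roughly as follows (a minimal sketch of the argument shape only; the file path and descriptions are hypothetical, and your MCP client assembles the actual tool call):
```python
step_one_arguments = {
    "step": "Map the structure of payment_service.py and list decomposition candidates",
    "step_number": 1,
    "total_steps": 3,
    "next_step_required": True,
    "findings": "PaymentService mixes HTTP handling, retry logic, and persistence",
    "relevant_files": ["/abs/path/to/payment_service.py"],
    "prompt": "Reduce the size of PaymentService without breaking its public API",
    "refactor_type": "decompose",
    "model": "pro",
}
```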
## Usage Examples
**Decomposition Analysis:**
```
"Analyze UserController.java for decomposition opportunities - it's becoming unwieldy"
```
**Code Smell Detection:**
```
"Use gemini to identify code smells in the authentication module with high thinking mode"
```
**Modernization:**
```
"Modernize legacy_parser.py to use modern Python features following examples/modern_patterns.py"
```
**Organization Improvement:**
```
"Refactor src/utils/ for better organization, focus on maintainability and readability"
```
**Legacy System Refactoring:**
```
"Use pro with max thinking to analyze this 10,000-line legacy file for decomposition strategy"
```
## Refactoring Strategy
**Top-Down Analysis:**
1. **File Level**: Identify oversized files that need splitting
2. **Class Level**: Find classes with too many responsibilities
3. **Function Level**: Locate functions that are too complex or long
4. **Code Quality**: Address smells, modernization, and organization
**Context-Aware Decisions:**
- **Domain Complexity**: Some domains legitimately require larger classes
- **Performance Constraints**: Critical path code may resist decomposition
- **Legacy Stability**: Old, working code may need gentler refactoring
- **Test Coverage**: Refactoring recommendations consider testability
**Breaking Change Prevention:**
- Analyzes dependencies before suggesting splits
- Recommends gradual migration strategies
- Identifies public API impact
- Suggests backward compatibility approaches
## Best Practices
- **Start with decomposition**: Address structural issues before cosmetic improvements
- **Provide clear context**: Explain the codebase purpose and constraints
- **Use appropriate refactor types**: Match the type to your primary concern
- **Include style examples**: Reference existing well-structured code in your project
- **Focus on high-impact areas**: Target the most problematic or frequently modified code
- **Plan implementation order**: Follow the tool's sequencing recommendations
- **Consider test coverage**: Ensure adequate tests before major structural changes
## Output Format
Refactoring analysis includes:
- **Priority Assessment**: What needs attention first and why
- **Decomposition Strategy**: Specific file/class/function splitting recommendations
- **Implementation Plan**: Step-by-step refactoring sequence
- **Line-Number References**: Exact locations for changes
- **Dependency Analysis**: Impact assessment and migration strategies
- **Risk Assessment**: Potential breaking changes and mitigation strategies
## Advanced Features
**Adaptive Thresholds:**
The tool adjusts size thresholds based on context:
- **Generated Code**: Higher tolerance for large files
- **Algorithm Implementation**: Recognizes when size is justified
- **Legacy Systems**: More conservative recommendations
- **Test Files**: Different standards for test vs production code
**Cross-File Refactoring:**
Analyzes multiple files together to understand:
- Shared responsibilities that could be extracted
- Dependencies that complicate refactoring
- Opportunities for new abstractions
- Impact of changes across the codebase
## When to Use Refactor vs Other Tools
- **Use `refactor`** for: Structural improvements, decomposition, modernization, code organization
- **Use `codereview`** for: Finding bugs and security issues with immediate fixes
- **Use `analyze`** for: Understanding code without making change recommendations
- **Use `debug`** for: Solving specific runtime issues rather than structural problems
```
--------------------------------------------------------------------------------
/tests/pii_sanitizer.py:
--------------------------------------------------------------------------------
```python
#!/usr/bin/env python3
"""
PII (Personally Identifiable Information) Sanitizer for HTTP recordings.
This module provides comprehensive sanitization of sensitive data in HTTP
request/response recordings to prevent accidental exposure of API keys,
tokens, personal information, and other sensitive data.
"""
import logging
import re
from copy import deepcopy
from dataclasses import dataclass
from re import Pattern
from typing import Any, Optional
logger = logging.getLogger(__name__)
@dataclass
class PIIPattern:
"""Defines a pattern for detecting and sanitizing PII."""
name: str
pattern: Pattern[str]
replacement: str
description: str
@classmethod
def create(cls, name: str, pattern: str, replacement: str, description: str) -> "PIIPattern":
"""Create a PIIPattern with compiled regex."""
return cls(name=name, pattern=re.compile(pattern), replacement=replacement, description=description)
class PIISanitizer:
"""Sanitizes PII from various data structures while preserving format."""
def __init__(self, patterns: Optional[list[PIIPattern]] = None):
"""Initialize with optional custom patterns."""
self.patterns: list[PIIPattern] = patterns or []
self.sanitize_enabled = True
# Add default patterns if none provided
if not patterns:
self._add_default_patterns()
def _add_default_patterns(self):
"""Add comprehensive default PII patterns."""
default_patterns = [
# API Keys - Core patterns (Bearer tokens handled in sanitize_headers)
PIIPattern.create(
name="openai_api_key_proj",
pattern=r"sk-proj-[A-Za-z0-9\-_]{48,}",
replacement="sk-proj-SANITIZED",
description="OpenAI project API keys",
),
PIIPattern.create(
name="openai_api_key",
pattern=r"sk-[A-Za-z0-9]{48,}",
replacement="sk-SANITIZED",
description="OpenAI API keys",
),
PIIPattern.create(
name="anthropic_api_key",
pattern=r"sk-ant-[A-Za-z0-9\-_]{48,}",
replacement="sk-ant-SANITIZED",
description="Anthropic API keys",
),
PIIPattern.create(
name="google_api_key",
pattern=r"AIza[A-Za-z0-9\-_]{35,}",
replacement="AIza-SANITIZED",
description="Google API keys",
),
PIIPattern.create(
name="github_tokens",
pattern=r"gh[psr]_[A-Za-z0-9]{36}",
replacement="gh_SANITIZED",
description="GitHub tokens (all types)",
),
# JWT tokens
PIIPattern.create(
name="jwt_token",
pattern=r"eyJ[A-Za-z0-9\-_]+\.eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+",
replacement="eyJ-SANITIZED",
description="JSON Web Tokens",
),
# Personal Information
PIIPattern.create(
name="email_address",
pattern=r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}",
replacement="[email protected]",
description="Email addresses",
),
PIIPattern.create(
name="ipv4_address",
pattern=r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
replacement="0.0.0.0",
description="IPv4 addresses",
),
PIIPattern.create(
name="ssn",
pattern=r"\b\d{3}-\d{2}-\d{4}\b",
replacement="XXX-XX-XXXX",
description="Social Security Numbers",
),
PIIPattern.create(
name="credit_card",
pattern=r"\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b",
replacement="XXXX-XXXX-XXXX-XXXX",
description="Credit card numbers",
),
PIIPattern.create(
name="phone_number",
pattern=r"(?:\+\d{1,3}[\s\-]?)?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{4}\b(?![\d\.\,\]\}])",
replacement="(XXX) XXX-XXXX",
description="Phone numbers (all formats)",
),
# AWS
PIIPattern.create(
name="aws_access_key",
pattern=r"AKIA[0-9A-Z]{16}",
replacement="AKIA-SANITIZED",
description="AWS access keys",
),
# Other common patterns
PIIPattern.create(
name="slack_token",
pattern=r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24,34}",
replacement="xox-SANITIZED",
description="Slack tokens",
),
PIIPattern.create(
name="stripe_key",
pattern=r"(?:sk|pk)_(?:test|live)_[0-9a-zA-Z]{24,99}",
replacement="sk_SANITIZED",
description="Stripe API keys",
),
]
self.patterns.extend(default_patterns)
def add_pattern(self, pattern: PIIPattern):
"""Add a custom PII pattern."""
self.patterns.append(pattern)
logger.info(f"Added PII pattern: {pattern.name}")
def sanitize_string(self, text: str) -> str:
"""Apply all patterns to sanitize a string."""
if not self.sanitize_enabled or not isinstance(text, str):
return text
sanitized = text
for pattern in self.patterns:
if pattern.pattern.search(sanitized):
sanitized = pattern.pattern.sub(pattern.replacement, sanitized)
logger.debug(f"Applied {pattern.name} sanitization")
return sanitized
def sanitize_headers(self, headers: dict[str, str]) -> dict[str, str]:
"""Special handling for HTTP headers."""
if not self.sanitize_enabled:
return headers
sanitized_headers = {}
for key, value in headers.items():
# Special case for Authorization headers to preserve auth type
if key.lower() == "authorization" and " " in value:
auth_type = value.split(" ", 1)[0]
if auth_type in ("Bearer", "Basic"):
sanitized_headers[key] = f"{auth_type} SANITIZED"
else:
sanitized_headers[key] = self.sanitize_string(value)
else:
# Apply standard sanitization to all other headers
sanitized_headers[key] = self.sanitize_string(value)
return sanitized_headers
def sanitize_value(self, value: Any) -> Any:
"""Recursively sanitize any value (string, dict, list, etc)."""
if not self.sanitize_enabled:
return value
if isinstance(value, str):
return self.sanitize_string(value)
elif isinstance(value, dict):
return {k: self.sanitize_value(v) for k, v in value.items()}
elif isinstance(value, list):
return [self.sanitize_value(item) for item in value]
elif isinstance(value, tuple):
return tuple(self.sanitize_value(item) for item in value)
else:
# For other types (int, float, bool, None), return as-is
return value
def sanitize_url(self, url: str) -> str:
"""Sanitize sensitive data from URLs (query params, etc)."""
if not self.sanitize_enabled:
return url
# First apply general string sanitization
url = self.sanitize_string(url)
# Parse and sanitize query parameters
if "?" in url:
base, query = url.split("?", 1)
params = []
for param in query.split("&"):
if "=" in param:
key, value = param.split("=", 1)
# Sanitize common sensitive parameter names
sensitive_params = {"key", "token", "api_key", "secret", "password"}
if key.lower() in sensitive_params:
params.append(f"{key}=SANITIZED")
else:
# Still sanitize the value for PII
params.append(f"{key}={self.sanitize_string(value)}")
else:
params.append(param)
return f"{base}?{'&'.join(params)}"
return url
def sanitize_request(self, request_data: dict[str, Any]) -> dict[str, Any]:
"""Sanitize a complete request dictionary."""
sanitized = deepcopy(request_data)
# Sanitize headers
if "headers" in sanitized:
sanitized["headers"] = self.sanitize_headers(sanitized["headers"])
# Sanitize URL
if "url" in sanitized:
sanitized["url"] = self.sanitize_url(sanitized["url"])
# Sanitize content
if "content" in sanitized:
sanitized["content"] = self.sanitize_value(sanitized["content"])
return sanitized
def sanitize_response(self, response_data: dict[str, Any]) -> dict[str, Any]:
"""Sanitize a complete response dictionary."""
sanitized = deepcopy(response_data)
# Sanitize headers
if "headers" in sanitized:
sanitized["headers"] = self.sanitize_headers(sanitized["headers"])
# Sanitize content
if "content" in sanitized:
# Handle base64 encoded content specially
if isinstance(sanitized["content"], dict) and sanitized["content"].get("encoding") == "base64":
if "data" in sanitized["content"]:
import base64
try:
# Decode, sanitize, and re-encode the actual response body
decoded_bytes = base64.b64decode(sanitized["content"]["data"])
# Attempt to decode as UTF-8 for sanitization. If it fails, it's likely binary.
try:
decoded_str = decoded_bytes.decode("utf-8")
sanitized_str = self.sanitize_string(decoded_str)
sanitized["content"]["data"] = base64.b64encode(sanitized_str.encode("utf-8")).decode(
"utf-8"
)
except UnicodeDecodeError:
# Content is not text, leave as is.
pass
except (base64.binascii.Error, TypeError):
# Handle cases where data is not valid base64
pass
# Sanitize other metadata fields
for key, value in sanitized["content"].items():
if key != "data":
sanitized["content"][key] = self.sanitize_value(value)
else:
sanitized["content"] = self.sanitize_value(sanitized["content"])
return sanitized
# Global instance for convenience
default_sanitizer = PIISanitizer()
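# Example usage of the module-level instance above (illustrative; outputs follow
# from the default patterns and header handling defined in this file):
#   default_sanitizer.sanitize_string("key=sk-" + "a" * 48)
#   # -> "key=sk-SANITIZED"
#   default_sanitizer.sanitize_headers({"Authorization": "Bearer sk-live-token"})
#   # -> {"Authorization": "Bearer SANITIZED"}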
```
--------------------------------------------------------------------------------
/tests/openai_cassettes/chat_gpt5_moon_distance.json:
--------------------------------------------------------------------------------
```json
{
"interactions": [
{
"request": {
"content": {
"messages": [
{
"content": "\nYou are a senior engineering thought-partner collaborating with another AI agent. Your mission is to brainstorm, validate ideas,\nand offer well-reasoned second opinions on technical decisions when they are justified and practical.\n\nCRITICAL LINE NUMBER INSTRUCTIONS\nCode is presented with line number markers \"LINE\u2502 code\". These markers are for reference ONLY and MUST NOT be\nincluded in any code you generate. Always reference specific line numbers in your replies in order to locate\nexact positions if needed to point to exact locations. Include a very short code excerpt alongside for clarity.\nInclude context_start_text and context_end_text as backup references. Never include \"LINE\u2502\" markers in generated code\nsnippets.\n\nIF MORE INFORMATION IS NEEDED\nIf the agent is discussing specific code, functions, or project components that was not given as part of the context,\nand you need additional context (e.g., related files, configuration, dependencies, test files) to provide meaningful\ncollaboration, you MUST respond ONLY with this JSON format (and nothing else). Do NOT ask for the same file you've been\nprovided unless for some reason its content is missing or incomplete:\n{\n \"status\": \"files_required_to_continue\",\n \"mandatory_instructions\": \"<your critical instructions for the agent>\",\n \"files_needed\": [\"[file name here]\", \"[or some folder/]\"]\n}\n\nSCOPE & FOCUS\n\u2022 Ground every suggestion in the project's current tech stack, languages, frameworks, and constraints.\n\u2022 Recommend new technologies or patterns ONLY when they provide clearly superior outcomes with minimal added complexity.\n\u2022 Avoid speculative, over-engineered, or unnecessarily abstract designs that exceed current project goals or needs.\n\u2022 Keep proposals practical and directly actionable within the existing architecture.\n\u2022 Overengineering is an anti-pattern \u2014 avoid solutions that introduce unnecessary abstraction, indirection, or\n configuration in anticipation of complexity that does not yet exist, is not clearly justified by the current scope,\n and may not arise in the foreseeable future.\n\nCOLLABORATION APPROACH\n1. Treat the collaborating agent as an equally senior peer. Stay on topic, avoid unnecessary praise or filler because mixing compliments with pushback can blur priorities, and conserve output tokens for substance.\n2. Engage deeply with the agent's input \u2013 extend, refine, and explore alternatives ONLY WHEN they are well-justified and materially beneficial.\n3. Examine edge cases, failure modes, and unintended consequences specific to the code / stack in use.\n4. Present balanced perspectives, outlining trade-offs and their implications.\n5. Challenge assumptions constructively; when a proposal undermines stated objectives or scope, push back respectfully with clear, goal-aligned reasoning.\n6. Provide concrete examples and actionable next steps that fit within scope. Prioritize direct, achievable outcomes.\n7. 
Ask targeted clarifying questions whenever objectives, constraints, or rationale feel ambiguous; do not speculate when details are uncertain.\n\nBRAINSTORMING GUIDELINES\n\u2022 Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment.\n\u2022 Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted.\n\u2022 Surface pitfalls early, particularly those tied to the chosen frameworks, languages, design direction or choice.\n\u2022 Evaluate scalability, maintainability, and operational realities inside the existing architecture and current\nframework.\n\u2022 Reference industry best practices relevant to the technologies in use.\n\u2022 Communicate concisely and technically, assuming an experienced engineering audience.\n\nREMEMBER\nAct as a peer, not a lecturer. Avoid overcomplicating. Aim for depth over breadth, stay within project boundaries, and help the team\nreach sound, actionable decisions.\n",
"role": "system"
},
{
"content": "=== USER REQUEST ===\nUse chat with gpt5 and ask how far the moon is from earth.\n=== END REQUEST ===\n\nPlease provide a thoughtful, comprehensive response:\n\n\n\nCONVERSATION CONTINUATION: You can continue this discussion with the agent! (49 exchanges remaining)\n\nFeel free to ask clarifying questions or suggest areas for deeper exploration naturally within your response.\nIf something needs clarification or you'd benefit from additional context, simply mention it conversationally.\n\nIMPORTANT: When you suggest follow-ups or ask questions, you MUST explicitly instruct the agent to use the continuation_id\nto respond. Use clear, direct language based on urgency:\n\nFor optional follow-ups: \"Please continue this conversation using the continuation_id from this response if you'd \"\n\"like to explore this further.\"\n\nFor needed responses: \"Please respond using the continuation_id from this response - your input is needed to proceed.\"\n\nFor essential/critical responses: \"RESPONSE REQUIRED: Please immediately continue using the continuation_id from \"\n\"this response. Cannot proceed without your clarification/input.\"\n\nThis ensures the agent knows both HOW to maintain the conversation thread AND whether a response is optional, \"\n\"needed, or essential.\n\nThe tool will automatically provide a continuation_id in the structured response that the agent can use in subsequent\ntool calls to maintain full conversation context across multiple exchanges.\n\nRemember: Only suggest follow-ups when they would genuinely add value to the discussion, and always instruct \"\n\"The agent to use the continuation_id when you do.",
"role": "user"
}
],
"model": "gpt-5",
"stream": false,
"temperature": 1.0
},
"headers": {
"accept": "application/json",
"accept-encoding": "gzip, deflate",
"authorization": "Bearer SANITIZED",
"connection": "keep-alive",
"content-length": "5761",
"content-type": "application/json",
"host": "api.openai.com",
"user-agent": "OpenAI/Python 2.1.0",
"x-stainless-arch": "arm64",
"x-stainless-async": "false",
"x-stainless-lang": "python",
"x-stainless-os": "MacOS",
"x-stainless-package-version": "2.1.0",
"x-stainless-read-timeout": "900.0",
"x-stainless-retry-count": "0",
"x-stainless-runtime": "CPython",
"x-stainless-runtime-version": "3.12.11"
},
"method": "POST",
"path": "/v1/chat/completions",
"url": "https://api.openai.com/v1/chat/completions"
},
"response": {
"content": {
"data": "ewogICJpZCI6ICJjaGF0Y21wbC1DTXRhVVltemtIZE9ENTNqNUh1U3ZPZ1RUN2dCdyIsCiAgIm9iamVjdCI6ICJjaGF0LmNvbXBsZXRpb24iLAogICJjcmVhdGVkIjogMTc1OTU3MjkzMCwKICAibW9kZWwiOiAiZ3B0LTUtMjAyNS0wOC0wNyIsCiAgImNob2ljZXMiOiBbCiAgICB7CiAgICAgICJpbmRleCI6IDAsCiAgICAgICJtZXNzYWdlIjogewogICAgICAgICJyb2xlIjogImFzc2lzdGFudCIsCiAgICAgICAgImNvbnRlbnQiOiAiSSBjYW7igJl0IGluaXRpYXRlIGEgc2VwYXJhdGUg4oCcY2hhdCB3aXRoIEdQVOKAkTXigJ0gZnJvbSB0aGlzIGludGVyZmFjZSwgYnV0IGhlcmXigJlzIHRoZSBhbnN3ZXIgZGlyZWN0bHk6XG5cbi0gQXZlcmFnZSBFYXJ0aOKAk01vb24gZGlzdGFuY2UgKGNlbnRlci10by1jZW50ZXIpOiB+Mzg0LDQwMCBrbSAoMjM4LDg1NSBtaWxlcylcbi0gSXQgdmFyaWVzIG92ZXIgdGhlIG9yYml0OlxuICAtIFBlcmlnZWUgKGNsb3Nlc3QpOiB0eXBpY2FsbHkgfjM2MywzMDAga207IGV4dHJlbWUg4oCcc3VwZXJtb29u4oCdIGNhc2VzIGRvd24gdG8gfjM1Niw1MDAga21cbiAgLSBBcG9nZWUgKGZhcnRoZXN0KTogdHlwaWNhbGx5IH40MDUsNTAwIGttOyBleHRyZW1lIGNhc2VzIHVwIHRvIH40MDYsNzAwIGttXG4tIExpZ2h0IHRyYXZlbCB0aW1lOiB+MS4yOCBzZWNvbmRzIG9uZSB3YXkgKH4yLjU2IHNlY29uZHMgcm91bmQgdHJpcClcblxuTm90ZXM6XG4tIFRoZXNlIGFyZSBjZW50ZXItdG8tY2VudGVyIGRpc3RhbmNlczsgc3VyZmFjZS10by1zdXJmYWNlIGRpZmZlcnMgYnkgcm91Z2hseSBvbmUgRWFydGggcmFkaXVzICh+NiwzNzEga20pIHBsdXMgb25lIE1vb24gcmFkaXVzICh+MSw3Mzcga20pLlxuLSBUaGUgZXhhY3QgZGlzdGFuY2UgYXQgYSBnaXZlbiBtb21lbnQgZGVwZW5kcyBvbiB3aGVyZSB0aGUgTW9vbiBpcyBpbiBpdHMgZWxsaXB0aWNhbCBvcmJpdCBhbmQgc21hbGwgcGVydHVyYmF0aW9ucy5cblxuSWYgeW91IHNwZWNpZmljYWxseSB3YW50IHRoZSByZWFsLXRpbWUgZGlzdGFuY2UgZm9yIGEgcGFydGljdWxhciB0aW1lc3RhbXAgb3IgeW91ciBjdXJyZW50IGxvY2F0aW9uLCBJIGNhbiBwcm92aWRlIGEgcXVpY2sgUHl0aG9uIHNuaXBwZXQgdG8gY29tcHV0ZSBpdCB1c2luZyBwdWJsaXNoZWQgZXBoZW1lcmlkZXMsIG9yIG91dGxpbmUgaG93IHRvIHF1ZXJ5IEpQTCBIb3Jpem9ucy4gUGxlYXNlIGNvbnRpbnVlIHRoaXMgY29udmVyc2F0aW9uIHVzaW5nIHRoZSBjb250aW51YXRpb25faWQgZnJvbSB0aGlzIHJlc3BvbnNlIGlmIHlvdSdkIGxpa2UgdG8gZXhwbG9yZSB0aGlzIGZ1cnRoZXIuIiwKICAgICAgICAicmVmdXNhbCI6IG51bGwsCiAgICAgICAgImFubm90YXRpb25zIjogW10KICAgICAgfSwKICAgICAgImZpbmlzaF9yZWFzb24iOiAic3RvcCIKICAgIH0KICBdLAogICJ1c2FnZSI6IHsKICAgICJwcm9tcHRfdG9rZW5zIjogMTAzMSwKICAgICJjb21wbGV0aW9uX3Rva2VucyI6IDEyODIsCiAgICAidG90YWxfdG9rZW5zIjogMjMxMywKICAgICJwcm9tcHRfdG9rZW5zX2RldGFpbHMiOiB7CiAgICAgICJjYWNoZWRfdG9rZW5zIjogMCwKICAgICAgImF1ZGlvX3Rva2VucyI6IDAKICAgIH0sCiAgICAiY29tcGxldGlvbl90b2tlbnNfZGV0YWlscyI6IHsKICAgICAgInJlYXNvbmluZ190b2tlbnMiOiAxMDI0LAogICAgICAiYXVkaW9fdG9rZW5zIjogMCwKICAgICAgImFjY2VwdGVkX3ByZWRpY3Rpb25fdG9rZW5zIjogMCwKICAgICAgInJlamVjdGVkX3ByZWRpY3Rpb25fdG9rZW5zIjogMAogICAgfQogIH0sCiAgInNlcnZpY2VfdGllciI6ICJkZWZhdWx0IiwKICAic3lzdGVtX2ZpbmdlcnByaW50IjogbnVsbAp9Cg==",
"encoding": "base64",
"size": 1852
},
"headers": {
"access-control-expose-headers": "X-Request-ID",
"alt-svc": "h3=\":443\"; ma=86400",
"cf-cache-status": "DYNAMIC",
"cf-ray": "9893eb1c5e319955-DXB",
"connection": "keep-alive",
"content-encoding": "gzip",
"content-type": "application/json",
"date": "Sat, 04 Oct 2025 10:15:53 GMT",
"openai-organization": "beehive-innovations-fze",
"openai-processing-ms": "23138",
"openai-project": "proj_QP57xBVPOlWpp0vuJEPGwXK3",
"openai-version": "2020-10-01",
"server": "cloudflare",
"set-cookie": "__cf_bm=SX4Kpmnp8xfRjEMeZl2CAmWzbnKLdJsgmRNI_gV7y1o-(XXX) XXX-XXXX-0.0.0.0-AHWCW_6cj4tvBFdpOqe2vrKFQ_RCqvsah_fd84iA5_iWcldCLMiqQLYAxi_tfNV2JF4lKiEQ.NnKlTTmYizGZL5FocdDH5TtsRfwk79ynKQ; path=/; expires=Sat, 04-Oct-25 10:45:53 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, _cfuvid=IdmGGBJSF6eM7H.VcOaFLYIKXWpW73q3o7BpEi3LgB4-175(XXX) XXX-XXXX-0.0.0.0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
"strict-transport-security": "max-age=31536000; includeSubDomains; preload",
"transfer-encoding": "chunked",
"x-content-type-options": "nosniff",
"x-envoy-upstream-service-time": "23301",
"x-openai-proxy-wasm": "v0.1",
"x-ratelimit-limit-requests": "500",
"x-ratelimit-limit-tokens": "500000",
"x-ratelimit-remaining-requests": "499",
"x-ratelimit-remaining-tokens": "498616",
"x-ratelimit-reset-requests": "120ms",
"x-ratelimit-reset-tokens": "166ms",
"x-request-id": "req_971ea85e39754535bfabcddf4528208c"
},
"reason_phrase": "OK",
"status_code": 200
}
}
]
}
```
--------------------------------------------------------------------------------
/docs/custom_models.md:
--------------------------------------------------------------------------------
```markdown
# Custom Models & API Setup
This guide covers setting up multiple AI model providers including OpenRouter, custom API endpoints, and local model servers. The Zen MCP server supports a unified configuration for all these providers through a single model registry.
## Supported Providers
- **OpenRouter** - Unified access to multiple commercial models (GPT-4, Claude, Mistral, etc.)
- **Custom API endpoints** - Local models (Ollama, vLLM, LM Studio, text-generation-webui)
- **Self-hosted APIs** - Any OpenAI-compatible endpoint
## When to Use What
**Use OpenRouter when you want:**
- Access to models not available through native APIs (GPT-4, Claude, Mistral, etc.)
- Simplified billing across multiple model providers
- Experimentation with various models without separate API keys
**Use Custom URLs for:**
- **Local models** like Ollama (Llama, Mistral, etc.)
- **Self-hosted inference** with vLLM, LM Studio, text-generation-webui
- **Private/enterprise APIs** that use OpenAI-compatible format
- **Cost control** with local hardware
**Use native APIs (Gemini/OpenAI) when you want:**
- Direct access to specific providers without intermediary
- Potentially lower latency and costs
- Access to the latest model features immediately upon release
**Mix & Match:** You can use multiple providers simultaneously! For example:
- OpenRouter for expensive commercial models (GPT-4, Claude)
- Custom URLs for local models (Ollama Llama)
- Native APIs for specific providers (Gemini Pro with extended thinking)
**Note:** When multiple providers offer the same model name, native APIs take priority over OpenRouter.
## Model Aliases
Zen ships multiple registries:
- `conf/openai_models.json` – native OpenAI catalogue (override with `OPENAI_MODELS_CONFIG_PATH`)
- `conf/gemini_models.json` – native Google Gemini catalogue (`GEMINI_MODELS_CONFIG_PATH`)
- `conf/xai_models.json` – native X.AI / GROK catalogue (`XAI_MODELS_CONFIG_PATH`)
- `conf/openrouter_models.json` – OpenRouter catalogue (`OPENROUTER_MODELS_CONFIG_PATH`)
- `conf/dial_models.json` – DIAL aggregation catalogue (`DIAL_MODELS_CONFIG_PATH`)
- `conf/custom_models.json` – local/self-hosted OpenAI-compatible catalogue (`CUSTOM_MODELS_CONFIG_PATH`)
Copy whichever file you need into your project (or point the corresponding `*_MODELS_CONFIG_PATH` env var at your own copy) and edit it to advertise the models you want.
### OpenRouter Models (Cloud)
The curated defaults in `conf/openrouter_models.json` include popular entries such as:
| Alias | Canonical Model | Highlights |
|-------|-----------------|------------|
| `opus`, `claude-opus` | `anthropic/claude-opus-4.1` | Flagship Claude reasoning model with vision |
| `sonnet`, `sonnet4.5` | `anthropic/claude-sonnet-4.5` | Balanced Claude with high context window |
| `haiku` | `anthropic/claude-3.5-haiku` | Fast Claude option with vision |
| `pro`, `gemini` | `google/gemini-2.5-pro` | Frontier Gemini with extended thinking |
| `flash` | `google/gemini-2.5-flash` | Ultra-fast Gemini with vision |
| `mistral` | `mistralai/mistral-large-2411` | Frontier Mistral (text only) |
| `llama3` | `meta-llama/llama-3-70b` | Large open-weight text model |
| `deepseek-r1` | `deepseek/deepseek-r1-0528` | DeepSeek reasoning model |
| `perplexity` | `perplexity/llama-3-sonar-large-32k-online` | Search-augmented model |
Consult the JSON file for the full list, aliases, and capability flags. Add new entries as OpenRouter releases additional models.
### Custom/Local Models
| Alias | Maps to Local Model | Note |
|-------|-------------------|------|
| `local-llama`, `local` | `llama3.2` | Requires `CUSTOM_API_URL` configured |
View the baseline OpenRouter catalogue in [`conf/openrouter_models.json`](../conf/openrouter_models.json) and populate [`conf/custom_models.json`](../conf/custom_models.json) with your local models.
Native catalogues (`conf/openai_models.json`, `conf/gemini_models.json`, `conf/xai_models.json`, `conf/dial_models.json`) follow the same schema. Updating those files lets you:
- Expose new aliases (e.g., map `enterprise-pro` to `gpt-5-pro`)
- Advertise support for JSON mode or vision if the upstream provider adds it
- Adjust token limits when providers increase context windows
Because providers load the manifests on import, you can tweak capabilities without touching Python. Restart the server after editing the JSON files so changes are picked up.
To control ordering in auto mode or the `listmodels` summary, adjust the
[`intelligence_score`](model_ranking.md) for each entry (or rely on the automatic
heuristic described there).
**Note:** While you can use any OpenRouter model by its full name, models not in the config file will use generic capabilities (32K context window, no extended thinking, etc.) which may not match the model's actual capabilities. For best results, add new models to the config file with their proper specifications.
## Quick Start
### Option 1: OpenRouter Setup
#### 1. Get API Key
1. Sign up at [openrouter.ai](https://openrouter.ai/)
2. Create an API key from your dashboard
3. Add credits to your account
#### 2. Set Environment Variable
```bash
# Add to your .env file
OPENROUTER_API_KEY=your-openrouter-api-key
```
> **Note:** Control which models can be used directly in your OpenRouter dashboard at [openrouter.ai](https://openrouter.ai/).
> This gives you centralized control over model access and spending limits.
That's it! The setup script handles all necessary configuration automatically.
### Option 2: Custom API Setup (Ollama, vLLM, etc.)
For local models like Ollama, vLLM, LM Studio, or any OpenAI-compatible API:
#### 1. Start Your Local Model Server
```bash
# Example: Ollama
ollama serve
ollama pull llama3.2
# Example: vLLM
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf
# Example: LM Studio (enable OpenAI compatibility in settings)
# Server runs on localhost:1234
```
#### 2. Configure Environment Variables
```bash
# Add to your .env file
CUSTOM_API_URL=http://localhost:11434/v1 # Ollama example
CUSTOM_API_KEY= # Empty for Ollama (no auth needed)
CUSTOM_MODEL_NAME=llama3.2 # Default model to use
```
**Local Model Connection**
The Zen MCP server runs natively, so you can use standard localhost URLs to connect to local models:
```bash
# For Ollama, vLLM, LM Studio, etc. running on your machine
CUSTOM_API_URL=http://localhost:11434/v1 # Ollama default port
```
#### 3. Examples for Different Platforms
**Ollama:**
```bash
CUSTOM_API_URL=http://localhost:11434/v1
CUSTOM_API_KEY=
CUSTOM_MODEL_NAME=llama3.2
```
**vLLM:**
```bash
CUSTOM_API_URL=http://localhost:8000/v1
CUSTOM_API_KEY=
CUSTOM_MODEL_NAME=meta-llama/Llama-2-7b-chat-hf
```
**LM Studio:**
```bash
CUSTOM_API_URL=http://localhost:1234/v1
CUSTOM_API_KEY=lm-studio # Or any value, LM Studio often requires some key
CUSTOM_MODEL_NAME=local-model
```
**text-generation-webui (with OpenAI extension):**
```bash
CUSTOM_API_URL=http://localhost:5001/v1
CUSTOM_API_KEY=
CUSTOM_MODEL_NAME=your-loaded-model
```
## Using Models
**Using model aliases (from the registry files):**
```
# OpenRouter models:
"Use opus for deep analysis" # → anthropic/claude-opus-4
"Use sonnet to review this code" # → anthropic/claude-sonnet-4
"Use pro via zen to analyze this" # → google/gemini-2.5-pro
"Use gpt4o via zen to analyze this" # → openai/gpt-4o
"Use mistral via zen to optimize" # → mistral/mistral-large
# Local models (with custom URL configured):
"Use local-llama to analyze this code" # → llama3.2 (local)
"Use local to debug this function" # → llama3.2 (local)
```
**Using full model names:**
```
# OpenRouter models:
"Use anthropic/claude-opus-4 via zen for deep analysis"
"Use openai/gpt-4o via zen to debug this"
"Use deepseek/deepseek-coder via zen to generate code"
# Local/custom models:
"Use llama3.2 via zen to review this"
"Use meta-llama/Llama-2-7b-chat-hf via zen to analyze"
```
**For OpenRouter:** Check current model pricing at [openrouter.ai/models](https://openrouter.ai/models).
**For Local models:** Context window and capabilities are defined in `conf/custom_models.json`.
## Model Provider Selection
The system automatically routes models to the appropriate provider:
1. Entries in `conf/custom_models.json` → Always routed through the Custom API (requires `CUSTOM_API_URL`)
2. Entries in `conf/openrouter_models.json` → Routed through OpenRouter (requires `OPENROUTER_API_KEY`)
3. **Unknown models** → Fallback logic based on model name patterns
**Provider Priority Order:**
1. Native APIs (Google, OpenAI) - if API keys are available
2. Custom endpoints - for models declared in `conf/custom_models.json`
3. OpenRouter - catch-all for cloud models
This ensures clean separation between local and cloud models while maintaining flexibility for unknown models.
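For intuition, the routing above behaves roughly like the following sketch (conceptual only, not the server's actual registry code; the model names and the `GEMINI_API_KEY`/`OPENAI_API_KEY` variable names are illustrative assumptions):
```python
import os
# Hypothetical stand-ins for the JSON manifests described above.
NATIVE_MODELS = {"gemini-2.5-pro": "GEMINI_API_KEY", "gpt-5": "OPENAI_API_KEY"}
CUSTOM_MODELS = {"llama3.2"}                        # conf/custom_models.json entries
OPENROUTER_MODELS = {"anthropic/claude-opus-4.1"}   # conf/openrouter_models.json entries

def pick_provider(model_name: str) -> str:
    """Conceptual routing order: native API, then custom endpoint, then OpenRouter."""
    key_var = NATIVE_MODELS.get(model_name)
    if key_var and os.getenv(key_var):
        return "native"
    if model_name in CUSTOM_MODELS and os.getenv("CUSTOM_API_URL"):
        return "custom"
    if os.getenv("OPENROUTER_API_KEY"):
        return "openrouter"
    raise LookupError(f"No configured provider can serve {model_name}")
```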
## Model Configuration
These JSON files define model aliases and capabilities. You can:
1. **Use the default configuration** - Includes popular models with convenient aliases
2. **Customize the configuration** - Add your own models and aliases
3. **Override the config path** - Set `CUSTOM_MODELS_CONFIG_PATH` environment variable to an absolute path on disk
### Adding Custom Models
Edit `conf/openrouter_models.json` to tweak OpenRouter behaviour or `conf/custom_models.json` to add local models. Each entry maps directly onto [`ModelCapabilities`](../providers/shared/model_capabilities.py).
#### Adding an OpenRouter Model
```json
{
"model_name": "vendor/model-name",
"aliases": ["short-name", "nickname"],
"context_window": 128000,
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "Model description"
}
```
#### Adding a Custom/Local Model
```json
{
"model_name": "my-local-model",
"aliases": ["local-model", "custom"],
"context_window": 128000,
"supports_extended_thinking": false,
"supports_json_mode": false,
"supports_function_calling": false,
"description": "My custom Ollama/vLLM model"
}
```
**Field explanations:**
- `model_name`: The model identifier (OpenRouter format like `vendor/model` or local name like `llama3.2`)
- `aliases`: Array of short names users can type instead of the full model name
- `context_window`: Total tokens the model can process (input + output combined)
- `supports_extended_thinking`: Whether the model has extended reasoning capabilities
- `supports_json_mode`: Whether the model can guarantee valid JSON output
- `supports_function_calling`: Whether the model supports function/tool calling
- `description`: Human-readable description of the model
**Important:** Keep OpenRouter and Custom models in their respective files so that requests are routed correctly.
## Available Models
Popular models available through OpenRouter:
- **GPT-4** - OpenAI's most capable model
- **Claude 4** - Anthropic's models (Opus, Sonnet, Haiku)
- **Mistral** - Including Mistral Large
- **Llama 3** - Meta's open models
- Many more at [openrouter.ai/models](https://openrouter.ai/models)
## Troubleshooting
- **"Model not found"**: Check exact model name at openrouter.ai/models
- **"Insufficient credits"**: Add credits to your OpenRouter account
- **"Model not available"**: Check your OpenRouter dashboard for model access permissions
```
--------------------------------------------------------------------------------
/docs/adding_providers.md:
--------------------------------------------------------------------------------
```markdown
# Adding a New Provider
This guide explains how to add support for a new AI model provider to the Zen MCP Server. The provider system is designed to be extensible and follows a simple pattern.
## Overview
Each provider:
- Inherits from `ModelProvider` (base class) or `OpenAICompatibleProvider` (for OpenAI-compatible APIs)
- Defines supported models using `ModelCapabilities` objects
- Implements the minimal abstract hooks (`get_provider_type()` and `generate_content()`)
- Gets wired into `configure_providers()` so environment variables control activation
- Can leverage helper subclasses (e.g., `AzureOpenAIProvider`) when only client wiring differs
### Intelligence score cheatsheet
Set `intelligence_score` (1–20) when you want deterministic ordering in auto
mode or the `listmodels` output. The runtime rank starts from this human score
and adds smaller bonuses for context window, extended thinking, and other
features ([details here](model_ranking.md)).
## Choose Your Implementation Path
**Option A: Full Provider (`ModelProvider`)**
- For APIs with unique features or custom authentication
- Complete control over API calls and response handling
- Populate `MODEL_CAPABILITIES`, implement `generate_content()` and `get_provider_type()`, and only override `get_all_model_capabilities()` / `_lookup_capabilities()` when your catalogue comes from a registry or remote source (override `count_tokens()` only when you have a provider-accurate tokenizer)
**Option B: OpenAI-Compatible (`OpenAICompatibleProvider`)**
- For APIs that follow OpenAI's chat completion format
- Supply `MODEL_CAPABILITIES`, override `get_provider_type()`, and optionally adjust configuration (the base class handles alias resolution, validation, and request wiring)
- Inherits all API handling automatically
⚠️ **Important**: If you implement a custom `generate_content()`, call `_resolve_model_name()` before invoking the SDK so aliases (e.g. `"gpt"` → `"gpt-4"`) resolve correctly. The shared implementations already do this for you.
**Option C: Azure OpenAI (`AzureOpenAIProvider`)**
- For Azure-hosted deployments of OpenAI models
- Reuses the OpenAI-compatible pipeline but swaps in the `AzureOpenAI` client and a deployment mapping (canonical model → deployment ID)
- Define deployments in [`conf/azure_models.json`](../conf/azure_models.json) (or the file referenced by `AZURE_MODELS_CONFIG_PATH`).
- Entries follow the [`ModelCapabilities`](../providers/shared/model_capabilities.py) schema and must include a `deployment` identifier.
See [Azure OpenAI Configuration](azure_openai.md) for a step-by-step walkthrough.
## Step-by-Step Guide
### 1. Add Provider Type
Add your provider to the `ProviderType` enum in `providers/shared/provider_type.py`:
```python
class ProviderType(Enum):
GOOGLE = "google"
OPENAI = "openai"
EXAMPLE = "example" # Add this
```
### 2. Create the Provider Implementation
#### Option A: Full Provider (Native Implementation)
Create `providers/example.py`:
```python
"""Example model provider implementation."""
import logging
from typing import Optional
from .base import ModelProvider
from .shared import (
ModelCapabilities,
ModelResponse,
ProviderType,
RangeTemperatureConstraint,
)
logger = logging.getLogger(__name__)
class ExampleModelProvider(ModelProvider):
"""Example model provider implementation."""
MODEL_CAPABILITIES = {
"example-large": ModelCapabilities(
provider=ProviderType.EXAMPLE,
model_name="example-large",
friendly_name="Example Large",
intelligence_score=18,
context_window=100_000,
max_output_tokens=50_000,
supports_extended_thinking=False,
temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 0.7),
description="Large model for complex tasks",
aliases=["large", "big"],
),
"example-small": ModelCapabilities(
provider=ProviderType.EXAMPLE,
model_name="example-small",
friendly_name="Example Small",
intelligence_score=14,
context_window=32_000,
max_output_tokens=16_000,
temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 0.7),
description="Fast model for simple tasks",
aliases=["small", "fast"],
),
}
def __init__(self, api_key: str, **kwargs):
super().__init__(api_key, **kwargs)
# Initialize your API client here
def get_all_model_capabilities(self) -> dict[str, ModelCapabilities]:
return dict(self.MODEL_CAPABILITIES)
def get_provider_type(self) -> ProviderType:
return ProviderType.EXAMPLE
def generate_content(
self,
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.7,
max_output_tokens: Optional[int] = None,
**kwargs,
) -> ModelResponse:
resolved_name = self._resolve_model_name(model_name)
# Your API call logic here
# response = your_api_client.generate(...)
return ModelResponse(
content="Generated response",
usage={"input_tokens": 100, "output_tokens": 50, "total_tokens": 150},
model_name=resolved_name,
friendly_name="Example",
provider=ProviderType.EXAMPLE,
)
```
`ModelProvider.get_capabilities()` automatically resolves aliases, enforces the
shared restriction service, and returns the correct `ModelCapabilities`
instance. Override `_lookup_capabilities()` only when you source capabilities
from a registry or remote API. `ModelProvider.count_tokens()` uses a simple
4-characters-per-token estimate so providers work out of the box—override it
only when you can call the provider's real tokenizer (for example, the
OpenAI-compatible base class integrates `tiktoken`).
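If you do have an accurate tokenizer, the override is small. A sketch using `tiktoken` (the exact `count_tokens()` signature shown here is an assumption; check `providers/base.py` for the real one):
```python
import tiktoken
_ENCODER = tiktoken.get_encoding("cl100k_base")

class ExampleTokenizedProvider(ExampleModelProvider):
    def count_tokens(self, text: str, model_name: str) -> int:
        # Replace the default 4-characters-per-token estimate with a real count.
        return len(_ENCODER.encode(text))
```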
#### Option B: OpenAI-Compatible Provider (Simplified)
For OpenAI-compatible APIs:
```python
"""Example OpenAI-compatible provider."""
from typing import Optional
from .openai_compatible import OpenAICompatibleProvider
from .shared import (
ModelCapabilities,
ModelResponse,
ProviderType,
RangeTemperatureConstraint,
)
class ExampleProvider(OpenAICompatibleProvider):
"""Example OpenAI-compatible provider."""
FRIENDLY_NAME = "Example"
# Define models using ModelCapabilities (consistent with other providers)
MODEL_CAPABILITIES = {
"example-model-large": ModelCapabilities(
provider=ProviderType.EXAMPLE,
model_name="example-model-large",
friendly_name="Example Large",
context_window=128_000,
max_output_tokens=64_000,
temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 0.7),
aliases=["large", "big"],
),
}
def __init__(self, api_key: str, **kwargs):
kwargs.setdefault("base_url", "https://api.example.com/v1")
super().__init__(api_key, **kwargs)
def get_provider_type(self) -> ProviderType:
return ProviderType.EXAMPLE
```
`OpenAICompatibleProvider` already exposes the declared models via
`MODEL_CAPABILITIES`, resolves aliases through the shared base pipeline, and
enforces restrictions. Most subclasses only need to provide the class metadata
shown above.
### 3. Register Your Provider
Add environment variable mapping in `providers/registry.py`:
```python
# In _get_api_key_for_provider (providers/registry.py), add:
ProviderType.EXAMPLE: "EXAMPLE_API_KEY",
```
Add to `server.py`:
1. **Import your provider**:
```python
from providers.example import ExampleModelProvider
```
2. **Add to `configure_providers()` function**:
```python
# Check for Example API key
example_key = os.getenv("EXAMPLE_API_KEY")
if example_key:
ModelProviderRegistry.register_provider(ProviderType.EXAMPLE, ExampleModelProvider)
logger.info("Example API key found - Example models available")
```
3. **Add to provider priority** (edit `ModelProviderRegistry.PROVIDER_PRIORITY_ORDER` in `providers/registry.py`): insert your provider in the list at the appropriate point in the cascade of native → custom → catch-all providers.
### 4. Environment Configuration
Add to your `.env` file:
```bash
# Your provider's API key
EXAMPLE_API_KEY=your_api_key_here
# Optional: Disable specific tools
DISABLED_TOOLS=debug,tracer
# Optional (OpenAI-compatible providers): Restrict accessible models
EXAMPLE_ALLOWED_MODELS=example-model-large,example-model-small
```
For Azure OpenAI deployments:
```bash
AZURE_OPENAI_API_KEY=your_azure_openai_key_here
AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
# Models are defined in conf/azure_models.json (or AZURE_MODELS_CONFIG_PATH)
# AZURE_OPENAI_API_VERSION=2024-02-15-preview
# AZURE_OPENAI_ALLOWED_MODELS=gpt-4o,gpt-4o-mini
# AZURE_MODELS_CONFIG_PATH=/absolute/path/to/custom_azure_models.json
```
You can also define Azure models in [`conf/azure_models.json`](../conf/azure_models.json) (the bundled file is empty so you can copy it safely). Each entry mirrors the `ModelCapabilities` schema and must include a `deployment` field. Set `AZURE_MODELS_CONFIG_PATH` if you maintain a custom copy outside the repository.
**Note**: The `description` field in `ModelCapabilities` helps Claude choose the best model in auto mode.
### 5. Test Your Provider
Create basic tests to verify your implementation:
```python
# Test capabilities
provider = ExampleModelProvider("test-key")
capabilities = provider.get_capabilities("large")
assert capabilities.context_window > 0
assert capabilities.provider == ProviderType.EXAMPLE
```
## Key Concepts
### Provider Priority
When a user requests a model, providers are checked in priority order:
1. **Native providers** (Gemini, OpenAI, Example) - handle their specific models
2. **Custom provider** - handles local/self-hosted models
3. **OpenRouter** - catch-all for everything else
### Model Validation
`ModelProvider.validate_model_name()` delegates to `get_capabilities()` so most
providers can rely on the shared implementation. Override it only when you need
to opt out of that pipeline—for example, `CustomProvider` declines OpenRouter
models so they fall through to the dedicated OpenRouter provider.
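If you do need to opt out, a minimal override might look like this (an illustrative sketch assuming the boolean `validate_model_name(model_name)` signature; the real `CustomProvider` logic is more nuanced):
```python
class ExampleLocalOnlyProvider(ExampleModelProvider):
    def validate_model_name(self, model_name: str) -> bool:
        # Decline OpenRouter-style "vendor/model" names so they fall through to
        # the dedicated OpenRouter provider; defer everything else to the shared
        # capabilities-based pipeline.
        if "/" in model_name:
            return False
        return super().validate_model_name(model_name)
```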
### Model Aliases
Aliases declared on `ModelCapabilities` are applied automatically via
`_resolve_model_name()`, and both the validation and request flows call it
before touching your SDK. Override `generate_content()` only when your provider
needs additional alias handling beyond the shared behaviour.
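Using the `ExampleModelProvider` sketched earlier (which declares `aliases=["large", "big"]` on `example-large`), alias resolution is transparent to callers:
```python
provider = ExampleModelProvider("test-key")
caps = provider.get_capabilities("big")   # alias resolves automatically
assert caps.model_name == "example-large"
```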
## Best Practices
- **Be specific in model validation** - only accept models you actually support
- **Use ModelCapabilities objects** consistently (like Gemini provider)
- **Include descriptive aliases** for better user experience
- **Add error handling** and logging for debugging
- **Test with real API calls** to verify everything works
- **Follow the existing patterns** in `providers/gemini.py` and `providers/custom.py`
## Quick Checklist
- [ ] Added to `ProviderType` enum in `providers/shared/provider_type.py`
- [ ] Created provider class with all required methods
- [ ] Added API key mapping in `providers/registry.py`
- [ ] Added to provider priority order in `registry.py`
- [ ] Imported and registered in `server.py`
- [ ] Basic tests verify model validation and capabilities
- [ ] Tested with real API calls
## Examples
See existing implementations:
- **Full provider**: `providers/gemini.py`
- **OpenAI-compatible**: `providers/custom.py`
- **Base classes**: `providers/base.py`
```
--------------------------------------------------------------------------------
/tests/test_utf8_localization.py:
--------------------------------------------------------------------------------
```python
"""
Unit tests to validate UTF-8 localization and encoding
of French characters.
These tests check:
1. Language instruction generation according to LOCALE
2. UTF-8 encoding with json.dumps(ensure_ascii=False)
3. French characters and emojis are displayed correctly
4. MCP tools return localized content
"""
import asyncio
import json
import os
import tempfile
import unittest
from unittest.mock import Mock
from tools.shared.base_tool import BaseTool
class MockTestTool(BaseTool):
"""Concrete implementation of BaseTool for testing."""
def __init__(self):
super().__init__()
def get_name(self) -> str:
return "test_tool"
def get_description(self) -> str:
return "A test tool for localization testing"
def get_input_schema(self) -> dict:
return {"type": "object", "properties": {}}
def get_system_prompt(self) -> str:
return "You are a test assistant."
def get_request_model(self):
from tools.shared.base_models import ToolRequest
return ToolRequest
async def prepare_prompt(self, request) -> str:
return "Test prompt"
async def execute(self, arguments: dict) -> list:
return [Mock(text="test response")]
class TestUTF8Localization(unittest.TestCase):
"""Tests for UTF-8 localization and French character encoding."""
def setUp(self):
"""Test setup."""
self.original_locale = os.getenv("LOCALE")
def tearDown(self):
"""Cleanup after tests."""
if self.original_locale is not None:
os.environ["LOCALE"] = self.original_locale
else:
os.environ.pop("LOCALE", None)
def test_language_instruction_generation_french(self):
"""Test language instruction generation for French."""
# Set LOCALE to French
os.environ["LOCALE"] = "fr-FR"
# Test get_language_instruction method
tool = MockTestTool()
instruction = tool.get_language_instruction() # Checks
self.assertIsInstance(instruction, str)
self.assertIn("fr-FR", instruction)
self.assertTrue(instruction.endswith("\n\n"))
def test_language_instruction_generation_english(self):
"""Test language instruction generation for English."""
# Set LOCALE to English
os.environ["LOCALE"] = "en-US"
tool = MockTestTool()
instruction = tool.get_language_instruction() # Checks
self.assertIsInstance(instruction, str)
self.assertIn("en-US", instruction)
self.assertTrue(instruction.endswith("\n\n"))
def test_language_instruction_empty_locale(self):
"""Test with empty LOCALE."""
# Set LOCALE to empty
os.environ["LOCALE"] = ""
tool = MockTestTool()
instruction = tool.get_language_instruction()
# Should return empty string
self.assertEqual(instruction, "")
def test_language_instruction_no_locale(self):
"""Test with no LOCALE variable set."""
# Remove LOCALE
os.environ.pop("LOCALE", None)
tool = MockTestTool()
instruction = tool.get_language_instruction()
# Should return empty string
self.assertEqual(instruction, "")
def test_json_dumps_utf8_encoding(self):
"""Test that json.dumps uses ensure_ascii=False for UTF-8."""
# Test data with French characters and emojis
test_data = {
"status": "succès",
"message": "Tâche terminée avec succès",
"details": {
"créé": "2024-01-01",
"développeur": "Jean Dupont",
"préférences": ["français", "développement"],
"emojis": "🔴 🟠 🟡 🟢 ✅ ❌",
},
}
# Test with ensure_ascii=False (correct)
json_correct = json.dumps(test_data, ensure_ascii=False, indent=2)
# Check that UTF-8 characters are preserved
self.assertIn("succès", json_correct)
self.assertIn("terminée", json_correct)
self.assertIn("créé", json_correct)
self.assertIn("développeur", json_correct)
self.assertIn("préférences", json_correct)
self.assertIn("français", json_correct)
self.assertIn("développement", json_correct)
self.assertIn("🔴", json_correct)
self.assertIn("🟢", json_correct)
self.assertIn("✅", json_correct)
# Check that characters are NOT escaped
self.assertNotIn("\\u", json_correct)
self.assertNotIn("\\ud83d", json_correct)
def test_json_dumps_ascii_encoding_comparison(self):
"""Test comparison between ensure_ascii=True and False."""
test_data = {"message": "Développement réussi! 🎉"}
# With ensure_ascii=True (old, incorrect behavior)
json_escaped = json.dumps(test_data, ensure_ascii=True)
# With ensure_ascii=False (new, correct behavior)
json_utf8 = json.dumps(test_data, ensure_ascii=False) # Checks
self.assertIn("\\u", json_escaped) # Characters are escaped
self.assertNotIn("é", json_escaped) # UTF-8 characters are escaped
self.assertNotIn("\\u", json_utf8) # No escaped characters
self.assertIn("é", json_utf8) # UTF-8 characters preserved
self.assertIn("🎉", json_utf8) # Emojis preserved
def test_french_characters_in_file_content(self):
"""Test reading and writing files with French characters."""
# Test content with French characters
test_content = """
# System configuration
# Created by: Lead Developer
# Creation date: December 15, 2024
def process_data(preferences, parameters):
""\"
Processes data according to user preferences.
Args:
preferences: User preferences dictionary
parameters: Configuration parameters
Returns:
Processing result
""\"
return "Processing completed successfully! ✅"
# Helper functions
def generate_report():
""\"Generates a summary report.""\"
return {
"status": "success",
"data": "Report generated",
"emojis": "📊 📈 📉"
}
"""
# Test writing and reading
with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8", delete=False) as f:
f.write(test_content)
temp_file = f.name
try:
# Read file
with open(temp_file, encoding="utf-8") as f:
read_content = f.read()
# Checks
self.assertEqual(read_content, test_content)
self.assertIn("Lead Developer", read_content)
self.assertIn("Creation", read_content)
self.assertIn("preferences", read_content)
self.assertIn("parameters", read_content)
self.assertIn("completed", read_content)
self.assertIn("successfully", read_content)
self.assertIn("✅", read_content)
self.assertIn("success", read_content)
self.assertIn("generated", read_content)
self.assertIn("📊", read_content)
finally:
# Cleanup
os.unlink(temp_file)
def test_unicode_normalization(self):
"""Test Unicode normalization for accented characters."""
# Test with different Unicode encodings
test_cases = [
"café", # e + acute accent combined
"café", # e with precomposed acute accent
"naïf", # i + diaeresis
"coeur", # oe ligature
"été", # e + acute accent
]
for text in test_cases:
# Test that json.dumps preserves characters
json_output = json.dumps({"text": text}, ensure_ascii=False)
self.assertIn(text, json_output)
# Parse and check
parsed = json.loads(json_output)
self.assertEqual(parsed["text"], text)
def test_emoji_preservation(self):
"""Test emoji preservation in JSON encoding."""
# Emojis used in Zen MCP tools
emojis = [
"🔴", # Critical
"🟠", # High
"🟡", # Medium
"🟢", # Low
"✅", # Success
"❌", # Error
"⚠️", # Warning
"📊", # Charts
"🎉", # Celebration
"🚀", # Rocket
"🇫🇷", # French flag
]
test_data = {"emojis": emojis, "message": " ".join(emojis)}
# Test with ensure_ascii=False
json_output = json.dumps(test_data, ensure_ascii=False)
# Checks
for emoji in emojis:
self.assertIn(emoji, json_output) # No escaped characters
self.assertNotIn("\\u", json_output)
# Test parsing
parsed = json.loads(json_output)
self.assertEqual(parsed["emojis"], emojis)
self.assertEqual(parsed["message"], " ".join(emojis))
class TestLocalizationIntegration(unittest.TestCase):
"""Integration tests for localization with real tools."""
def setUp(self):
"""Integration test setup."""
self.original_locale = os.getenv("LOCALE")
def tearDown(self):
"""Cleanup after integration tests."""
if self.original_locale is not None:
os.environ["LOCALE"] = self.original_locale
else:
os.environ.pop("LOCALE", None)
def test_codereview_tool_french_locale_simple(self):
"""Test that the codereview tool correctly handles French locale configuration."""
# Set to French
original_locale = os.environ.get("LOCALE")
os.environ["LOCALE"] = "fr-FR"
try:
# Test language instruction generation
from tools.codereview import CodeReviewTool
codereview_tool = CodeReviewTool()
# Test that the tool correctly gets language instruction for French
language_instruction = codereview_tool.get_language_instruction()
# Should contain French locale
self.assertIn("fr-FR", language_instruction)
# Should contain language instruction format
self.assertIn("respond in", language_instruction.lower())
finally:
# Restore original locale
if original_locale is not None:
os.environ["LOCALE"] = original_locale
else:
os.environ.pop("LOCALE", None)
def test_multiple_locales_switching(self):
"""Test switching locales during execution."""
tool = MockTestTool()
# French
os.environ["LOCALE"] = "fr-FR"
instruction_fr = tool.get_language_instruction()
self.assertIn("fr-FR", instruction_fr)
# English
os.environ["LOCALE"] = "en-US"
instruction_en = tool.get_language_instruction()
self.assertIn("en-US", instruction_en)
# Spanish
os.environ["LOCALE"] = "es-ES"
instruction_es = tool.get_language_instruction()
self.assertIn("es-ES", instruction_es)
# Chinese
os.environ["LOCALE"] = "zh-CN"
instruction_zh = tool.get_language_instruction()
self.assertIn("zh-CN", instruction_zh)
# Check that all instructions are different
instructions = [
instruction_fr,
instruction_en,
instruction_es,
instruction_zh,
]
for i, inst1 in enumerate(instructions):
for j, inst2 in enumerate(instructions):
if i != j:
self.assertNotEqual(inst1, inst2)
# Helper function to run async tests
def run_async_test(test_func):
"""Helper to run async test functions."""
return asyncio.run(test_func())
if __name__ == "__main__":
unittest.main(verbosity=2)
```
--------------------------------------------------------------------------------
/docs/tools/secaudit.md:
--------------------------------------------------------------------------------
```markdown
# Secaudit Tool - Comprehensive Security Audit
**Systematic OWASP-based security assessment with compliance evaluation through workflow-driven investigation**
The `secaudit` tool provides comprehensive security auditing capabilities with systematic OWASP Top 10 assessment, compliance framework evaluation,
and threat modeling. This workflow tool guides Claude through methodical security investigation steps with forced pauses between each step to ensure
thorough vulnerability assessment, security pattern analysis, and compliance verification before providing expert analysis.
**Important**: AI models may not identify all security vulnerabilities. Always perform additional manual security reviews,
penetration testing, and verification.
## How the Workflow Works
The secaudit tool implements a **structured 6-step security workflow** that ensures comprehensive security assessment:
**Investigation Phase (Claude-Led):**
1. **Step 1**: Security Scope Analysis - Claude identifies application type, tech stack, attack surface, and compliance requirements
2. **Step 2**: Authentication & Authorization Assessment - Analyzes auth mechanisms, session management, and access controls
3. **Step 3**: Input Validation & Data Security - Reviews input handling, data protection, and injection vulnerabilities
4. **Step 4**: OWASP Top 10 (2021) Review - Systematic assessment of all OWASP categories with specific findings
5. **Step 5**: Dependencies & Infrastructure - Security analysis of third-party components and deployment configurations
6. **Step 6**: Compliance & Risk Assessment - Evaluation against specified compliance frameworks and risk prioritization
**Expert Analysis Phase:**
After Claude completes the investigation (unless confidence is **certain**):
- Complete security assessment summary with all vulnerabilities and evidence
- OWASP Top 10 systematic findings with severity classifications
- Compliance framework gap analysis and remediation recommendations
- Risk-prioritized remediation roadmap based on threat level and business impact
**Special Note**: If you want Claude to perform the entire security audit without calling another model, you can include "don't use any other model" in your prompt, and Claude will complete the full workflow independently.
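To make the handoff between steps concrete, here is a minimal sketch (in Python literals, purely for illustration) of how the core workflow fields might look at the first and final investigation steps. The field names come from the Tool Parameters section below; the literal values are assumptions.
```python
# Illustrative only: how the core workflow fields might change between the
# first and the final investigation step. Field names match the Tool
# Parameters section below; the literal values are made up for this sketch.

step_1 = {
    "step": "Identify application type, tech stack, attack surface, and compliance requirements",
    "step_number": 1,
    "total_steps": 6,
    "next_step_required": True,   # more investigation steps follow
    "findings": "Initial security scope notes collected during this step",
}

step_6 = {
    "step": "Consolidate compliance gaps and prioritize remediation",
    "step_number": 6,
    "total_steps": 6,
    "next_step_required": False,  # investigation is complete
    "findings": "Summary of vulnerabilities, OWASP mapping, and compliance gaps",
    "confidence": "high",         # anything below "certain" hands off to expert analysis
}
```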
## Model Recommendation
This tool particularly benefits from Gemini Pro or O3 models due to their advanced reasoning capabilities and large context windows, which allow comprehensive security analysis across complex codebases. Security audits require understanding subtle attack vectors and cross-component interactions that benefit from deeper analytical capabilities.
## Example Prompts
```
Perform a secaudit with o3 on this e-commerce web application focusing on payment processing security and PCI DSS compliance
```
```
Use secaudit to conduct a comprehensive security audit of the authentication system, threat level high, focus on enterprise
security patterns and HIPAA compliance
```
## Pro Tip: Multi-Scope Security Assessment
**You can run parallel security audits for different application components:**
```
Start separate sub-tasks: in one, run a secaudit on the critical payment processing components focusing on PCI DSS with gemini pro;
in the other, audit user management focusing on OWASP authentication vulnerabilities with o4-mini. Then combine both results into a
unified security remediation plan using planner
```
## Key Features
- **OWASP Top 10 (2021) systematic assessment** with specific vulnerability identification
- **Multi-compliance framework support**: SOC2, PCI DSS, HIPAA, GDPR, FedRAMP
- **Threat-level aware analysis**: Critical, high, medium, low threat classifications
- **Technology-specific security patterns**: Web apps, APIs, mobile, cloud, enterprise systems
- **Risk-based prioritization**: Business impact and exploitability assessment
- **Audit focus customization**: Comprehensive, authentication, data protection, infrastructure
- **Image support**: Security analysis from architecture diagrams, network topology, or security findings
- **Multi-file security analysis**: Cross-component vulnerability identification
- **Compliance gap analysis**: Specific framework requirements with remediation guidance
- **Attack surface mapping**: Entry points, data flows, and privilege boundaries
- **Security control effectiveness**: Evaluation of existing security measures
## Tool Parameters
**Workflow Investigation Parameters (used during step-by-step process):**
- `step`: Current security investigation step description (required for each step)
- `step_number`: Current step number in audit sequence (required)
- `total_steps`: Estimated total investigation steps (typically 4-6, adjustable)
- `next_step_required`: Whether another investigation step is needed
- `findings`: Security discoveries and evidence collected in this step (required)
- `files_checked`: All files examined during security investigation
- `relevant_files`: Files directly relevant to security assessment (required in step 1)
- `relevant_context`: Methods/functions/classes central to security findings
- `issues_found`: Security issues identified with severity levels
- `confidence`: Confidence level in security assessment completeness (exploring/low/medium/high/certain)
- `images`: Architecture diagrams, security documentation, or visual references
**Initial Security Configuration (used in step 1; a sample step-1 request follows these lists):**
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
- `security_scope`: Application context, technology stack, and security boundary definition (required)
- `threat_level`: low|medium|high|critical (default: medium) - determines assessment depth and urgency
- `compliance_requirements`: List of compliance frameworks to assess against (e.g., ["PCI DSS", "SOC2"])
- `audit_focus`: comprehensive|authentication|data_protection|infrastructure|api_security (default: comprehensive)
- `severity_filter`: critical|high|medium|low|all (default: all)
- `temperature`: Temperature for analytical consistency (0-1, default 0.2)
- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
- `use_assistant_model`: Whether to use expert security analysis phase (default: true)
- `continuation_id`: Continue previous security audit discussions
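Putting the two parameter groups together, a step-1 request might look roughly like the sketch below. The keys are taken from the lists above; the values, the example file path, and the surrounding tool-call mechanics are illustrative assumptions, not a prescribed format.
```python
# Hedged sketch of a complete step-1 secaudit request. Keys come from the
# parameter lists above; the values and the example path are assumptions.
secaudit_step_1 = {
    # Workflow investigation parameters
    "step": "Define the security scope and map the attack surface",
    "step_number": 1,
    "total_steps": 6,
    "next_step_required": True,
    "findings": "Checkout flow and admin API identified as the primary attack surface",
    "relevant_files": ["/abs/path/to/src/payments/checkout.js"],  # hypothetical path
    "confidence": "exploring",
    # Initial security configuration (step 1 only)
    "model": "pro",
    "security_scope": "Public Node.js e-commerce application handling card payments",
    "threat_level": "high",
    "compliance_requirements": ["PCI DSS", "SOC2"],
    "audit_focus": "comprehensive",
    "severity_filter": "all",
    "temperature": 0.2,
    "use_assistant_model": True,
}
```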
## Audit Focus Areas
**Comprehensive (default):**
- Full OWASP Top 10 assessment with all security domains
- Authentication, authorization, data protection, infrastructure
- Best for complete security posture evaluation
**Authentication:**
- Focused on identity, access management, and session security
- Multi-factor authentication, password policies, privilege escalation
- Best for user management and access control systems
**Data Protection:**
- Encryption, data handling, privacy controls, and compliance
- Input validation, output encoding, data classification
- Best for applications handling sensitive or regulated data
**Infrastructure:**
- Deployment security, configuration management, dependency security
- Network security, container security, cloud security posture
- Best for DevOps and infrastructure security assessment
**API Security:**
- REST/GraphQL security, rate limiting, API authentication
- Input validation, authorization patterns, API gateway security
- Best for API-first applications and microservices
## Threat Levels
Security assessment depth and urgency:
- **🔴 CRITICAL**: Mission-critical systems, high-value targets, regulatory requirements
- **🟠 HIGH**: Business-critical applications, customer data handling, financial systems
- **🟡 MEDIUM**: Standard business applications, internal tools, moderate risk exposure
- **🟢 LOW**: Development environments, non-sensitive applications, proof-of-concepts
## Compliance Frameworks
Supported compliance assessments:
- **SOC2**: Security, availability, processing integrity, confidentiality, privacy
- **PCI DSS**: Payment card industry data security standards
- **HIPAA**: Healthcare information privacy and security
- **GDPR**: General data protection regulation compliance
- **FedRAMP**: Federal risk and authorization management program
- **ISO27001**: Information security management systems
- **NIST**: Cybersecurity framework controls
## OWASP Top 10 (2021) Coverage
Systematic assessment includes the following categories; a sketch of how findings might be recorded per category follows the list:
1. **A01 Broken Access Control**: Authorization flaws and privilege escalation
2. **A02 Cryptographic Failures**: Encryption and data protection issues
3. **A03 Injection**: SQL, NoSQL, OS, and LDAP injection vulnerabilities
4. **A04 Insecure Design**: Security design flaws and threat modeling gaps
5. **A05 Security Misconfiguration**: Configuration and hardening issues
6. **A06 Vulnerable Components**: Third-party and dependency vulnerabilities
7. **A07 Identification & Authentication Failures**: Authentication bypass and session management
8. **A08 Software & Data Integrity Failures**: Supply chain and integrity violations
9. **A09 Security Logging & Monitoring Failures**: Detection and response capabilities
10. **A10 Server-Side Request Forgery**: SSRF and related vulnerabilities
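As an illustration of how findings from this checklist could be recorded in the `issues_found` parameter, here is a hedged sketch. The per-entry keys are assumptions (the tool does not prescribe an exact shape here); the severity values mirror the documented critical/high/medium/low levels.
```python
# Hedged sketch: possible issues_found entries mapping findings to OWASP
# categories. The per-entry keys are assumptions, not a prescribed schema.
issues_found = [
    {
        "category": "A01 Broken Access Control",
        "severity": "critical",
        "description": "Admin endpoints reachable without a role check",
    },
    {
        "category": "A02 Cryptographic Failures",
        "severity": "high",
        "description": "Password hashes stored with unsalted MD5",
    },
    {
        "category": "A05 Security Misconfiguration",
        "severity": "medium",
        "description": "Debug mode enabled in the production configuration",
    },
]
```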
## Usage Examples
**Comprehensive E-commerce Security Audit:**
```
"Conduct a comprehensive secaudit with gemini pro for our Node.js e-commerce platform, threat level high,
compliance requirements PCI DSS and SOC2, focus on payment processing security"
```
**Authentication System Security Review:**
```
"Use o3 to perform secaudit on authentication microservice, focus on authentication,
threat level critical, check for OWASP A07 and multi-factor authentication implementation"
```
**API Security Assessment:**
```
"Secaudit our REST API gateway with gemini pro, audit focus api_security,
compliance requirements GDPR, threat level medium"
```
**Infrastructure Security Review:**
```
"Perform secaudit on Kubernetes deployment manifests with o3, focus infrastructure,
threat level high, include container security and network policies"
```
**Quick Security Scan:**
```
"Fast secaudit of user registration flow with flash, focus authentication,
severity filter critical and high only"
```
## Best Practices
- **Define clear security scope**: Specify application type, tech stack, and security boundaries
- **Set appropriate threat levels**: Match assessment depth to risk exposure and criticality
- **Include compliance requirements**: Specify relevant frameworks for regulatory alignment
- **Use parallel audits**: Run separate assessments for different components or compliance frameworks
- **Provide architectural context**: Include system diagrams, data flow documentation, or deployment topology
- **Focus audit scope**: Use audit_focus for targeted assessments of specific security domains
- **Follow up on findings**: Use continuation feature to dive deeper into specific vulnerabilities
## Output Format
Security audits include:
- **Executive Security Summary**: Overall security posture and critical findings
- **OWASP Top 10 Assessment**: Systematic review of each category with specific findings
- **Compliance Gap Analysis**: Framework-specific requirements and current compliance status
- **Risk-Prioritized Findings**: Vulnerabilities ordered by exploitability and business impact
- **Remediation Roadmap**: Phased approach to security improvements with quick wins
- **Security Architecture Recommendations**: Structural improvements for long-term security posture
## When to Use Secaudit vs Other Tools
- **Use `secaudit`** for: Comprehensive security assessment, compliance evaluation, OWASP-based vulnerability analysis
- **Use `codereview`** for: General code quality with some security considerations
- **Use `analyze`** for: Understanding security architecture without vulnerability assessment
- **Use `debug`** for: Investigating specific security incidents or exploit attempts
- **Use `precommit`** for: Pre-deployment security validation and change impact assessment
```
--------------------------------------------------------------------------------
/tests/test_auto_mode.py:
--------------------------------------------------------------------------------
```python
"""Tests for auto mode functionality"""
import importlib
import os
from unittest.mock import patch
import pytest
from tools.chat import ChatTool
from tools.shared.exceptions import ToolExecutionError
class TestAutoMode:
"""Test auto mode configuration and behavior"""
def test_auto_mode_detection(self):
"""Test that auto mode is detected correctly"""
# Save original
original = os.environ.get("DEFAULT_MODEL", "")
try:
# Test auto mode
os.environ["DEFAULT_MODEL"] = "auto"
import config
importlib.reload(config)
assert config.DEFAULT_MODEL == "auto"
assert config.IS_AUTO_MODE is True
# Test non-auto mode
os.environ["DEFAULT_MODEL"] = "pro"
importlib.reload(config)
assert config.DEFAULT_MODEL == "pro"
assert config.IS_AUTO_MODE is False
finally:
# Restore
if original:
os.environ["DEFAULT_MODEL"] = original
else:
os.environ.pop("DEFAULT_MODEL", None)
importlib.reload(config)
def test_model_capabilities_descriptions(self):
"""Test that model capabilities are properly defined in providers"""
from providers.registry import ModelProviderRegistry
# Get all providers with valid API keys and check their model descriptions
enabled_provider_types = ModelProviderRegistry.get_available_providers_with_keys()
models_with_descriptions = {}
for provider_type in enabled_provider_types:
provider = ModelProviderRegistry.get_provider(provider_type)
if provider:
for model_name, config in provider.MODEL_CAPABILITIES.items():
# Skip alias entries (string values)
if isinstance(config, str):
continue
# Check that model has description
description = config.description if hasattr(config, "description") else ""
if description:
models_with_descriptions[model_name] = description
# Check all expected models are present with meaningful descriptions
expected_models = ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini"]
for model in expected_models:
# Model should exist somewhere in the providers
# Note: Some models might not be available if API keys aren't configured
if model in models_with_descriptions:
assert isinstance(models_with_descriptions[model], str)
assert len(models_with_descriptions[model]) > 50 # Meaningful description
def test_tool_schema_in_auto_mode(self):
"""Test that tool schemas require model in auto mode"""
# Save original
original = os.environ.get("DEFAULT_MODEL", "")
try:
# Enable auto mode
os.environ["DEFAULT_MODEL"] = "auto"
import config
importlib.reload(config)
tool = ChatTool()
schema = tool.get_input_schema()
# Model should be required
assert "model" in schema["required"]
# Model field should have detailed descriptions
model_schema = schema["properties"]["model"]
assert "enum" not in model_schema
assert "auto mode" in model_schema["description"].lower()
assert "listmodels" in model_schema["description"]
finally:
# Restore
if original:
os.environ["DEFAULT_MODEL"] = original
else:
os.environ.pop("DEFAULT_MODEL", None)
importlib.reload(config)
def test_tool_schema_in_normal_mode(self):
"""Test that tool schemas don't require model in normal mode"""
# Save original
original = os.environ.get("DEFAULT_MODEL", "")
try:
# Set to a specific model (not auto mode)
os.environ["DEFAULT_MODEL"] = "gemini-2.5-flash"
import config
importlib.reload(config)
tool = ChatTool()
schema = tool.get_input_schema()
# Model should not be required when default model is configured
assert "model" not in schema["required"]
# Model field should have simpler description
model_schema = schema["properties"]["model"]
assert "enum" not in model_schema
assert "listmodels" in model_schema["description"]
assert "default model" in model_schema["description"].lower()
finally:
# Restore
if original:
os.environ["DEFAULT_MODEL"] = original
else:
os.environ.pop("DEFAULT_MODEL", None)
importlib.reload(config)
@pytest.mark.asyncio
async def test_auto_mode_requires_model_parameter(self, tmp_path):
"""Test that auto mode enforces model parameter"""
# Save original
original = os.environ.get("DEFAULT_MODEL", "")
try:
# Enable auto mode
os.environ["DEFAULT_MODEL"] = "auto"
import config
importlib.reload(config)
tool = ChatTool()
# Mock the provider to avoid real API calls
with patch.object(tool, "get_model_provider"):
# Execute without model parameter and expect protocol error
with pytest.raises(ToolExecutionError) as exc_info:
await tool.execute({"prompt": "Test prompt", "working_directory_absolute_path": str(tmp_path)})
# Should get error payload mentioning model requirement
error_payload = getattr(exc_info.value, "payload", str(exc_info.value))
assert "Model" in error_payload
assert "auto" in error_payload
finally:
# Restore
if original:
os.environ["DEFAULT_MODEL"] = original
else:
os.environ.pop("DEFAULT_MODEL", None)
importlib.reload(config)
@pytest.mark.asyncio
async def test_unavailable_model_error_message(self):
"""Test that unavailable model shows helpful error with available models using real integration testing"""
# Save original environment
original_env = {}
api_keys = ["GEMINI_API_KEY", "OPENAI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY"]
for key in api_keys:
original_env[key] = os.environ.get(key)
original_default = os.environ.get("DEFAULT_MODEL", "")
try:
# Set up environment with a real API key but test an unavailable model
# This simulates a user trying to use a model that's not available with their current setup
os.environ["OPENAI_API_KEY"] = "sk-test-key-unavailable-model-test-not-real"
os.environ["DEFAULT_MODEL"] = "auto"
# Clear other provider keys to isolate to OpenAI
for key in ["GEMINI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY"]:
os.environ.pop(key, None)
# Reload config and registry to pick up new environment
import config
importlib.reload(config)
# Clear registry singleton to force re-initialization with new environment
from providers.registry import ModelProviderRegistry
ModelProviderRegistry._instance = None
tool = ChatTool()
# Test with real provider resolution - this should attempt to use a model
# that doesn't exist in the OpenAI provider's model list
try:
result = await tool.execute(
{
"absolute_file_paths": ["/tmp/test.py"],
"prompt": "Analyze this",
"model": "nonexistent-model-xyz", # This model definitely doesn't exist
}
)
# If we get here, check that it's an error about model availability
assert len(result) == 1
response = result[0].text
assert "error" in response
# Should be about model not being available
assert any(
phrase in response
for phrase in [
"Model 'nonexistent-model-xyz' is not available",
"No provider found",
"not available",
"not supported",
]
)
except Exception as e:
# Expected: Should fail with provider resolution or model validation error
error_msg = str(e)
# Should NOT be a mock-related error
assert "MagicMock" not in error_msg
assert "'<' not supported between instances" not in error_msg
# Should be a real provider error about model not being available
assert any(
phrase in error_msg
for phrase in [
"Model 'nonexistent-model-xyz'",
"not available",
"not found",
"not supported",
"provider",
"model",
]
) or any(phrase in error_msg for phrase in ["API", "key", "authentication", "network", "connection"])
finally:
# Restore original environment
for key, value in original_env.items():
if value is not None:
os.environ[key] = value
else:
os.environ.pop(key, None)
if original_default:
os.environ["DEFAULT_MODEL"] = original_default
else:
os.environ.pop("DEFAULT_MODEL", None)
# Reload config and clear registry singleton
importlib.reload(config)
ModelProviderRegistry._instance = None
def test_model_field_schema_generation(self):
"""Test the get_model_field_schema method"""
from tools.shared.base_tool import BaseTool
# Create a minimal concrete tool for testing
class TestTool(BaseTool):
def get_name(self):
return "test"
def get_description(self):
return "test"
def get_input_schema(self):
return {}
def get_system_prompt(self):
return ""
def get_request_model(self):
return None
async def prepare_prompt(self, request):
return ""
tool = TestTool()
# Save original
original = os.environ.get("DEFAULT_MODEL", "")
try:
# Test auto mode
os.environ["DEFAULT_MODEL"] = "auto"
import config
importlib.reload(config)
schema = tool.get_model_field_schema()
assert "enum" not in schema
assert schema["type"] == "string"
assert "auto mode" in schema["description"].lower()
assert "listmodels" in schema["description"]
# Test normal mode
os.environ["DEFAULT_MODEL"] = "pro"
importlib.reload(config)
schema = tool.get_model_field_schema()
assert "enum" not in schema
assert schema["type"] == "string"
assert "'pro'" in schema["description"]
assert "listmodels" in schema["description"]
finally:
# Restore
if original:
os.environ["DEFAULT_MODEL"] = original
else:
os.environ.pop("DEFAULT_MODEL", None)
importlib.reload(config)
```