This is page 29 of 39. Use http://codebase.md/wshobson/maverick-mcp?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .dockerignore
├── .env.example
├── .github
│ ├── dependabot.yml
│ ├── FUNDING.yml
│ ├── ISSUE_TEMPLATE
│ │ ├── bug_report.md
│ │ ├── config.yml
│ │ ├── feature_request.md
│ │ ├── question.md
│ │ └── security_report.md
│ ├── pull_request_template.md
│ └── workflows
│ ├── claude-code-review.yml
│ └── claude.yml
├── .gitignore
├── .python-version
├── .vscode
│ ├── launch.json
│ └── settings.json
├── alembic
│ ├── env.py
│ ├── script.py.mako
│ └── versions
│ ├── 001_initial_schema.py
│ ├── 003_add_performance_indexes.py
│ ├── 006_rename_metadata_columns.py
│ ├── 008_performance_optimization_indexes.py
│ ├── 009_rename_to_supply_demand.py
│ ├── 010_self_contained_schema.py
│ ├── 011_remove_proprietary_terms.py
│ ├── 013_add_backtest_persistence_models.py
│ ├── 014_add_portfolio_models.py
│ ├── 08e3945a0c93_merge_heads.py
│ ├── 9374a5c9b679_merge_heads_for_testing.py
│ ├── abf9b9afb134_merge_multiple_heads.py
│ ├── adda6d3fd84b_merge_proprietary_terms_removal_with_.py
│ ├── e0c75b0bdadb_fix_financial_data_precision_only.py
│ ├── f0696e2cac15_add_essential_performance_indexes.py
│ └── fix_database_integrity_issues.py
├── alembic.ini
├── CLAUDE.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── DATABASE_SETUP.md
├── docker-compose.override.yml.example
├── docker-compose.yml
├── Dockerfile
├── docs
│ ├── api
│ │ └── backtesting.md
│ ├── BACKTESTING.md
│ ├── COST_BASIS_SPECIFICATION.md
│ ├── deep_research_agent.md
│ ├── exa_research_testing_strategy.md
│ ├── PORTFOLIO_PERSONALIZATION_PLAN.md
│ ├── PORTFOLIO.md
│ ├── SETUP_SELF_CONTAINED.md
│ └── speed_testing_framework.md
├── examples
│ ├── complete_speed_validation.py
│ ├── deep_research_integration.py
│ ├── llm_optimization_example.py
│ ├── llm_speed_demo.py
│ ├── monitoring_example.py
│ ├── parallel_research_example.py
│ ├── speed_optimization_demo.py
│ └── timeout_fix_demonstration.py
├── LICENSE
├── Makefile
├── MANIFEST.in
├── maverick_mcp
│ ├── __init__.py
│ ├── agents
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── circuit_breaker.py
│ │ ├── deep_research.py
│ │ ├── market_analysis.py
│ │ ├── optimized_research.py
│ │ ├── supervisor.py
│ │ └── technical_analysis.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── api_server.py
│ │ ├── connection_manager.py
│ │ ├── dependencies
│ │ │ ├── __init__.py
│ │ │ ├── stock_analysis.py
│ │ │ └── technical_analysis.py
│ │ ├── error_handling.py
│ │ ├── inspector_compatible_sse.py
│ │ ├── inspector_sse.py
│ │ ├── middleware
│ │ │ ├── error_handling.py
│ │ │ ├── mcp_logging.py
│ │ │ ├── rate_limiting_enhanced.py
│ │ │ └── security.py
│ │ ├── openapi_config.py
│ │ ├── routers
│ │ │ ├── __init__.py
│ │ │ ├── agents.py
│ │ │ ├── backtesting.py
│ │ │ ├── data_enhanced.py
│ │ │ ├── data.py
│ │ │ ├── health_enhanced.py
│ │ │ ├── health_tools.py
│ │ │ ├── health.py
│ │ │ ├── intelligent_backtesting.py
│ │ │ ├── introspection.py
│ │ │ ├── mcp_prompts.py
│ │ │ ├── monitoring.py
│ │ │ ├── news_sentiment_enhanced.py
│ │ │ ├── performance.py
│ │ │ ├── portfolio.py
│ │ │ ├── research.py
│ │ │ ├── screening_ddd.py
│ │ │ ├── screening_parallel.py
│ │ │ ├── screening.py
│ │ │ ├── technical_ddd.py
│ │ │ ├── technical_enhanced.py
│ │ │ ├── technical.py
│ │ │ └── tool_registry.py
│ │ ├── server.py
│ │ ├── services
│ │ │ ├── __init__.py
│ │ │ ├── base_service.py
│ │ │ ├── market_service.py
│ │ │ ├── portfolio_service.py
│ │ │ ├── prompt_service.py
│ │ │ └── resource_service.py
│ │ ├── simple_sse.py
│ │ └── utils
│ │ ├── __init__.py
│ │ ├── insomnia_export.py
│ │ └── postman_export.py
│ ├── application
│ │ ├── __init__.py
│ │ ├── commands
│ │ │ └── __init__.py
│ │ ├── dto
│ │ │ ├── __init__.py
│ │ │ └── technical_analysis_dto.py
│ │ ├── queries
│ │ │ ├── __init__.py
│ │ │ └── get_technical_analysis.py
│ │ └── screening
│ │ ├── __init__.py
│ │ ├── dtos.py
│ │ └── queries.py
│ ├── backtesting
│ │ ├── __init__.py
│ │ ├── ab_testing.py
│ │ ├── analysis.py
│ │ ├── batch_processing_stub.py
│ │ ├── batch_processing.py
│ │ ├── model_manager.py
│ │ ├── optimization.py
│ │ ├── persistence.py
│ │ ├── retraining_pipeline.py
│ │ ├── strategies
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── ml
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adaptive.py
│ │ │ │ ├── ensemble.py
│ │ │ │ ├── feature_engineering.py
│ │ │ │ └── regime_aware.py
│ │ │ ├── ml_strategies.py
│ │ │ ├── parser.py
│ │ │ └── templates.py
│ │ ├── strategy_executor.py
│ │ ├── vectorbt_engine.py
│ │ └── visualization.py
│ ├── config
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── database_self_contained.py
│ │ ├── database.py
│ │ ├── llm_optimization_config.py
│ │ ├── logging_settings.py
│ │ ├── plotly_config.py
│ │ ├── security_utils.py
│ │ ├── security.py
│ │ ├── settings.py
│ │ ├── technical_constants.py
│ │ ├── tool_estimation.py
│ │ └── validation.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── technical_analysis.py
│ │ └── visualization.py
│ ├── data
│ │ ├── __init__.py
│ │ ├── cache_manager.py
│ │ ├── cache.py
│ │ ├── django_adapter.py
│ │ ├── health.py
│ │ ├── models.py
│ │ ├── performance.py
│ │ ├── session_management.py
│ │ └── validation.py
│ ├── database
│ │ ├── __init__.py
│ │ ├── base.py
│ │ └── optimization.py
│ ├── dependencies.py
│ ├── domain
│ │ ├── __init__.py
│ │ ├── entities
│ │ │ ├── __init__.py
│ │ │ └── stock_analysis.py
│ │ ├── events
│ │ │ └── __init__.py
│ │ ├── portfolio.py
│ │ ├── screening
│ │ │ ├── __init__.py
│ │ │ ├── entities.py
│ │ │ ├── services.py
│ │ │ └── value_objects.py
│ │ ├── services
│ │ │ ├── __init__.py
│ │ │ └── technical_analysis_service.py
│ │ ├── stock_analysis
│ │ │ ├── __init__.py
│ │ │ └── stock_analysis_service.py
│ │ └── value_objects
│ │ ├── __init__.py
│ │ └── technical_indicators.py
│ ├── exceptions.py
│ ├── infrastructure
│ │ ├── __init__.py
│ │ ├── cache
│ │ │ └── __init__.py
│ │ ├── caching
│ │ │ ├── __init__.py
│ │ │ └── cache_management_service.py
│ │ ├── connection_manager.py
│ │ ├── data_fetching
│ │ │ ├── __init__.py
│ │ │ └── stock_data_service.py
│ │ ├── health
│ │ │ ├── __init__.py
│ │ │ └── health_checker.py
│ │ ├── persistence
│ │ │ ├── __init__.py
│ │ │ └── stock_repository.py
│ │ ├── providers
│ │ │ └── __init__.py
│ │ ├── screening
│ │ │ ├── __init__.py
│ │ │ └── repositories.py
│ │ └── sse_optimizer.py
│ ├── langchain_tools
│ │ ├── __init__.py
│ │ ├── adapters.py
│ │ └── registry.py
│ ├── logging_config.py
│ ├── memory
│ │ ├── __init__.py
│ │ └── stores.py
│ ├── monitoring
│ │ ├── __init__.py
│ │ ├── health_check.py
│ │ ├── health_monitor.py
│ │ ├── integration_example.py
│ │ ├── metrics.py
│ │ ├── middleware.py
│ │ └── status_dashboard.py
│ ├── providers
│ │ ├── __init__.py
│ │ ├── dependencies.py
│ │ ├── factories
│ │ │ ├── __init__.py
│ │ │ ├── config_factory.py
│ │ │ └── provider_factory.py
│ │ ├── implementations
│ │ │ ├── __init__.py
│ │ │ ├── cache_adapter.py
│ │ │ ├── macro_data_adapter.py
│ │ │ ├── market_data_adapter.py
│ │ │ ├── persistence_adapter.py
│ │ │ └── stock_data_adapter.py
│ │ ├── interfaces
│ │ │ ├── __init__.py
│ │ │ ├── cache.py
│ │ │ ├── config.py
│ │ │ ├── macro_data.py
│ │ │ ├── market_data.py
│ │ │ ├── persistence.py
│ │ │ └── stock_data.py
│ │ ├── llm_factory.py
│ │ ├── macro_data.py
│ │ ├── market_data.py
│ │ ├── mocks
│ │ │ ├── __init__.py
│ │ │ ├── mock_cache.py
│ │ │ ├── mock_config.py
│ │ │ ├── mock_macro_data.py
│ │ │ ├── mock_market_data.py
│ │ │ ├── mock_persistence.py
│ │ │ └── mock_stock_data.py
│ │ ├── openrouter_provider.py
│ │ ├── optimized_screening.py
│ │ ├── optimized_stock_data.py
│ │ └── stock_data.py
│ ├── README.md
│ ├── tests
│ │ ├── __init__.py
│ │ ├── README_INMEMORY_TESTS.md
│ │ ├── test_cache_debug.py
│ │ ├── test_fixes_validation.py
│ │ ├── test_in_memory_routers.py
│ │ ├── test_in_memory_server.py
│ │ ├── test_macro_data_provider.py
│ │ ├── test_mailgun_email.py
│ │ ├── test_market_calendar_caching.py
│ │ ├── test_mcp_tool_fixes_pytest.py
│ │ ├── test_mcp_tool_fixes.py
│ │ ├── test_mcp_tools.py
│ │ ├── test_models_functional.py
│ │ ├── test_server.py
│ │ ├── test_stock_data_enhanced.py
│ │ ├── test_stock_data_provider.py
│ │ └── test_technical_analysis.py
│ ├── tools
│ │ ├── __init__.py
│ │ ├── performance_monitoring.py
│ │ ├── portfolio_manager.py
│ │ ├── risk_management.py
│ │ └── sentiment_analysis.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── agent_errors.py
│ │ ├── batch_processing.py
│ │ ├── cache_warmer.py
│ │ ├── circuit_breaker_decorators.py
│ │ ├── circuit_breaker_services.py
│ │ ├── circuit_breaker.py
│ │ ├── data_chunking.py
│ │ ├── database_monitoring.py
│ │ ├── debug_utils.py
│ │ ├── fallback_strategies.py
│ │ ├── llm_optimization.py
│ │ ├── logging_example.py
│ │ ├── logging_init.py
│ │ ├── logging.py
│ │ ├── mcp_logging.py
│ │ ├── memory_profiler.py
│ │ ├── monitoring_middleware.py
│ │ ├── monitoring.py
│ │ ├── orchestration_logging.py
│ │ ├── parallel_research.py
│ │ ├── parallel_screening.py
│ │ ├── quick_cache.py
│ │ ├── resource_manager.py
│ │ ├── shutdown.py
│ │ ├── stock_helpers.py
│ │ ├── structured_logger.py
│ │ ├── tool_monitoring.py
│ │ ├── tracing.py
│ │ └── yfinance_pool.py
│ ├── validation
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── data.py
│ │ ├── middleware.py
│ │ ├── portfolio.py
│ │ ├── responses.py
│ │ ├── screening.py
│ │ └── technical.py
│ └── workflows
│ ├── __init__.py
│ ├── agents
│ │ ├── __init__.py
│ │ ├── market_analyzer.py
│ │ ├── optimizer_agent.py
│ │ ├── strategy_selector.py
│ │ └── validator_agent.py
│ ├── backtesting_workflow.py
│ └── state.py
├── PLANS.md
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── dev.sh
│ ├── INSTALLATION_GUIDE.md
│ ├── load_example.py
│ ├── load_market_data.py
│ ├── load_tiingo_data.py
│ ├── migrate_db.py
│ ├── README_TIINGO_LOADER.md
│ ├── requirements_tiingo.txt
│ ├── run_stock_screening.py
│ ├── run-migrations.sh
│ ├── seed_db.py
│ ├── seed_sp500.py
│ ├── setup_database.sh
│ ├── setup_self_contained.py
│ ├── setup_sp500_database.sh
│ ├── test_seeded_data.py
│ ├── test_tiingo_loader.py
│ ├── tiingo_config.py
│ └── validate_setup.py
├── SECURITY.md
├── server.json
├── setup.py
├── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── core
│ │ └── test_technical_analysis.py
│ ├── data
│ │ └── test_portfolio_models.py
│ ├── domain
│ │ ├── conftest.py
│ │ ├── test_portfolio_entities.py
│ │ └── test_technical_analysis_service.py
│ ├── fixtures
│ │ └── orchestration_fixtures.py
│ ├── integration
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── README.md
│ │ ├── run_integration_tests.sh
│ │ ├── test_api_technical.py
│ │ ├── test_chaos_engineering.py
│ │ ├── test_config_management.py
│ │ ├── test_full_backtest_workflow_advanced.py
│ │ ├── test_full_backtest_workflow.py
│ │ ├── test_high_volume.py
│ │ ├── test_mcp_tools.py
│ │ ├── test_orchestration_complete.py
│ │ ├── test_portfolio_persistence.py
│ │ ├── test_redis_cache.py
│ │ ├── test_security_integration.py.disabled
│ │ └── vcr_setup.py
│ ├── performance
│ │ ├── __init__.py
│ │ ├── test_benchmarks.py
│ │ ├── test_load.py
│ │ ├── test_profiling.py
│ │ └── test_stress.py
│ ├── providers
│ │ └── test_stock_data_simple.py
│ ├── README.md
│ ├── test_agents_router_mcp.py
│ ├── test_backtest_persistence.py
│ ├── test_cache_management_service.py
│ ├── test_cache_serialization.py
│ ├── test_circuit_breaker.py
│ ├── test_database_pool_config_simple.py
│ ├── test_database_pool_config.py
│ ├── test_deep_research_functional.py
│ ├── test_deep_research_integration.py
│ ├── test_deep_research_parallel_execution.py
│ ├── test_error_handling.py
│ ├── test_event_loop_integrity.py
│ ├── test_exa_research_integration.py
│ ├── test_exception_hierarchy.py
│ ├── test_financial_search.py
│ ├── test_graceful_shutdown.py
│ ├── test_integration_simple.py
│ ├── test_langgraph_workflow.py
│ ├── test_market_data_async.py
│ ├── test_market_data_simple.py
│ ├── test_mcp_orchestration_functional.py
│ ├── test_ml_strategies.py
│ ├── test_optimized_research_agent.py
│ ├── test_orchestration_integration.py
│ ├── test_orchestration_logging.py
│ ├── test_orchestration_tools_simple.py
│ ├── test_parallel_research_integration.py
│ ├── test_parallel_research_orchestrator.py
│ ├── test_parallel_research_performance.py
│ ├── test_performance_optimizations.py
│ ├── test_production_validation.py
│ ├── test_provider_architecture.py
│ ├── test_rate_limiting_enhanced.py
│ ├── test_runner_validation.py
│ ├── test_security_comprehensive.py.disabled
│ ├── test_security_cors.py
│ ├── test_security_enhancements.py.disabled
│ ├── test_security_headers.py
│ ├── test_security_penetration.py
│ ├── test_session_management.py
│ ├── test_speed_optimization_validation.py
│ ├── test_stock_analysis_dependencies.py
│ ├── test_stock_analysis_service.py
│ ├── test_stock_data_fetching_service.py
│ ├── test_supervisor_agent.py
│ ├── test_supervisor_functional.py
│ ├── test_tool_estimation_config.py
│ ├── test_visualization.py
│ └── utils
│ ├── test_agent_errors.py
│ ├── test_logging.py
│ ├── test_parallel_screening.py
│ └── test_quick_cache.py
├── tools
│ ├── check_orchestration_config.py
│ ├── experiments
│ │ ├── validation_examples.py
│ │ └── validation_fixed.py
│ ├── fast_dev.sh
│ ├── hot_reload.py
│ ├── quick_test.py
│ └── templates
│ ├── new_router_template.py
│ ├── new_tool_template.py
│ ├── screening_strategy_template.py
│ └── test_template.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/tests/test_parallel_research_performance.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Comprehensive Performance Test Suite for Parallel Research System
3 |
4 | This test suite validates parallel research performance against specific thresholds:
5 | - Minimum speedup: 2 agents = 1.3x speedup, 4 agents = 2.0x speedup
6 | - Maximum memory increase: 3x memory usage acceptable for 4x agents
7 | - Test duration: Quick tests for CI (<30s total runtime)
8 |
9 | Features:
10 | - Realistic failure simulation (0%, 10%, 25% failure rates)
11 | - Memory usage monitoring and validation
12 | - Statistical significance testing (3+ runs per test)
13 | - Integration with existing pytest infrastructure
14 | - Performance markers for easy filtering
15 | """
16 |
17 | import asyncio
18 | import gc
19 | import logging
20 | import random
21 | import statistics
22 | import time
23 | import tracemalloc
24 | from datetime import datetime
25 | from typing import Any
26 | from unittest.mock import AsyncMock
27 |
28 | import psutil
29 | import pytest
30 |
31 | from maverick_mcp.agents.deep_research import DeepResearchAgent
32 | from maverick_mcp.utils.parallel_research import (
33 | ParallelResearchConfig,
34 | ParallelResearchOrchestrator,
35 | ResearchResult,
36 | ResearchTask,
37 | )
38 |
39 | logger = logging.getLogger(__name__)
40 |
41 | # Performance thresholds and test configuration
42 | PERFORMANCE_THRESHOLDS = {
43 | "speedup": {
44 | 2: 1.3, # 2 agents should achieve 1.3x speedup minimum
45 | 4: 2.0, # 4 agents should achieve 2.0x speedup minimum
46 | },
47 | "memory_multiplier": 3.0, # 3x memory usage acceptable for 4x agents
48 | "max_test_duration": 30.0, # Total test suite should complete within 30s
49 | }
50 |
51 | # Failure simulation configuration
52 | FAILURE_RATES = [0.0, 0.10, 0.25] # 0%, 10%, 25% failure rates
53 | STATISTICAL_RUNS = 3 # Number of runs for statistical significance
54 |
55 |
56 | class PerformanceMonitor:
57 | """Monitors CPU, memory, and timing performance during test execution."""
58 |
59 | def __init__(self, test_name: str):
60 | self.test_name = test_name
61 | self.start_time: float = 0
62 | self.start_memory: int = 0
63 | self.peak_memory: int = 0
64 | self.cpu_percent_samples: list[float] = []
65 | self.process = psutil.Process()
66 |
67 | def __enter__(self):
68 | """Start performance monitoring."""
69 | # Force garbage collection for accurate memory measurement
70 | gc.collect()
71 |
72 | # Start memory tracing
73 | tracemalloc.start()
74 |
75 | # Record baseline metrics
76 | self.start_time = time.time()
77 | self.start_memory = self.process.memory_info().rss
78 | self.peak_memory = self.start_memory
79 |
80 | logger.info(f"Performance monitoring started for {self.test_name}")
81 | return self
82 |
83 | def sample_cpu(self):
84 | """Sample current CPU usage."""
85 | try:
86 | cpu_percent = self.process.cpu_percent()
87 | self.cpu_percent_samples.append(cpu_percent)
88 |
89 | # Track peak memory
90 | current_memory = self.process.memory_info().rss
91 | self.peak_memory = max(self.peak_memory, current_memory)
92 | except Exception as e:
93 | logger.warning(f"Failed to sample CPU/memory: {e}")
94 |
95 | def __exit__(self, exc_type, exc_val, exc_tb):
96 |         """Complete performance monitoring and record metrics on self.metrics."""
97 | end_time = time.time()
98 | end_memory = self.process.memory_info().rss
99 |
100 | # Get memory tracing results
101 | current, peak_traced = tracemalloc.get_traced_memory()
102 | tracemalloc.stop()
103 |
104 | # Calculate metrics
105 | execution_time = end_time - self.start_time
106 | memory_increase = end_memory - self.start_memory
107 | memory_multiplier = end_memory / max(self.start_memory, 1)
108 | avg_cpu = (
109 | statistics.mean(self.cpu_percent_samples) if self.cpu_percent_samples else 0
110 | )
111 |
112 | self.metrics = {
113 | "execution_time": execution_time,
114 | "start_memory_mb": self.start_memory / (1024 * 1024),
115 | "end_memory_mb": end_memory / (1024 * 1024),
116 | "peak_memory_mb": max(self.peak_memory, peak_traced) / (1024 * 1024),
117 | "memory_increase_mb": memory_increase / (1024 * 1024),
118 | "memory_multiplier": memory_multiplier,
119 | "avg_cpu_percent": avg_cpu,
120 | "cpu_samples": len(self.cpu_percent_samples),
121 | }
122 |
123 | logger.info(
124 | f"Performance metrics for {self.test_name}: "
125 | f"Time: {execution_time:.3f}s, "
126 | f"Memory: {self.metrics['start_memory_mb']:.1f}MB -> "
127 | f"{self.metrics['end_memory_mb']:.1f}MB "
128 | f"({memory_multiplier:.2f}x), "
129 | f"CPU: {avg_cpu:.1f}%"
130 | )
131 |
132 |
133 | class MockResearchExecutor:
134 | """
135 | Realistic mock executor that simulates LLM research operations with:
136 | - Configurable failure rates and timeout scenarios
137 | - Variable response times (100-500ms)
138 | - Different response sizes to test memory patterns
139 | - Structured research data that mirrors real usage
140 | """
141 |
142 | def __init__(
143 | self,
144 | failure_rate: float = 0.0,
145 | base_delay: float = 0.1,
146 | delay_variance: float = 0.4,
147 | include_timeouts: bool = False,
148 | ):
149 | self.failure_rate = failure_rate
150 | self.base_delay = base_delay
151 | self.delay_variance = delay_variance
152 | self.include_timeouts = include_timeouts
153 | self.execution_count = 0
154 | self.execution_log: list[dict[str, Any]] = []
155 |
156 | async def __call__(self, task: ResearchTask) -> dict[str, Any]:
157 | """Execute mock research task with realistic behavior."""
158 | self.execution_count += 1
159 | start_time = time.time()
160 |
161 | # Simulate realistic processing delay (100-500ms)
162 | delay = self.base_delay + random.uniform(0, self.delay_variance)
163 | await asyncio.sleep(delay)
164 |
165 | # Simulate various failure modes
166 | if random.random() < self.failure_rate:
167 | failure_type = random.choice(
168 | ["api_timeout", "rate_limit", "auth_error", "network_error"]
169 | )
170 |
171 | if failure_type == "api_timeout" and self.include_timeouts:
172 | # Simulate timeout by sleeping longer than task timeout
173 | await asyncio.sleep(task.timeout + 1 if task.timeout else 10)
174 |
175 | raise Exception(f"Simulated {failure_type} for task {task.task_id}")
176 |
177 | # Generate realistic research response based on task type
178 | response = self._generate_research_response(task)
179 |
180 | execution_time = time.time() - start_time
181 | self.execution_log.append(
182 | {
183 | "task_id": task.task_id,
184 | "task_type": task.task_type,
185 | "execution_time": execution_time,
186 | "delay_simulated": delay,
187 | "response_size": len(str(response)),
188 | }
189 | )
190 |
191 | return response
192 |
193 | def _generate_research_response(self, task: ResearchTask) -> dict[str, Any]:
194 | """Generate structured research data with realistic content."""
195 | # Vary response size based on research type
196 | insight_counts = {
197 | "fundamental": random.randint(8, 15),
198 | "technical": random.randint(6, 12),
199 | "sentiment": random.randint(10, 20),
200 | "competitive": random.randint(7, 14),
201 | }
202 |
203 | insight_count = insight_counts.get(task.task_type, 10)
204 |
205 | # Generate realistic insights
206 | insights = [
207 | f"{task.task_type} insight {i + 1} for {task.target_topic[:20]}"
208 | for i in range(insight_count)
209 | ]
210 |
211 | # Add context-specific data
212 | sources = [
213 | {
214 | "title": f"Source {i + 1} for {task.task_type} research",
215 | "url": f"https://example.com/research/{task.task_type}/{i}",
216 | "credibility_score": random.uniform(0.6, 0.95),
217 | "published_date": datetime.now().isoformat(),
218 | "content_snippet": f"Content from source {i + 1} "
219 | * random.randint(10, 50),
220 | "relevance_score": random.uniform(0.7, 1.0),
221 | }
222 | for i in range(random.randint(3, 8))
223 | ]
224 |
225 | return {
226 | "research_type": task.task_type,
227 | "insights": insights,
228 | "risk_factors": [
229 | f"{task.task_type} risk factor {i + 1}"
230 | for i in range(random.randint(3, 7))
231 | ],
232 | "opportunities": [
233 | f"{task.task_type} opportunity {i + 1}"
234 | for i in range(random.randint(3, 7))
235 | ],
236 | "sentiment": {
237 | "direction": random.choice(["bullish", "bearish", "neutral"]),
238 | "confidence": random.uniform(0.5, 0.9),
239 | "consensus": random.uniform(0.4, 0.8),
240 | },
241 | "credibility_score": random.uniform(0.6, 0.9),
242 | "sources": sources,
243 | "focus_areas": task.focus_areas,
244 | "metadata": {
245 | "execution_time": random.uniform(0.1, 0.5),
246 | "api_calls_made": random.randint(2, 8),
247 | "cache_hits": random.randint(0, 4),
248 | "cache_misses": random.randint(1, 6),
249 | },
250 | # Add some large data structures to test memory usage
251 | "detailed_analysis": {
252 | f"analysis_point_{i}": f"Detailed analysis content {i} "
253 | * random.randint(50, 200)
254 | for i in range(random.randint(5, 15))
255 | },
256 | }
257 |
258 | def get_execution_stats(self) -> dict[str, Any]:
259 | """Get detailed execution statistics."""
260 | if not self.execution_log:
261 | return {"total_executions": 0}
262 |
263 | execution_times = [log["execution_time"] for log in self.execution_log]
264 | response_sizes = [log["response_size"] for log in self.execution_log]
265 |
266 | return {
267 | "total_executions": len(self.execution_log),
268 | "avg_execution_time": statistics.mean(execution_times),
269 | "median_execution_time": statistics.median(execution_times),
270 | "avg_response_size": statistics.mean(response_sizes),
271 | "total_response_size": sum(response_sizes),
272 | "task_type_distribution": {
273 | task_type: len(
274 | [log for log in self.execution_log if log["task_type"] == task_type]
275 | )
276 | for task_type in [
277 | "fundamental",
278 | "technical",
279 | "sentiment",
280 | "competitive",
281 | ]
282 | },
283 | }
284 |
285 |
286 | class PerformanceTester:
287 | """Manages performance test execution and validation."""
288 |
289 | @staticmethod
290 | def create_test_tasks(
291 | agent_count: int, session_id: str = "perf_test"
292 | ) -> list[ResearchTask]:
293 | """Create realistic research tasks for performance testing."""
294 | topics = [
295 | "Apple Inc financial analysis and market outlook",
296 | "Tesla Inc competitive position and technical analysis",
297 | "Microsoft Corp sentiment analysis and growth prospects",
298 | "NVIDIA Corp fundamental analysis and AI sector outlook",
299 | "Amazon Inc market position and e-commerce trends",
300 | "Google Inc advertising revenue and cloud competition",
301 | ]
302 |
303 | tasks = []
304 | for i in range(agent_count):
305 | topic = topics[i % len(topics)]
306 | task_types = ["fundamental", "technical", "sentiment", "competitive"]
307 | task_type = task_types[i % len(task_types)]
308 |
309 | task = ResearchTask(
310 | task_id=f"{session_id}_{task_type}_{i}",
311 | task_type=task_type,
312 | target_topic=topic,
313 | focus_areas=[f"focus_{i}", f"area_{i % 3}"],
314 | priority=random.randint(5, 9),
315 | timeout=10, # Short timeout for CI
316 | )
317 | tasks.append(task)
318 |
319 | return tasks
320 |
321 | @staticmethod
322 | async def run_sequential_baseline(
323 | tasks: list[ResearchTask], executor: MockResearchExecutor
324 | ) -> dict[str, Any]:
325 | """Run tasks sequentially to establish performance baseline."""
326 | start_time = time.time()
327 | results = []
328 |
329 | for task in tasks:
330 | try:
331 | result = await executor(task)
332 | results.append({"task": task, "result": result, "status": "success"})
333 | except Exception as e:
334 | results.append({"task": task, "error": str(e), "status": "failed"})
335 |
336 | execution_time = time.time() - start_time
337 | successful_results = [r for r in results if r["status"] == "success"]
338 |
339 | return {
340 | "execution_time": execution_time,
341 | "successful_tasks": len(successful_results),
342 | "failed_tasks": len(results) - len(successful_results),
343 | "results": results,
344 | }
345 |
346 | @staticmethod
347 | async def run_parallel_test(
348 | tasks: list[ResearchTask],
349 | config: ParallelResearchConfig,
350 | executor: MockResearchExecutor,
351 | ) -> ResearchResult:
352 | """Run parallel research test with orchestrator."""
353 | orchestrator = ParallelResearchOrchestrator(config)
354 |
355 | # Mock synthesis callback
356 | async def mock_synthesis(
357 | task_results: dict[str, ResearchTask],
358 | ) -> dict[str, Any]:
359 | successful_results = [
360 | task.result
361 | for task in task_results.values()
362 | if task.status == "completed" and task.result
363 | ]
364 |
365 | return {
366 | "synthesis": f"Mock synthesis from {len(successful_results)} results",
367 | "confidence_score": random.uniform(0.7, 0.9),
368 | "key_findings": [
369 | f"Finding {i}" for i in range(min(len(successful_results), 5))
370 | ],
371 | }
372 |
373 | return await orchestrator.execute_parallel_research(
374 | tasks=tasks,
375 | research_executor=executor,
376 | synthesis_callback=mock_synthesis,
377 | )
378 |
379 | @staticmethod
380 | def validate_performance_thresholds(
381 | agent_count: int,
382 | sequential_time: float,
383 | parallel_time: float,
384 | memory_multiplier: float,
385 |         success_threshold: float = 0.8,  # require at least 80% of the expected speedup
386 |     ) -> dict[str, Any]:
387 | """Validate performance against defined thresholds."""
388 | speedup = sequential_time / max(parallel_time, 0.001) # Avoid division by zero
389 | expected_speedup = PERFORMANCE_THRESHOLDS["speedup"].get(agent_count, 1.0)
390 | max_memory_multiplier = PERFORMANCE_THRESHOLDS["memory_multiplier"]
391 |
392 | return {
393 | "speedup_threshold_met": speedup >= expected_speedup * success_threshold,
394 | "memory_threshold_met": memory_multiplier <= max_memory_multiplier,
395 | "speedup_achieved": speedup,
396 | "speedup_expected": expected_speedup,
397 | "memory_multiplier": memory_multiplier,
398 | "memory_limit": max_memory_multiplier,
399 | }
400 |
401 |
402 | # Test fixtures
403 | @pytest.fixture
404 | def performance_monitor():
405 | """Create performance monitor for tests."""
406 |
407 | def _create_monitor(test_name: str):
408 | return PerformanceMonitor(test_name)
409 |
410 | return _create_monitor
411 |
412 |
413 | @pytest.fixture
414 | def mock_executor_factory():
415 | """Factory for creating mock research executors with different configurations."""
416 |
417 | def _create_executor(**kwargs):
418 | return MockResearchExecutor(**kwargs)
419 |
420 | return _create_executor
421 |
422 |
423 | @pytest.fixture
424 | def performance_tester():
425 | """Provide PerformanceTester utility."""
426 | return PerformanceTester()
427 |
428 |
429 | # Unit Performance Tests
430 | @pytest.mark.unit
431 | class TestParallelResearchPerformance:
432 | """Core performance tests for parallel research system."""
433 |
434 | @pytest.mark.parametrize("agent_count", [2, 4])
435 | @pytest.mark.parametrize(
436 | "failure_rate", FAILURE_RATES[:2]
437 | ) # 0% and 10% for unit tests
438 | async def test_parallel_speedup_thresholds(
439 | self,
440 | agent_count: int,
441 | failure_rate: float,
442 | performance_monitor,
443 | mock_executor_factory,
444 | performance_tester,
445 | ):
446 | """Test that parallel execution meets minimum speedup thresholds."""
447 | test_name = f"speedup_{agent_count}agents_{int(failure_rate * 100)}pct_failure"
448 |
449 | # Run multiple iterations for statistical significance
450 | speedup_results = []
451 | memory_results = []
452 |
453 | for run in range(STATISTICAL_RUNS):
454 | with performance_monitor(f"{test_name}_run{run}") as monitor:
455 | # Create test configuration
456 | config = ParallelResearchConfig(
457 | max_concurrent_agents=agent_count,
458 | timeout_per_agent=5,
459 | enable_fallbacks=True,
460 | rate_limit_delay=0.05, # Fast for testing
461 | )
462 |
463 | # Create tasks and executor
464 | tasks = performance_tester.create_test_tasks(
465 | agent_count, f"speedup_test_{run}"
466 | )
467 | executor = mock_executor_factory(
468 | failure_rate=failure_rate,
469 | base_delay=0.1,
470 | delay_variance=0.1,
471 | )
472 |
473 | # Sample CPU/memory during test
474 | monitor.sample_cpu()
475 |
476 | # Run sequential baseline
477 | sequential_start = time.time()
478 | await performance_tester.run_sequential_baseline(tasks, executor)
479 | sequential_time = time.time() - sequential_start
480 |
481 | monitor.sample_cpu()
482 |
483 | # Reset executor for parallel test
484 | executor.execution_count = 0
485 | executor.execution_log.clear()
486 |
487 | # Run parallel test
488 | parallel_start = time.time()
489 | await performance_tester.run_parallel_test(tasks, config, executor)
490 | parallel_time = time.time() - parallel_start
491 |
492 | monitor.sample_cpu()
493 |
494 | # Calculate metrics
495 | speedup = sequential_time / max(parallel_time, 0.001)
496 | speedup_results.append(speedup)
497 | memory_results.append(monitor.metrics["memory_multiplier"])
498 |
499 | logger.info(
500 | f"Run {run + 1}: Sequential: {sequential_time:.3f}s, "
501 | f"Parallel: {parallel_time:.3f}s, Speedup: {speedup:.2f}x"
502 | )
503 |
504 | # Statistical analysis
505 | avg_speedup = statistics.mean(speedup_results)
506 | median_speedup = statistics.median(speedup_results)
507 | avg_memory_multiplier = statistics.mean(memory_results)
508 |
509 | # Validate against thresholds
510 | expected_speedup = PERFORMANCE_THRESHOLDS["speedup"][agent_count]
511 | validation = performance_tester.validate_performance_thresholds(
512 | agent_count=agent_count,
513 | sequential_time=1.0, # Normalized
514 | parallel_time=1.0 / avg_speedup, # Normalized
515 | memory_multiplier=avg_memory_multiplier,
516 | )
517 |
518 | logger.info(
519 | f"Performance summary for {agent_count} agents, {failure_rate * 100}% failure rate: "
520 | f"Avg speedup: {avg_speedup:.2f}x (expected: {expected_speedup:.2f}x), "
521 | f"Memory multiplier: {avg_memory_multiplier:.2f}x"
522 | )
523 |
524 | # Assertions with clear failure messages
525 | assert validation["speedup_threshold_met"], (
526 | f"Speedup threshold not met: achieved {avg_speedup:.2f}x, "
527 |             f"expected {expected_speedup:.2f}x (within the 80% tolerance factor)"
528 | )
529 |
530 | assert validation["memory_threshold_met"], (
531 | f"Memory threshold exceeded: {avg_memory_multiplier:.2f}x > "
532 | f"{PERFORMANCE_THRESHOLDS['memory_multiplier']}x limit"
533 | )
534 |
535 | # Performance characteristics validation
536 | assert median_speedup > 1.0, "Parallel execution should show some speedup"
537 | assert all(m < 10.0 for m in memory_results), (
538 | "Memory usage should be reasonable"
539 | )
540 |
541 | async def test_performance_under_failures(
542 | self, performance_monitor, mock_executor_factory, performance_tester
543 | ):
544 | """Test performance degradation under different failure scenarios."""
545 | agent_count = 4
546 | test_name = "failure_resilience_test"
547 |
548 | results = {}
549 |
550 | for failure_rate in FAILURE_RATES:
551 | with performance_monitor(
552 | f"{test_name}_{int(failure_rate * 100)}pct"
553 | ) as monitor:
554 | config = ParallelResearchConfig(
555 | max_concurrent_agents=agent_count,
556 | timeout_per_agent=3,
557 | enable_fallbacks=True,
558 | rate_limit_delay=0.02,
559 | )
560 |
561 | tasks = performance_tester.create_test_tasks(
562 | agent_count, f"failure_test_{int(failure_rate * 100)}"
563 | )
564 | executor = mock_executor_factory(
565 | failure_rate=failure_rate,
566 | base_delay=0.05,
567 | include_timeouts=False, # No timeouts for this test
568 | )
569 |
570 | monitor.sample_cpu()
571 | parallel_result = await performance_tester.run_parallel_test(
572 | tasks, config, executor
573 | )
574 | monitor.sample_cpu()
575 |
576 | results[failure_rate] = {
577 | "successful_tasks": parallel_result.successful_tasks,
578 | "failed_tasks": parallel_result.failed_tasks,
579 | "execution_time": monitor.metrics["execution_time"],
580 | "memory_multiplier": monitor.metrics["memory_multiplier"],
581 | "success_rate": parallel_result.successful_tasks
582 | / (parallel_result.successful_tasks + parallel_result.failed_tasks),
583 | }
584 |
585 | # Validate failure handling - adjusted for realistic expectations
586 | assert results[0.0]["success_rate"] == 1.0, (
587 | "Zero failure rate should achieve 100% success"
588 | )
589 | assert results[0.10]["success_rate"] >= 0.7, (
590 | "10% failure rate should maintain >70% success"
591 | )
592 | assert results[0.25]["success_rate"] >= 0.5, (
593 | "25% failure rate should maintain >50% success"
594 | )
595 |
596 | # Validate performance doesn't degrade drastically with failures
597 | baseline_time = results[0.0]["execution_time"]
598 | assert results[0.10]["execution_time"] <= baseline_time * 1.5, (
599 | "10% failure shouldn't increase time by >50%"
600 | )
601 | assert results[0.25]["execution_time"] <= baseline_time * 2.0, (
602 | "25% failure shouldn't double execution time"
603 | )
604 |
605 | logger.info("Failure resilience test completed successfully")
606 |
607 | async def test_memory_usage_patterns(
608 | self, performance_monitor, mock_executor_factory, performance_tester
609 | ):
610 | """Test memory usage patterns across different agent counts."""
611 | memory_results = {}
612 |
613 | for agent_count in [1, 2, 4]:
614 | with performance_monitor(f"memory_test_{agent_count}_agents") as monitor:
615 | config = ParallelResearchConfig(
616 | max_concurrent_agents=agent_count,
617 | timeout_per_agent=5,
618 | enable_fallbacks=True,
619 | rate_limit_delay=0.01,
620 | )
621 |
622 | # Create larger dataset to test memory scaling
623 | tasks = performance_tester.create_test_tasks(
624 | agent_count * 2, f"memory_test_{agent_count}"
625 | )
626 | executor = mock_executor_factory(
627 | failure_rate=0.0,
628 | base_delay=0.05, # Short delay to focus on memory
629 | )
630 |
631 | # Force garbage collection before test
632 | gc.collect()
633 | monitor.sample_cpu()
634 |
635 | # Run test with memory monitoring
636 | result = await performance_tester.run_parallel_test(
637 | tasks, config, executor
638 | )
639 |
640 | # Sample memory again
641 | monitor.sample_cpu()
642 |
643 | # Force another GC to see post-test memory
644 | gc.collect()
645 | await asyncio.sleep(0.1) # Allow cleanup
646 |
647 | memory_results[agent_count] = {
648 | "peak_memory_mb": monitor.metrics["peak_memory_mb"],
649 | "memory_increase_mb": monitor.metrics["memory_increase_mb"],
650 | "memory_multiplier": monitor.metrics["memory_multiplier"],
651 | "successful_tasks": result.successful_tasks,
652 | }
653 |
654 | # Validate memory scaling is reasonable
655 | baseline_memory = memory_results[1]["peak_memory_mb"]
656 | memory_4x = memory_results[4]["peak_memory_mb"]
657 |
658 | # 4x agents should not use more than 3x memory
659 | memory_scaling = memory_4x / baseline_memory
660 | assert memory_scaling <= PERFORMANCE_THRESHOLDS["memory_multiplier"], (
661 | f"Memory scaling too high: {memory_scaling:.2f}x > "
662 | f"{PERFORMANCE_THRESHOLDS['memory_multiplier']}x limit"
663 | )
664 |
665 | # Memory usage should scale sub-linearly (better than linear)
666 | assert memory_scaling < 4.0, "Memory should scale sub-linearly with agent count"
667 |
668 | logger.info(f"Memory scaling from 1 to 4 agents: {memory_scaling:.2f}x")
669 |
670 |
671 | # Slow/Integration Performance Tests
672 | @pytest.mark.slow
673 | class TestParallelResearchIntegrationPerformance:
674 | """Integration performance tests with more realistic scenarios."""
675 |
676 | async def test_deep_research_agent_parallel_integration(
677 | self, performance_monitor, mock_executor_factory
678 | ):
679 | """Test DeepResearchAgent with parallel execution enabled."""
680 | with performance_monitor("deep_research_agent_parallel") as monitor:
681 | # Mock LLM for DeepResearchAgent
682 | mock_llm = AsyncMock()
683 | mock_llm.ainvoke.return_value.content = '{"analysis": "test analysis"}'
684 |
685 | # Create agent with parallel execution enabled
686 | agent = DeepResearchAgent(
687 | llm=mock_llm,
688 | persona="moderate",
689 | enable_parallel_execution=True,
690 | parallel_config=ParallelResearchConfig(
691 | max_concurrent_agents=3,
692 | timeout_per_agent=5,
693 | enable_fallbacks=True,
694 | ),
695 | )
696 |
697 | monitor.sample_cpu()
698 |
699 | # Mock the subagent execution to avoid real API calls
700 | async def mock_subagent_execution(task: ResearchTask) -> dict[str, Any]:
701 | await asyncio.sleep(0.1) # Simulate work
702 | return {
703 | "research_type": task.task_type,
704 | "insights": [f"Mock insight for {task.task_type}"],
705 | "sentiment": {"direction": "neutral", "confidence": 0.7},
706 | "credibility_score": 0.8,
707 | "sources": [{"title": "Mock source", "url": "http://example.com"}],
708 | }
709 |
710 | # Override the subagent execution method
711 | agent._execute_subagent_task = mock_subagent_execution
712 |
713 | # Run comprehensive research
714 | result = await agent.research_comprehensive(
715 | topic="Apple Inc comprehensive analysis",
716 | session_id="integration_test",
717 | depth="standard",
718 | use_parallel_execution=True,
719 | )
720 |
721 | monitor.sample_cpu()
722 |
723 | # Validate integration results
724 | assert result["status"] == "success"
725 | assert result["execution_mode"] == "parallel"
726 | assert "parallel_execution_stats" in result
727 | assert result["parallel_execution_stats"]["total_tasks"] > 0
728 | assert result["execution_time_ms"] > 0
729 |
730 | # Performance validation
731 | execution_time_seconds = result["execution_time_ms"] / 1000
732 | assert execution_time_seconds < 10.0, (
733 | "Integration test should complete within 10s"
734 | )
735 | assert monitor.metrics["memory_multiplier"] < 5.0, (
736 | "Memory usage should be reasonable"
737 | )
738 |
739 | logger.info("Deep research agent integration test passed")
740 |
741 | @pytest.mark.parametrize(
742 | "failure_rate", [0.25]
743 | ) # Only high failure rate for slow tests
744 | async def test_high_failure_rate_resilience(
745 | self,
746 | failure_rate: float,
747 | performance_monitor,
748 | mock_executor_factory,
749 | performance_tester,
750 | ):
751 | """Test system resilience under high failure rates."""
752 | agent_count = 6 # More agents for integration testing
753 | test_name = f"high_failure_resilience_{int(failure_rate * 100)}pct"
754 |
755 | resilience_results = []
756 |
757 | for run in range(STATISTICAL_RUNS):
758 | with performance_monitor(f"{test_name}_run{run}") as monitor:
759 | config = ParallelResearchConfig(
760 | max_concurrent_agents=4, # Limit concurrency
761 | timeout_per_agent=8, # Longer timeout for resilience
762 | enable_fallbacks=True,
763 | rate_limit_delay=0.1,
764 | )
765 |
766 | tasks = performance_tester.create_test_tasks(
767 | agent_count, f"resilience_test_{run}"
768 | )
769 | executor = mock_executor_factory(
770 | failure_rate=failure_rate,
771 | base_delay=0.2, # Longer delays to simulate real API calls
772 | delay_variance=0.3,
773 | include_timeouts=True, # Include timeout scenarios
774 | )
775 |
776 | monitor.sample_cpu()
777 |
778 | try:
779 | result = await performance_tester.run_parallel_test(
780 | tasks, config, executor
781 | )
782 |
783 | success_rate = result.successful_tasks / (
784 | result.successful_tasks + result.failed_tasks
785 | )
786 | resilience_results.append(
787 | {
788 | "success_rate": success_rate,
789 | "execution_time": monitor.metrics["execution_time"],
790 | "memory_multiplier": monitor.metrics["memory_multiplier"],
791 | "parallel_efficiency": result.parallel_efficiency,
792 | }
793 | )
794 |
795 | except Exception as e:
796 | logger.warning(f"Resilience test run {run} failed: {e}")
797 | resilience_results.append(
798 | {
799 | "success_rate": 0.0,
800 | "execution_time": monitor.metrics["execution_time"],
801 | "memory_multiplier": monitor.metrics["memory_multiplier"],
802 | "parallel_efficiency": 0.0,
803 | }
804 | )
805 |
806 | monitor.sample_cpu()
807 |
808 | # Analyze resilience results
809 | avg_success_rate = statistics.mean(
810 | [r["success_rate"] for r in resilience_results]
811 | )
812 | avg_execution_time = statistics.mean(
813 | [r["execution_time"] for r in resilience_results]
814 | )
815 |
816 | # Validate system maintained reasonable performance under stress
817 | min_acceptable_success = 0.5 # 50% success rate under 25% failure
818 | assert avg_success_rate >= min_acceptable_success, (
819 | f"System not resilient enough: {avg_success_rate:.2f} < {min_acceptable_success}"
820 | )
821 |
822 | assert avg_execution_time < 20.0, (
823 | "High failure rate tests should still complete reasonably"
824 | )
825 |
826 | logger.info(
827 | f"High failure rate resilience test: {avg_success_rate:.2f} average success rate, "
828 | f"{avg_execution_time:.2f}s average execution time"
829 | )
830 |
831 |
832 | # Test Suite Performance Validation
833 | @pytest.mark.unit
834 | class TestSuitePerformance:
835 | """Validate overall test suite performance characteristics."""
836 |
837 | async def test_total_test_duration_under_threshold(self, performance_monitor):
838 | """Validate that core performance tests complete within time budget."""
839 | with performance_monitor("test_suite_duration") as monitor:
840 | # Simulate running the key performance tests
841 | await asyncio.sleep(0.1) # Placeholder for actual test execution
842 | monitor.sample_cpu()
843 |
844 | # This is a meta-test that would be updated based on actual suite performance
845 | # For now, we validate the monitoring infrastructure works
846 | assert monitor.metrics["execution_time"] < 1.0, (
847 | "Meta-test should complete quickly"
848 | )
849 | assert monitor.metrics["memory_multiplier"] < 2.0, (
850 | "Meta-test should use minimal memory"
851 | )
852 |
853 | def test_performance_threshold_configuration(self):
854 | """Validate performance threshold configuration is reasonable."""
855 | # Test threshold sanity checks
856 | assert PERFORMANCE_THRESHOLDS["speedup"][2] > 1.0, (
857 | "2-agent speedup should exceed 1x"
858 | )
859 | assert (
860 | PERFORMANCE_THRESHOLDS["speedup"][4] > PERFORMANCE_THRESHOLDS["speedup"][2]
861 | ), "4-agent speedup should exceed 2-agent"
862 | assert PERFORMANCE_THRESHOLDS["memory_multiplier"] > 1.0, (
863 | "Memory multiplier should allow some increase"
864 | )
865 | assert PERFORMANCE_THRESHOLDS["memory_multiplier"] < 10.0, (
866 | "Memory multiplier should be reasonable"
867 | )
868 | assert PERFORMANCE_THRESHOLDS["max_test_duration"] > 10.0, (
869 | "Test duration budget should be reasonable"
870 | )
871 |
872 | # Test failure rate configuration
873 | assert all(0.0 <= rate <= 1.0 for rate in FAILURE_RATES), (
874 | "Failure rates should be valid percentages"
875 | )
876 | assert len(set(FAILURE_RATES)) == len(FAILURE_RATES), (
877 | "Failure rates should be unique"
878 | )
879 |
880 | # Test statistical significance configuration
881 | assert STATISTICAL_RUNS >= 3, (
882 | "Statistical runs should provide reasonable sample size"
883 | )
884 | assert STATISTICAL_RUNS <= 10, "Statistical runs should not be excessive for CI"
885 |
886 |
887 | if __name__ == "__main__":
888 | # Allow running individual performance tests for development
889 | import sys
890 |
891 | if len(sys.argv) > 1:
892 | pytest.main([sys.argv[1], "-v", "-s", "--tb=short"])
893 | else:
894 | # Run unit performance tests by default
895 | pytest.main([__file__, "-v", "-s", "-m", "unit", "--tb=short"])
896 |
```
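The threshold logic documented at the top of this file reduces to two comparisons: the measured speedup (sequential time divided by parallel time) must reach at least 80% of the per-agent-count minimum, and the end-to-start memory ratio must stay at or below 3x. A minimal standalone sketch of that arithmetic, reusing the same constants; the `check_thresholds` helper below is illustrative and not part of the repository:

```python
# Illustrative sketch of the check performed by
# PerformanceTester.validate_performance_thresholds (not repository code).

THRESHOLDS = {"speedup": {2: 1.3, 4: 2.0}, "memory_multiplier": 3.0}


def check_thresholds(
    agent_count: int,
    sequential_time: float,
    parallel_time: float,
    memory_multiplier: float,
    tolerance: float = 0.8,  # accept 80% of the expected speedup
) -> dict[str, bool]:
    """Return pass/fail flags for the speedup and memory thresholds."""
    speedup = sequential_time / max(parallel_time, 0.001)  # avoid division by zero
    expected = THRESHOLDS["speedup"].get(agent_count, 1.0)
    return {
        "speedup_ok": speedup >= expected * tolerance,
        "memory_ok": memory_multiplier <= THRESHOLDS["memory_multiplier"],
    }


# Example: 4 agents, 2.0s sequential vs 1.1s parallel, 2.4x memory growth.
print(check_thresholds(4, 2.0, 1.1, 2.4))
# -> {'speedup_ok': True, 'memory_ok': True}  (1.82x >= 2.0x * 0.8; 2.4x <= 3.0x)
```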
--------------------------------------------------------------------------------
/tests/integration/test_mcp_tools.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | MCP Tools Integration Tests for Claude Desktop Interaction.
3 |
4 | This test suite covers:
5 | - All MCP tool registrations and functionality
6 | - Tool parameter validation and error handling
7 | - Tool response formats and data integrity
8 | - Claude Desktop simulation and interaction patterns
9 | - Real-world usage scenarios
10 | - Performance and timeout handling
11 | """
12 |
13 | import asyncio
14 | import logging
15 | from unittest.mock import Mock, patch
16 |
17 | import numpy as np
18 | import pytest
19 | from fastmcp import Context
20 |
21 | from maverick_mcp.api.routers.backtesting import setup_backtesting_tools
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 | class MockFastMCP:
27 | """Mock FastMCP instance for testing tool registration."""
28 |
29 | def __init__(self):
30 | self.tools = {}
31 | self.tool_functions = {}
32 |
33 |     def tool(self, name: str | None = None):
34 | """Mock tool decorator."""
35 |
36 | def decorator(func):
37 | tool_name = name or func.__name__
38 | self.tools[tool_name] = {
39 | "function": func,
40 | "name": tool_name,
41 | "signature": self._get_function_signature(func),
42 | }
43 | self.tool_functions[tool_name] = func
44 | return func
45 |
46 | return decorator
47 |
48 | def _get_function_signature(self, func):
49 | """Extract function signature for validation."""
50 | import inspect
51 |
52 | sig = inspect.signature(func)
53 | return {
54 | "parameters": list(sig.parameters.keys()),
55 | "annotations": {k: str(v.annotation) for k, v in sig.parameters.items()},
56 | }
57 |
58 |
59 | class TestMCPToolsIntegration:
60 | """Integration tests for MCP tools and Claude Desktop interaction."""
61 |
62 | @pytest.fixture
63 | def mock_mcp(self):
64 | """Create mock FastMCP instance."""
65 | return MockFastMCP()
66 |
67 | @pytest.fixture
68 | def mock_context(self):
69 | """Create mock MCP context."""
70 | context = Mock(spec=Context)
71 | context.session = {}
72 | return context
73 |
74 | @pytest.fixture
75 | async def setup_tools(self, mock_mcp):
76 | """Set up all backtesting tools for testing."""
77 | setup_backtesting_tools(mock_mcp)
78 | return mock_mcp
79 |
80 | async def test_all_mcp_tools_registration(self, setup_tools):
81 | """Test that all MCP tools are properly registered."""
82 | mcp = setup_tools
83 |
84 | # Expected tools from backtesting router
85 | expected_tools = [
86 | "run_backtest",
87 | "optimize_strategy",
88 | "walk_forward_analysis",
89 | "monte_carlo_simulation",
90 | "compare_strategies",
91 | "list_strategies",
92 | "parse_strategy",
93 | "backtest_portfolio",
94 | "generate_backtest_charts",
95 | "generate_optimization_charts",
96 | "run_ml_strategy_backtest",
97 | "train_ml_predictor",
98 | "analyze_market_regimes",
99 | "create_strategy_ensemble",
100 | ]
101 |
102 | # Check all tools are registered
103 | registered_tools = set(mcp.tools.keys())
104 | expected_set = set(expected_tools)
105 |
106 | missing_tools = expected_set - registered_tools
107 | extra_tools = registered_tools - expected_set
108 |
109 | assert len(missing_tools) == 0, f"Missing tools: {missing_tools}"
110 |
111 | logger.info(f"✓ All {len(registered_tools)} MCP tools registered successfully")
112 | if extra_tools:
113 | logger.info(f"Additional tools found: {extra_tools}")
114 |
115 | # Validate each tool has proper signature
116 | for tool_name, tool_info in mcp.tools.items():
117 | assert callable(tool_info["function"]), f"Tool {tool_name} is not callable"
118 | assert "signature" in tool_info, f"Tool {tool_name} missing signature"
119 |
120 | return {
121 | "registered_tools": list(registered_tools),
122 | "tool_count": len(registered_tools),
123 | }
124 |
125 | async def test_run_backtest_tool_comprehensive(self, setup_tools, mock_context):
126 | """Test run_backtest tool with comprehensive parameter validation."""
127 | mcp = setup_tools
128 | tool_func = mcp.tool_functions["run_backtest"]
129 |
130 | # Test cases with different parameter combinations
131 | test_cases = [
132 | {
133 | "name": "basic_sma_cross",
134 | "params": {
135 | "symbol": "AAPL",
136 | "strategy": "sma_cross",
137 | "fast_period": "10",
138 | "slow_period": "20",
139 | },
140 | "should_succeed": True,
141 | },
142 | {
143 | "name": "rsi_strategy",
144 | "params": {
145 | "symbol": "GOOGL",
146 | "strategy": "rsi",
147 | "period": "14",
148 | "oversold": "30",
149 | "overbought": "70",
150 | },
151 | "should_succeed": True,
152 | },
153 | {
154 | "name": "invalid_symbol",
155 | "params": {
156 | "symbol": "", # Empty symbol
157 | "strategy": "sma_cross",
158 | },
159 | "should_succeed": False,
160 | },
161 | {
162 | "name": "invalid_strategy",
163 | "params": {
164 | "symbol": "AAPL",
165 | "strategy": "nonexistent_strategy",
166 | },
167 | "should_succeed": False,
168 | },
169 | {
170 | "name": "invalid_numeric_params",
171 | "params": {
172 | "symbol": "AAPL",
173 | "strategy": "sma_cross",
174 | "fast_period": "invalid_number",
175 | },
176 | "should_succeed": False,
177 | },
178 | ]
179 |
180 | results = {}
181 |
182 | for test_case in test_cases:
183 | try:
184 | # Mock the VectorBT engine to avoid actual data fetching
185 | with patch("maverick_mcp.backtesting.VectorBTEngine") as mock_engine:
186 | mock_instance = Mock()
187 | mock_engine.return_value = mock_instance
188 |
189 | # Mock successful backtest result
190 | mock_result = {
191 | "symbol": test_case["params"]["symbol"],
192 | "strategy_type": test_case["params"]["strategy"],
193 | "metrics": {
194 | "total_return": 0.15,
195 | "sharpe_ratio": 1.2,
196 | "max_drawdown": -0.12,
197 | "total_trades": 25,
198 | },
199 | "trades": [],
200 | "equity_curve": [10000, 10100, 10200, 10300],
201 | "drawdown_series": [0, -0.01, -0.02, 0],
202 | }
203 | mock_instance.run_backtest.return_value = mock_result
204 |
205 | # Execute tool
206 | result = await tool_func(mock_context, **test_case["params"])
207 |
208 | if test_case["should_succeed"]:
209 | assert isinstance(result, dict), (
210 | f"Result should be dict for {test_case['name']}"
211 | )
212 | assert "symbol" in result, (
213 | f"Missing symbol in result for {test_case['name']}"
214 | )
215 | assert "metrics" in result, (
216 | f"Missing metrics in result for {test_case['name']}"
217 | )
218 | results[test_case["name"]] = {"success": True, "result": result}
219 | logger.info(f"✓ {test_case['name']} succeeded as expected")
220 | else:
221 | # If we got here, it didn't fail as expected
222 | results[test_case["name"]] = {
223 | "success": False,
224 | "unexpected_success": True,
225 | }
226 | logger.warning(
227 | f"⚠ {test_case['name']} succeeded but was expected to fail"
228 | )
229 |
230 | except Exception as e:
231 | if test_case["should_succeed"]:
232 | results[test_case["name"]] = {"success": False, "error": str(e)}
233 | logger.error(f"✗ {test_case['name']} failed unexpectedly: {e}")
234 | else:
235 | results[test_case["name"]] = {
236 | "success": True,
237 | "expected_error": str(e),
238 | }
239 | logger.info(f"✓ {test_case['name']} failed as expected: {e}")
240 |
241 | # Calculate success rate
242 | total_tests = len(test_cases)
243 | successful_tests = sum(1 for r in results.values() if r.get("success", False))
244 | success_rate = successful_tests / total_tests
245 |
246 | assert success_rate >= 0.8, f"Success rate too low: {success_rate:.1%}"
247 |
248 | return {"test_results": results, "success_rate": success_rate}
249 |
250 | async def test_strategy_tools_integration(self, setup_tools, mock_context):
251 | """Test strategy-related tools integration."""
252 | mcp = setup_tools
253 |
254 | # Test list_strategies tool
255 | list_func = mcp.tool_functions["list_strategies"]
256 | strategies_result = await list_func(mock_context)
257 |
258 | assert isinstance(strategies_result, dict), "list_strategies should return dict"
259 | assert "available_strategies" in strategies_result, (
260 | "Missing available_strategies"
261 | )
262 | assert "total_count" in strategies_result, "Missing total_count"
263 | assert strategies_result["total_count"] > 0, "Should have strategies available"
264 |
265 | logger.info(f"✓ Found {strategies_result['total_count']} available strategies")
266 |
267 | # Test parse_strategy tool
268 | parse_func = mcp.tool_functions["parse_strategy"]
269 |
270 | parse_test_cases = [
271 | "Buy when RSI is below 30 and sell when above 70",
272 | "Use 10-day and 20-day moving average crossover",
273 | "MACD strategy with standard parameters",
274 | "Invalid strategy description that makes no sense",
275 | ]
276 |
277 | parse_results = {}
278 | for description in parse_test_cases:
279 | try:
280 | result = await parse_func(mock_context, description=description)
281 | assert isinstance(result, dict), "parse_strategy should return dict"
282 | assert "success" in result, "Missing success field"
283 | assert "strategy" in result, "Missing strategy field"
284 |
285 | parse_results[description] = result
286 | status = "✓" if result["success"] else "⚠"
287 | logger.info(
288 | f"{status} Parsed: '{description}' -> {result['strategy'].get('strategy_type', 'unknown')}"
289 | )
290 |
291 | except Exception as e:
292 | parse_results[description] = {"error": str(e)}
293 | logger.error(f"✗ Parse failed for: '{description}' - {e}")
294 |
295 | return {
296 | "strategies_list": strategies_result,
297 | "parse_results": parse_results,
298 | }
299 |
300 | async def test_optimization_tools_integration(self, setup_tools, mock_context):
301 | """Test optimization-related tools integration."""
302 | mcp = setup_tools
303 |
304 | # Mock VectorBT engine for optimization tests
305 | with patch("maverick_mcp.backtesting.VectorBTEngine") as mock_engine_class:
306 | mock_engine = Mock()
307 | mock_engine_class.return_value = mock_engine
308 |
309 | # Mock optimization results
310 | mock_optimization_result = {
311 | "best_parameters": {"fast_period": 12, "slow_period": 26},
312 | "best_performance": {
313 | "total_return": 0.25,
314 | "sharpe_ratio": 1.8,
315 | "max_drawdown": -0.08,
316 | },
317 | "optimization_results": [
318 | {
319 | "parameters": {"fast_period": 10, "slow_period": 20},
320 | "metrics": {"sharpe_ratio": 1.2},
321 | },
322 | {
323 | "parameters": {"fast_period": 12, "slow_period": 26},
324 | "metrics": {"sharpe_ratio": 1.8},
325 | },
326 | ],
327 | }
328 | mock_engine.optimize_parameters.return_value = mock_optimization_result
329 |
330 | # Test optimize_strategy tool
331 | optimize_func = mcp.tool_functions["optimize_strategy"]
332 |
333 | result = await optimize_func(
334 | mock_context,
335 | symbol="AAPL",
336 | strategy="sma_cross",
337 | optimization_level="medium",
338 | top_n=5,
339 | )
340 |
341 | assert isinstance(result, dict), "optimize_strategy should return dict"
342 | logger.info("✓ optimize_strategy tool executed successfully")
343 |
344 | # Test walk_forward_analysis tool
345 | walk_forward_func = mcp.tool_functions["walk_forward_analysis"]
346 |
347 | # Mock walk-forward analysis
348 | with patch(
349 | "maverick_mcp.backtesting.StrategyOptimizer"
350 | ) as mock_optimizer_class:
351 | mock_optimizer = Mock()
352 | mock_optimizer_class.return_value = mock_optimizer
353 |
354 | mock_walk_forward_result = {
355 | "out_of_sample_performance": {
356 | "total_return": 0.18,
357 | "sharpe_ratio": 1.5,
358 | "win_rate": 0.65,
359 | },
360 | "windows_tested": 4,
361 | "average_window_performance": 0.15,
362 | }
363 | mock_optimizer.walk_forward_analysis.return_value = (
364 | mock_walk_forward_result
365 | )
366 |
367 | result = await walk_forward_func(
368 | mock_context,
369 | symbol="AAPL",
370 | strategy="sma_cross",
371 | window_size=252,
372 | step_size=63,
373 | )
374 |
375 | assert isinstance(result, dict), (
376 | "walk_forward_analysis should return dict"
377 | )
378 | logger.info("✓ walk_forward_analysis tool executed successfully")
379 |
380 | return {"optimization_tests": "completed"}
381 |
382 | async def test_ml_tools_integration(self, setup_tools, mock_context):
383 | """Test ML-enhanced tools integration."""
384 | mcp = setup_tools
385 |
386 | # Test ML strategy tools
387 | ml_tools = [
388 | "run_ml_strategy_backtest",
389 | "train_ml_predictor",
390 | "analyze_market_regimes",
391 | "create_strategy_ensemble",
392 | ]
393 |
394 | ml_results = {}
395 |
396 | for tool_name in ml_tools:
397 | if tool_name in mcp.tool_functions:
398 | try:
399 | tool_func = mcp.tool_functions[tool_name]
400 |
401 | # Mock ML dependencies
402 | with patch(
403 | "maverick_mcp.backtesting.VectorBTEngine"
404 | ) as mock_engine:
405 | mock_instance = Mock()
406 | mock_engine.return_value = mock_instance
407 |
408 | # Mock historical data
409 | import numpy as np
410 | import pandas as pd
411 |
412 | dates = pd.date_range(
413 | start="2022-01-01", end="2023-12-31", freq="D"
414 | )
415 | mock_data = pd.DataFrame(
416 | {
417 | "open": np.random.uniform(100, 200, len(dates)),
418 | "high": np.random.uniform(100, 200, len(dates)),
419 | "low": np.random.uniform(100, 200, len(dates)),
420 | "close": np.random.uniform(100, 200, len(dates)),
421 | "volume": np.random.randint(
422 | 1000000, 10000000, len(dates)
423 | ),
424 | },
425 | index=dates,
426 | )
427 | mock_instance.get_historical_data.return_value = mock_data
428 |
429 | # Test specific ML tools
430 | if tool_name == "run_ml_strategy_backtest":
431 | result = await tool_func(
432 | mock_context,
433 | symbol="AAPL",
434 | strategy_type="ml_predictor",
435 | model_type="random_forest",
436 | )
437 | elif tool_name == "train_ml_predictor":
438 | result = await tool_func(
439 | mock_context,
440 | symbol="AAPL",
441 | model_type="random_forest",
442 | n_estimators=100,
443 | )
444 | elif tool_name == "analyze_market_regimes":
445 | result = await tool_func(
446 | mock_context,
447 | symbol="AAPL",
448 | method="hmm",
449 | n_regimes=3,
450 | )
451 | elif tool_name == "create_strategy_ensemble":
452 | result = await tool_func(
453 | mock_context,
454 | symbols=["AAPL", "GOOGL"],
455 | base_strategies=["sma_cross", "rsi"],
456 | )
457 |
458 | ml_results[tool_name] = {
459 | "success": True,
460 | "type": type(result).__name__,
461 | }
462 | logger.info(f"✓ {tool_name} executed successfully")
463 |
464 | except Exception as e:
465 | ml_results[tool_name] = {"success": False, "error": str(e)}
466 | logger.error(f"✗ {tool_name} failed: {e}")
467 | else:
468 | ml_results[tool_name] = {"success": False, "error": "Tool not found"}
469 |
470 | return ml_results
471 |
472 | async def test_visualization_tools_integration(self, setup_tools, mock_context):
473 | """Test visualization tools integration."""
474 | mcp = setup_tools
475 |
476 | visualization_tools = [
477 | "generate_backtest_charts",
478 | "generate_optimization_charts",
479 | ]
480 |
481 | viz_results = {}
482 |
483 | for tool_name in visualization_tools:
484 | if tool_name in mcp.tool_functions:
485 | try:
486 | tool_func = mcp.tool_functions[tool_name]
487 |
488 | # Mock VectorBT engine and visualization dependencies
489 | with patch(
490 | "maverick_mcp.backtesting.VectorBTEngine"
491 | ) as mock_engine:
492 | mock_instance = Mock()
493 | mock_engine.return_value = mock_instance
494 |
495 | # Mock backtest result for charts
496 | mock_result = {
497 | "symbol": "AAPL",
498 | "equity_curve": [10000, 10100, 10200, 10300, 10250],
499 | "drawdown_series": [0, -0.01, -0.02, 0, -0.005],
500 | "trades": [
501 | {
502 | "entry_time": "2023-01-01",
503 | "exit_time": "2023-02-01",
504 | "pnl": 100,
505 | },
506 | {
507 | "entry_time": "2023-03-01",
508 | "exit_time": "2023-04-01",
509 | "pnl": -50,
510 | },
511 | ],
512 | "metrics": {
513 | "total_return": 0.15,
514 | "sharpe_ratio": 1.2,
515 | "max_drawdown": -0.08,
516 | "total_trades": 10,
517 | },
518 | }
519 | mock_instance.run_backtest.return_value = mock_result
520 |
521 | # Mock visualization functions
522 | with patch(
523 | "maverick_mcp.backtesting.visualization.generate_equity_curve"
524 | ) as mock_equity:
525 | with patch(
526 | "maverick_mcp.backtesting.visualization.generate_performance_dashboard"
527 | ) as mock_dashboard:
528 | with patch(
529 | "maverick_mcp.backtesting.visualization.generate_trade_scatter"
530 | ) as mock_scatter:
531 | with patch(
532 | "maverick_mcp.backtesting.visualization.generate_optimization_heatmap"
533 | ) as mock_heatmap:
534 | # Mock chart returns (base64 strings)
535 | mock_chart_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
536 | mock_equity.return_value = mock_chart_data
537 | mock_dashboard.return_value = mock_chart_data
538 | mock_scatter.return_value = mock_chart_data
539 | mock_heatmap.return_value = mock_chart_data
540 |
541 | # Execute visualization tool
542 | result = await tool_func(
543 | mock_context,
544 | symbol="AAPL",
545 | strategy="sma_cross",
546 | theme="light",
547 | )
548 |
549 | assert isinstance(result, dict), (
550 | f"{tool_name} should return dict"
551 | )
552 |
553 | # Validate chart data
554 | for chart_name, chart_data in result.items():
555 | assert isinstance(chart_data, str), (
556 | f"Chart {chart_name} should be string"
557 | )
558 | assert len(chart_data) > 0, (
559 | f"Chart {chart_name} should have data"
560 | )
561 |
562 | viz_results[tool_name] = {
563 | "success": True,
564 | "charts_generated": list(result.keys()),
565 | "chart_count": len(result),
566 | }
567 | logger.info(
568 | f"✓ {tool_name} generated {len(result)} charts successfully"
569 | )
570 |
571 | except Exception as e:
572 | viz_results[tool_name] = {"success": False, "error": str(e)}
573 | logger.error(f"✗ {tool_name} failed: {e}")
574 | else:
575 | viz_results[tool_name] = {"success": False, "error": "Tool not found"}
576 |
577 | return viz_results
578 |
579 | async def test_claude_desktop_simulation(self, setup_tools, mock_context):
580 | """Simulate realistic Claude Desktop usage patterns."""
581 | mcp = setup_tools
582 |
583 | # Simulate a typical Claude Desktop session
584 | session_commands = [
585 | {
586 | "command": "List available strategies",
587 | "tool": "list_strategies",
588 | "params": {},
589 | },
590 | {
591 | "command": "Run backtest for AAPL with SMA crossover",
592 | "tool": "run_backtest",
593 | "params": {
594 | "symbol": "AAPL",
595 | "strategy": "sma_cross",
596 | "fast_period": "10",
597 | "slow_period": "20",
598 | },
599 | },
600 | {
601 | "command": "Compare multiple strategies",
602 | "tool": "compare_strategies",
603 | "params": {
604 | "symbol": "AAPL",
605 | "strategies": ["sma_cross", "rsi", "macd"],
606 | },
607 | },
608 | {
609 | "command": "Generate charts for backtest",
610 | "tool": "generate_backtest_charts",
611 | "params": {
612 | "symbol": "AAPL",
613 | "strategy": "sma_cross",
614 | },
615 | },
616 | ]
617 |
618 | session_results = []
619 |
620 | # Mock all necessary dependencies for simulation
621 | with patch("maverick_mcp.backtesting.VectorBTEngine") as mock_engine_class:
622 | mock_engine = Mock()
623 | mock_engine_class.return_value = mock_engine
624 |
625 | # Mock results for different tools
626 | mock_backtest_result = {
627 | "symbol": "AAPL",
628 | "strategy_type": "sma_cross",
629 | "metrics": {"total_return": 0.15, "sharpe_ratio": 1.2},
630 | "trades": [],
631 | "equity_curve": [10000, 10150],
632 | "drawdown_series": [0, -0.02],
633 | }
634 | mock_engine.run_backtest.return_value = mock_backtest_result
635 |
636 | # Mock comparison results
637 | with patch(
638 | "maverick_mcp.backtesting.BacktestAnalyzer"
639 | ) as mock_analyzer_class:
640 | mock_analyzer = Mock()
641 | mock_analyzer_class.return_value = mock_analyzer
642 |
643 | mock_comparison = {
644 | "strategy_rankings": [
645 | {"strategy": "sma_cross", "rank": 1, "sharpe_ratio": 1.2},
646 | {"strategy": "rsi", "rank": 2, "sharpe_ratio": 1.1},
647 | {"strategy": "macd", "rank": 3, "sharpe_ratio": 0.9},
648 | ],
649 | "best_strategy": "sma_cross",
650 | }
651 | mock_analyzer.compare_strategies.return_value = mock_comparison
652 |
653 | # Mock visualization
654 | with patch(
655 | "maverick_mcp.backtesting.visualization.generate_equity_curve"
656 | ) as mock_viz:
657 | mock_viz.return_value = "mock_chart_data"
658 |
659 | # Execute session commands
660 | for command_info in session_commands:
661 | try:
662 | start_time = asyncio.get_event_loop().time()
663 |
664 | tool_func = mcp.tool_functions[command_info["tool"]]
665 | result = await tool_func(
666 | mock_context, **command_info["params"]
667 | )
668 |
669 | execution_time = (
670 | asyncio.get_event_loop().time() - start_time
671 | )
672 |
673 | session_results.append(
674 | {
675 | "command": command_info["command"],
676 | "tool": command_info["tool"],
677 | "success": True,
678 | "execution_time": execution_time,
679 | "result_type": type(result).__name__,
680 | }
681 | )
682 |
683 | logger.info(
684 | f"✓ '{command_info['command']}' completed in {execution_time:.3f}s"
685 | )
686 |
687 | except Exception as e:
688 | session_results.append(
689 | {
690 | "command": command_info["command"],
691 | "tool": command_info["tool"],
692 | "success": False,
693 | "error": str(e),
694 | }
695 | )
696 | logger.error(f"✗ '{command_info['command']}' failed: {e}")
697 |
698 | # Analyze session results
699 | total_commands = len(session_commands)
700 | successful_commands = sum(1 for r in session_results if r.get("success", False))
701 | success_rate = successful_commands / total_commands
702 | avg_execution_time = np.mean(
703 | [r.get("execution_time", 0) for r in session_results if r.get("success")]
704 | )
705 |
706 | assert success_rate >= 0.75, f"Session success rate too low: {success_rate:.1%}"
707 | assert avg_execution_time < 5.0, (
708 | f"Average execution time too high: {avg_execution_time:.3f}s"
709 | )
710 |
711 | logger.info(
712 | f"Claude Desktop Simulation Results:\n"
713 | f" • Commands Executed: {total_commands}\n"
714 | f" • Successful: {successful_commands}\n"
715 | f" • Success Rate: {success_rate:.1%}\n"
716 | f" • Avg Execution Time: {avg_execution_time:.3f}s"
717 | )
718 |
719 | return {
720 | "session_results": session_results,
721 | "success_rate": success_rate,
722 | "avg_execution_time": avg_execution_time,
723 | }
724 |
725 | async def test_tool_parameter_validation_comprehensive(
726 | self, setup_tools, mock_context
727 | ):
728 | """Test comprehensive parameter validation across all tools."""
729 | mcp = setup_tools
730 |
731 | validation_tests = []
732 |
733 | # Test parameter validation for key tools
734 | test_cases = [
735 | {
736 | "tool": "run_backtest",
737 | "valid_params": {"symbol": "AAPL", "strategy": "sma_cross"},
738 | "invalid_params": [
739 | {"symbol": "", "strategy": "sma_cross"}, # Empty symbol
740 | {"symbol": "AAPL", "strategy": ""}, # Empty strategy
741 | {
742 | "symbol": "AAPL",
743 | "strategy": "sma_cross",
744 | "fast_period": "not_a_number",
745 | }, # Invalid number
746 | ],
747 | },
748 | {
749 | "tool": "optimize_strategy",
750 | "valid_params": {"symbol": "AAPL", "strategy": "sma_cross"},
751 | "invalid_params": [
752 | {
753 | "symbol": "AAPL",
754 | "strategy": "invalid_strategy",
755 | }, # Invalid strategy
756 | {
757 | "symbol": "AAPL",
758 | "strategy": "sma_cross",
759 | "top_n": -1,
760 | }, # Negative top_n
761 | ],
762 | },
763 | ]
764 |
765 | for test_case in test_cases:
766 | tool_name = test_case["tool"]
767 | if tool_name in mcp.tool_functions:
768 | tool_func = mcp.tool_functions[tool_name]
769 |
770 | # Test valid parameters
771 | try:
772 | with patch("maverick_mcp.backtesting.VectorBTEngine"):
773 | await tool_func(mock_context, **test_case["valid_params"])
774 | validation_tests.append(
775 | {
776 | "tool": tool_name,
777 | "test": "valid_params",
778 | "success": True,
779 | }
780 | )
781 | except Exception as e:
782 | validation_tests.append(
783 | {
784 | "tool": tool_name,
785 | "test": "valid_params",
786 | "success": False,
787 | "error": str(e),
788 | }
789 | )
790 |
791 | # Test invalid parameters
792 | for invalid_params in test_case["invalid_params"]:
793 | try:
794 | with patch("maverick_mcp.backtesting.VectorBTEngine"):
795 | await tool_func(mock_context, **invalid_params)
796 | # If we got here, validation didn't catch the error
797 | validation_tests.append(
798 | {
799 | "tool": tool_name,
800 | "test": f"invalid_params_{invalid_params}",
801 | "success": False,
802 | "error": "Validation should have failed but didn't",
803 | }
804 | )
805 | except Exception as e:
806 | # Expected to fail
807 | validation_tests.append(
808 | {
809 | "tool": tool_name,
810 | "test": f"invalid_params_{invalid_params}",
811 | "success": True,
812 | "expected_error": str(e),
813 | }
814 | )
815 |
816 | # Calculate validation success rate
817 | total_validation_tests = len(validation_tests)
818 | successful_validations = sum(
819 | 1 for t in validation_tests if t.get("success", False)
820 | )
821 | validation_success_rate = (
822 | successful_validations / total_validation_tests
823 | if total_validation_tests > 0
824 | else 0
825 | )
826 |
827 | logger.info(
828 | f"Parameter Validation Results:\n"
829 | f" • Total Validation Tests: {total_validation_tests}\n"
830 | f" • Successful Validations: {successful_validations}\n"
831 | f" • Validation Success Rate: {validation_success_rate:.1%}"
832 | )
833 |
834 | return {
835 | "validation_tests": validation_tests,
836 | "validation_success_rate": validation_success_rate,
837 | }
838 |
839 |
840 | if __name__ == "__main__":
841 | # Run MCP tools integration tests
842 | pytest.main(
843 | [
844 | __file__,
845 | "-v",
846 | "--tb=short",
847 | "--asyncio-mode=auto",
848 | "--timeout=300", # 5 minute timeout
849 | "--durations=10",
850 | ]
851 | )
852 |
```
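For reference, a minimal standalone sketch of the patch-and-mock pattern the suite above relies on: `maverick_mcp.backtesting.VectorBTEngine` is patched so a registered tool can be exercised without touching real market data. The fixture names (`setup_tools`, `mock_context`) and the `run_backtest` tool mirror the usage in the file above; the exact shape of the returned dict depends on the tool implementation elsewhere in the repo, so only the coarse assertion is made here.

```python
import pytest
from unittest.mock import Mock, patch


@pytest.mark.asyncio
async def test_run_backtest_minimal(setup_tools, mock_context):
    """Smoke-test a single registered MCP tool with a mocked backtest engine."""
    mcp = setup_tools
    tool_func = mcp.tool_functions["run_backtest"]

    # Patch the engine so no data provider or VectorBT computation is required.
    with patch("maverick_mcp.backtesting.VectorBTEngine") as mock_engine_class:
        mock_engine = Mock()
        mock_engine_class.return_value = mock_engine
        mock_engine.run_backtest.return_value = {
            "symbol": "AAPL",
            "strategy_type": "sma_cross",
            "metrics": {"total_return": 0.10, "sharpe_ratio": 1.0},
        }

        result = await tool_func(mock_context, symbol="AAPL", strategy="sma_cross")

    assert isinstance(result, dict)
```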
--------------------------------------------------------------------------------
/maverick_mcp/agents/supervisor.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | SupervisorAgent implementation using 2025 LangGraph patterns.
3 |
4 | Orchestrates multiple specialized agents with intelligent routing, result synthesis,
5 | and conflict resolution for comprehensive financial analysis.
6 | """
7 |
8 | import logging
9 | from datetime import datetime
10 | from typing import Any
11 |
12 | from langchain_core.language_models import BaseChatModel
13 | from langchain_core.messages import HumanMessage, SystemMessage
14 | from langchain_core.tools import BaseTool
15 | from langgraph.checkpoint.memory import MemorySaver
16 | from langgraph.graph import END, START, StateGraph
17 | from langgraph.types import Command
18 |
19 | from maverick_mcp.agents.base import INVESTOR_PERSONAS, PersonaAwareAgent
20 | from maverick_mcp.config.settings import get_settings
21 | from maverick_mcp.exceptions import AgentInitializationError
22 | from maverick_mcp.memory.stores import ConversationStore
23 | from maverick_mcp.workflows.state import SupervisorState
24 |
25 | logger = logging.getLogger(__name__)
26 | settings = get_settings()
27 |
28 | # Query routing matrix for intelligent agent selection
29 | ROUTING_MATRIX = {
30 | "market_screening": {
31 | "agents": ["market"],
32 | "primary": "market",
33 | "parallel": False,
34 | "confidence_threshold": 0.7,
35 | "synthesis_required": False,
36 | },
37 | "technical_analysis": {
38 | "agents": ["technical"],
39 | "primary": "technical",
40 | "parallel": False,
41 | "confidence_threshold": 0.8,
42 | "synthesis_required": False,
43 | },
44 | "stock_investment_decision": {
45 | "agents": ["market", "technical"],
46 | "primary": "technical",
47 | "parallel": True,
48 | "confidence_threshold": 0.85,
49 | "synthesis_required": True,
50 | },
51 | "portfolio_analysis": {
52 | "agents": ["market", "technical"],
53 | "primary": "market",
54 | "parallel": True,
55 | "confidence_threshold": 0.75,
56 | "synthesis_required": True,
57 | },
58 | "deep_research": {
59 | "agents": ["research"], # Research agent handles comprehensive analysis
60 | "primary": "research",
61 | "parallel": False,
62 | "confidence_threshold": 0.9,
63 | "synthesis_required": False, # Research agent provides complete analysis
64 | },
65 | "company_research": {
66 | "agents": ["research"], # Dedicated company research
67 | "primary": "research",
68 | "parallel": False,
69 | "confidence_threshold": 0.85,
70 | "synthesis_required": False,
71 | },
72 | "sentiment_analysis": {
73 | "agents": ["research"], # Market sentiment analysis
74 | "primary": "research",
75 | "parallel": False,
76 | "confidence_threshold": 0.8,
77 | "synthesis_required": False,
78 | },
79 | "risk_assessment": {
80 | "agents": ["market", "technical"], # Future risk agent integration
81 | "primary": "market",
82 | "parallel": True,
83 | "confidence_threshold": 0.8,
84 | "synthesis_required": True,
85 | },
86 | }
87 |
88 |
89 | class QueryClassifier:
90 | """LLM-powered query classification with rule-based fallback."""
91 |
92 | def __init__(self, llm: BaseChatModel):
93 | self.llm = llm
94 |
95 | async def classify_query(self, query: str, persona: str) -> dict[str, Any]:
96 | """Classify query using LLM with structured output."""
97 |
98 | classification_prompt = f"""
99 | Analyze this financial query and classify it for multi-agent routing.
100 |
101 | Query: "{query}"
102 | Investor Persona: {persona}
103 |
104 | Classify into one of these categories:
105 | 1. market_screening - Finding stocks, sector analysis, market breadth
106 | 2. technical_analysis - Chart patterns, indicators, entry/exit points
107 | 3. stock_investment_decision - Complete analysis of specific stock(s)
108 | 4. portfolio_analysis - Portfolio optimization, risk assessment
109 | 5. deep_research - Fundamental analysis, company research, news analysis
110 | 6. risk_assessment - Position sizing, risk management, portfolio risk
111 |
112 | Consider the complexity and return classification with confidence.
113 |
114 | Return ONLY valid JSON in this exact format:
115 | {{
116 | "category": "category_name",
117 | "confidence": 0.85,
118 | "required_agents": ["agent1", "agent2"],
119 | "complexity": "simple",
120 | "estimated_execution_time_ms": 30000,
121 | "parallel_capable": true,
122 | "reasoning": "Brief explanation of classification"
123 | }}
124 | """
125 |
126 | try:
127 | response = await self.llm.ainvoke(
128 | [
129 | SystemMessage(
130 | content="You are a financial query classifier. Return only valid JSON."
131 | ),
132 | HumanMessage(content=classification_prompt),
133 | ]
134 | )
135 |
136 | # Parse LLM response
137 | import json
138 |
139 | classification = json.loads(response.content.strip())
140 |
141 | # Validate and enhance with routing matrix
142 | category = classification.get("category", "stock_investment_decision")
143 | routing_config = ROUTING_MATRIX.get(
144 | category, ROUTING_MATRIX["stock_investment_decision"]
145 | )
146 |
147 | return {
148 | **classification,
149 | "routing_config": routing_config,
150 | "timestamp": datetime.now(),
151 | }
152 |
153 | except Exception as e:
154 | logger.warning(f"LLM classification failed: {e}, using rule-based fallback")
155 | return self._rule_based_fallback(query, persona)
156 |
157 | def _rule_based_fallback(self, query: str, persona: str) -> dict[str, Any]:
158 | """Rule-based classification fallback."""
159 | query_lower = query.lower()
160 |
161 | # Simple keyword-based classification
162 | if any(
163 | word in query_lower for word in ["screen", "find stocks", "scan", "search"]
164 | ):
165 | category = "market_screening"
166 | elif any(
167 | word in query_lower
168 | for word in ["chart", "technical", "rsi", "macd", "pattern"]
169 | ):
170 | category = "technical_analysis"
171 | elif any(
172 | word in query_lower for word in ["portfolio", "allocation", "diversif"]
173 | ):
174 | category = "portfolio_analysis"
175 | elif any(
176 | word in query_lower
177 | for word in ["research", "fundamental", "news", "earnings"]
178 | ):
179 | category = "deep_research"
180 | elif any(
181 | word in query_lower
182 | for word in ["company", "business", "competitive", "industry"]
183 | ):
184 | category = "company_research"
185 | elif any(
186 | word in query_lower for word in ["sentiment", "opinion", "mood", "feeling"]
187 | ):
188 | category = "sentiment_analysis"
189 | elif any(
190 | word in query_lower for word in ["risk", "position size", "stop loss"]
191 | ):
192 | category = "risk_assessment"
193 | else:
194 | category = "stock_investment_decision"
195 |
196 | routing_config = ROUTING_MATRIX[category]
197 |
198 | return {
199 | "category": category,
200 | "confidence": 0.6,
201 | "required_agents": routing_config["agents"],
202 | "complexity": "moderate",
203 | "estimated_execution_time_ms": 60000,
204 | "parallel_capable": routing_config["parallel"],
205 | "reasoning": "Rule-based classification fallback",
206 | "routing_config": routing_config,
207 | "timestamp": datetime.now(),
208 | }
209 |
210 |
211 | class ResultSynthesizer:
212 | """Synthesize results from multiple agents with conflict resolution."""
213 |
214 | def __init__(self, llm: BaseChatModel, persona):
215 | self.llm = llm
216 | self.persona = persona
217 |
218 | async def synthesize_results(
219 | self,
220 | agent_results: dict[str, Any],
221 | query_type: str,
222 | conflicts: list[dict[str, Any]],
223 | ) -> dict[str, Any]:
224 | """Synthesize final recommendation from agent results."""
225 |
226 | # Calculate agent weights based on query type and persona
227 | weights = self._calculate_agent_weights(query_type, agent_results)
228 |
229 | # Create synthesis prompt
230 | synthesis_prompt = self._build_synthesis_prompt(
231 | agent_results, weights, query_type, conflicts
232 | )
233 |
234 | # Use LLM to synthesize coherent response
235 | synthesis_response = await self.llm.ainvoke(
236 | [
237 | SystemMessage(content="You are a financial analysis synthesizer."),
238 | HumanMessage(content=synthesis_prompt),
239 | ]
240 | )
241 |
242 | return {
243 | "synthesis": synthesis_response.content,
244 | "weights_applied": weights,
245 | "conflicts_resolved": len(conflicts),
246 | "confidence_score": self._calculate_overall_confidence(
247 | agent_results, weights
248 | ),
249 | "contributing_agents": list(agent_results.keys()),
250 | "persona_alignment": self._assess_persona_alignment(
251 | synthesis_response.content
252 | ),
253 | }
254 |
255 | def _calculate_agent_weights(
256 | self, query_type: str, agent_results: dict
257 | ) -> dict[str, float]:
258 | """Calculate weights for agent results based on context."""
259 | base_weights = {
260 | "market_screening": {"market": 0.9, "technical": 0.1},
261 | "technical_analysis": {"market": 0.2, "technical": 0.8},
262 | "stock_investment_decision": {"market": 0.4, "technical": 0.6},
263 | "portfolio_analysis": {"market": 0.6, "technical": 0.4},
264 | "deep_research": {"research": 1.0},
265 | "company_research": {"research": 1.0},
266 | "sentiment_analysis": {"research": 1.0},
267 | "risk_assessment": {"market": 0.3, "technical": 0.3, "risk": 0.4},
268 | }
269 |
270 | weights = base_weights.get(query_type, {"market": 0.5, "technical": 0.5})
271 |
272 | # Adjust weights based on agent confidence scores
273 | for agent, base_weight in weights.items():
274 | if agent in agent_results:
275 | confidence = agent_results[agent].get("confidence_score", 0.5)
276 | weights[agent] = base_weight * (0.5 + confidence * 0.5)
277 |
278 | # Normalize weights to sum to 1.0
279 | total_weight = sum(weights.values())
280 | if total_weight > 0:
281 | weights = {k: v / total_weight for k, v in weights.items()}
282 |
283 | return weights
284 |
285 | def _build_synthesis_prompt(
286 | self,
287 | agent_results: dict[str, Any],
288 | weights: dict[str, float],
289 | query_type: str,
290 | conflicts: list[dict[str, Any]],
291 | ) -> str:
292 | """Build synthesis prompt for LLM."""
293 |
294 | prompt = f"""
295 | Synthesize a comprehensive financial analysis response from multiple specialized agents.
296 |
297 | Query Type: {query_type}
298 | Investor Persona: {self.persona.name} - {", ".join(self.persona.characteristics)}
299 |
300 | Agent Results:
301 | """
302 |
303 | for agent, result in agent_results.items():
304 | weight = weights.get(agent, 0.0)
305 | prompt += f"\n{agent.upper()} Agent (Weight: {weight:.2f}):\n"
306 | prompt += f" - Confidence: {result.get('confidence_score', 0.5)}\n"
307 | prompt += (
308 | f" - Analysis: {result.get('analysis', 'No analysis provided')}\n"
309 | )
310 | if "recommendations" in result:
311 | prompt += f" - Recommendations: {result['recommendations']}\n"
312 |
313 | if conflicts:
314 | prompt += f"\nConflicts Detected ({len(conflicts)}):\n"
315 | for i, conflict in enumerate(conflicts, 1):
316 | prompt += f"{i}. {conflict}\n"
317 |
318 | prompt += f"""
319 |
320 | Please synthesize these results into a coherent, actionable response that:
321 | 1. Weighs agent inputs according to their weights and confidence scores
322 | 2. Resolves any conflicts using the {self.persona.name} investor perspective
323 | 3. Provides clear, actionable recommendations aligned with {self.persona.name} characteristics
324 | 4. Includes appropriate risk disclaimers
325 | 5. Maintains professional, confident tone
326 |
327 | Focus on actionable insights for the {self.persona.name} investor profile.
328 | """
329 |
330 | return prompt
331 |
332 | def _calculate_overall_confidence(
333 | self, agent_results: dict, weights: dict[str, float]
334 | ) -> float:
335 | """Calculate weighted overall confidence score."""
336 | total_confidence = 0.0
337 | total_weight = 0.0
338 |
339 | for agent, weight in weights.items():
340 | if agent in agent_results:
341 | confidence = agent_results[agent].get("confidence_score", 0.5)
342 | total_confidence += confidence * weight
343 | total_weight += weight
344 |
345 | return total_confidence / total_weight if total_weight > 0 else 0.5
346 |
347 | def _assess_persona_alignment(self, synthesis_content: str) -> float:
348 | """Assess how well synthesis aligns with investor persona."""
349 | # Simple keyword-based alignment scoring
350 | persona_keywords = {
351 | "conservative": ["stable", "dividend", "low-risk", "preservation"],
352 | "moderate": ["balanced", "diversified", "moderate", "growth"],
353 | "aggressive": ["growth", "momentum", "high-return", "opportunity"],
354 | }
355 |
356 | keywords = persona_keywords.get(self.persona.name.lower(), [])
357 | content_lower = synthesis_content.lower()
358 |
359 | alignment_score = sum(1 for keyword in keywords if keyword in content_lower)
360 | return min(alignment_score / len(keywords) if keywords else 0.5, 1.0)
361 |
362 |
363 | class SupervisorAgent(PersonaAwareAgent):
364 | """
365 | Multi-agent supervisor using 2025 LangGraph patterns.
366 |
367 | Orchestrates MarketAnalysisAgent, TechnicalAnalysisAgent, and future DeepResearchAgent
368 | with intelligent routing, result synthesis, and conflict resolution.
369 | """
370 |
371 | def __init__(
372 | self,
373 | llm: BaseChatModel,
374 | agents: dict[str, PersonaAwareAgent],
375 | persona: str = "moderate",
376 | checkpointer: MemorySaver | None = None,
377 | ttl_hours: int = 1,
378 | routing_strategy: str = "llm_powered",
379 | synthesis_mode: str = "weighted",
380 | conflict_resolution: str = "confidence_based",
381 | max_iterations: int = 5,
382 | ):
383 | """Initialize supervisor with existing agent instances."""
384 |
385 | if not agents:
386 | raise AgentInitializationError(
387 | agent_type="SupervisorAgent",
388 | reason="No agents provided for supervision",
389 | )
390 |
391 | # Store agent references
392 | self.agents = agents
393 | self.market_agent = agents.get("market")
394 | self.technical_agent = agents.get("technical")
395 | self.research_agent = agents.get("research") # DeepResearchAgent integration
396 |
397 | # Configuration
398 | self.routing_strategy = routing_strategy
399 | self.synthesis_mode = synthesis_mode
400 | self.conflict_resolution = conflict_resolution
401 | self.max_iterations = max_iterations
402 |
403 | # Ensure all agents use the same persona
404 | persona_obj = INVESTOR_PERSONAS.get(persona, INVESTOR_PERSONAS["moderate"])
405 | for agent in agents.values():
406 | if hasattr(agent, "persona"):
407 | agent.persona = persona_obj
408 |
409 | # Get supervisor-specific tools
410 | supervisor_tools = self._get_supervisor_tools()
411 |
412 | # Initialize base class
413 | super().__init__(
414 | llm=llm,
415 | tools=supervisor_tools,
416 | persona=persona,
417 | checkpointer=checkpointer or MemorySaver(),
418 | ttl_hours=ttl_hours,
419 | )
420 |
421 | # Initialize components
422 | self.conversation_store = ConversationStore(ttl_hours=ttl_hours)
423 | self.query_classifier = QueryClassifier(llm)
424 | self.result_synthesizer = ResultSynthesizer(llm, self.persona)
425 |
426 | logger.info(
427 | f"SupervisorAgent initialized with {len(agents)} agents: {list(agents.keys())}"
428 | )
429 |
430 | def get_state_schema(self) -> type:
431 | """Return SupervisorState schema."""
432 | return SupervisorState
433 |
434 | def _get_supervisor_tools(self) -> list[BaseTool]:
435 | """Get tools specific to supervision and coordination."""
436 | from langchain_core.tools import tool
437 |
438 | tools = []
439 |
440 | if self.market_agent:
441 |
442 | @tool
443 | async def query_market_agent(
444 | query: str,
445 | session_id: str,
446 | screening_strategy: str = "momentum",
447 | max_results: int = 20,
448 | ) -> dict[str, Any]:
449 | """Query the market analysis agent for stock screening and market analysis."""
450 | try:
451 | return await self.market_agent.analyze_market(
452 | query=query,
453 | session_id=session_id,
454 | screening_strategy=screening_strategy,
455 | max_results=max_results,
456 | )
457 | except Exception as e:
458 | return {"error": f"Market agent error: {str(e)}"}
459 |
460 | tools.append(query_market_agent)
461 |
462 | if self.technical_agent:
463 |
464 | @tool
465 | async def query_technical_agent(
466 | symbol: str, timeframe: str = "1d", indicators: list[str] | None = None
467 | ) -> dict[str, Any]:
468 | """Query the technical analysis agent for chart analysis and indicators."""
469 | try:
470 | if indicators is None:
471 | indicators = ["sma_20", "rsi", "macd"]
472 |
473 | return await self.technical_agent.analyze_stock(
474 | symbol=symbol, timeframe=timeframe, indicators=indicators
475 | )
476 | except Exception as e:
477 | return {"error": f"Technical agent error: {str(e)}"}
478 |
479 | tools.append(query_technical_agent)
480 |
481 | if self.research_agent:
482 |
483 | @tool
484 | async def query_research_agent(
485 | query: str,
486 | session_id: str,
487 | research_scope: str = "comprehensive",
488 | max_sources: int = 50,
489 | timeframe: str = "1m",
490 | ) -> dict[str, Any]:
491 | """Query the deep research agent for comprehensive research and analysis."""
492 | try:
493 | return await self.research_agent.research_topic(
494 | query=query,
495 | session_id=session_id,
496 | research_scope=research_scope,
497 | max_sources=max_sources,
498 | timeframe=timeframe,
499 | )
500 | except Exception as e:
501 | return {"error": f"Research agent error: {str(e)}"}
502 |
503 | @tool
504 | async def analyze_company_research(
505 | symbol: str, session_id: str, include_competitive: bool = True
506 | ) -> dict[str, Any]:
507 | """Perform comprehensive company research and fundamental analysis."""
508 | try:
509 | return await self.research_agent.research_company_comprehensive(
510 | symbol=symbol,
511 | session_id=session_id,
512 | include_competitive_analysis=include_competitive,
513 | )
514 | except Exception as e:
515 | return {"error": f"Company research error: {str(e)}"}
516 |
517 | @tool
518 | async def analyze_market_sentiment_research(
519 | topic: str, session_id: str, timeframe: str = "1w"
520 | ) -> dict[str, Any]:
521 | """Analyze market sentiment using deep research capabilities."""
522 | try:
523 | return await self.research_agent.analyze_market_sentiment(
524 | topic=topic, session_id=session_id, timeframe=timeframe
525 | )
526 | except Exception as e:
527 | return {"error": f"Sentiment analysis error: {str(e)}"}
528 |
529 | tools.extend(
530 | [
531 | query_research_agent,
532 | analyze_company_research,
533 | analyze_market_sentiment_research,
534 | ]
535 | )
536 |
537 | return tools
538 |
539 | def _build_graph(self):
540 | """Build supervisor graph with multi-agent coordination."""
541 | workflow = StateGraph(SupervisorState)
542 |
543 | # Core supervisor nodes
544 | workflow.add_node("analyze_query", self._analyze_query)
545 | workflow.add_node("create_execution_plan", self._create_execution_plan)
546 | workflow.add_node("route_to_agents", self._route_to_agents)
547 | workflow.add_node("aggregate_results", self._aggregate_results)
548 | workflow.add_node("resolve_conflicts", self._resolve_conflicts)
549 | workflow.add_node("synthesize_response", self._synthesize_response)
550 |
551 | # Agent invocation nodes
552 | if self.market_agent:
553 | workflow.add_node("invoke_market_agent", self._invoke_market_agent)
554 | if self.technical_agent:
555 | workflow.add_node("invoke_technical_agent", self._invoke_technical_agent)
556 | if self.research_agent:
557 | workflow.add_node("invoke_research_agent", self._invoke_research_agent)
558 |
559 | # Coordination nodes
560 | workflow.add_node("parallel_coordinator", self._parallel_coordinator)
561 |
562 | # Tool node
563 | if self.tools:
564 | from langgraph.prebuilt import ToolNode
565 |
566 | tool_node = ToolNode(self.tools)
567 | workflow.add_node("tools", tool_node)
568 |
569 | # Define workflow edges
570 | workflow.add_edge(START, "analyze_query")
571 | workflow.add_edge("analyze_query", "create_execution_plan")
572 | workflow.add_edge("create_execution_plan", "route_to_agents")
573 |
574 | # Conditional routing based on execution plan
575 | workflow.add_conditional_edges(
576 | "route_to_agents",
577 | self._route_decision,
578 | {
579 | "market_only": "invoke_market_agent"
580 | if self.market_agent
581 | else "synthesize_response",
582 | "technical_only": "invoke_technical_agent"
583 | if self.technical_agent
584 | else "synthesize_response",
585 | "research_only": "invoke_research_agent"
586 | if self.research_agent
587 | else "synthesize_response",
588 | "parallel_execution": "parallel_coordinator",
589 | "use_tools": "tools" if self.tools else "synthesize_response",
590 | "synthesize": "synthesize_response",
591 | },
592 | )
593 |
594 | # Agent result collection
595 | if self.market_agent:
596 | workflow.add_edge("invoke_market_agent", "aggregate_results")
597 | if self.technical_agent:
598 | workflow.add_edge("invoke_technical_agent", "aggregate_results")
599 | if self.research_agent:
600 | workflow.add_edge("invoke_research_agent", "aggregate_results")
601 |
602 | workflow.add_edge("parallel_coordinator", "aggregate_results")
603 |
604 | if self.tools:
605 | workflow.add_edge("tools", "aggregate_results")
606 |
607 | # Conflict detection and resolution
608 | workflow.add_conditional_edges(
609 | "aggregate_results",
610 | self._check_conflicts,
611 | {"resolve": "resolve_conflicts", "synthesize": "synthesize_response"},
612 | )
613 |
614 | workflow.add_edge("resolve_conflicts", "synthesize_response")
615 | workflow.add_edge("synthesize_response", END)
616 |
617 | return workflow.compile(checkpointer=self.checkpointer)
618 |
619 | # Workflow node implementations will continue...
620 | # (The rest of the implementation follows the same pattern)
621 |
622 | async def coordinate_agents(
623 | self, query: str, session_id: str, **kwargs
624 | ) -> dict[str, Any]:
625 | """
626 | Main entry point for multi-agent coordination.
627 |
628 | Args:
629 | query: User query requiring multiple agents
630 | session_id: Session identifier
631 | **kwargs: Additional parameters
632 |
633 | Returns:
634 | Coordinated response from multiple agents
635 | """
636 | start_time = datetime.now()
637 |
638 | # Initialize supervisor state
639 | initial_state = {
640 | "messages": [HumanMessage(content=query)],
641 | "persona": self.persona.name,
642 | "session_id": session_id,
643 | "timestamp": datetime.now(),
644 | "query_classification": {},
645 | "execution_plan": [],
646 | "current_subtask_index": 0,
647 | "routing_strategy": self.routing_strategy,
648 | "active_agents": [],
649 | "agent_results": {},
650 | "agent_confidence": {},
651 | "agent_execution_times": {},
652 | "agent_errors": {},
653 | "workflow_status": "planning",
654 | "parallel_execution": False,
655 | "dependency_graph": {},
656 | "max_iterations": self.max_iterations,
657 | "current_iteration": 0,
658 | "conflicts_detected": [],
659 | "conflict_resolution": {},
660 | "synthesis_weights": {},
661 | "final_recommendation_confidence": 0.0,
662 | "synthesis_mode": self.synthesis_mode,
663 | "total_execution_time_ms": 0.0,
664 | "agent_coordination_overhead_ms": 0.0,
665 | "synthesis_time_ms": 0.0,
666 | "cache_utilization": {},
667 | "api_calls_made": 0,
668 | "cache_hits": 0,
669 | "cache_misses": 0,
670 | # Legacy fields initialized as None for backward compatibility
671 | "query_type": None,
672 | "subtasks": None,
673 | "current_subtask": None,
674 | "workflow_plan": None,
675 | "completed_steps": None,
676 | "pending_steps": None,
677 | "final_recommendations": None,
678 | "confidence_scores": None,
679 | "risk_warnings": None,
680 | }
681 |
682 | # Add any additional parameters
683 | initial_state.update(kwargs)
684 |
685 | # Execute supervision workflow
686 | try:
687 | result = await self.graph.ainvoke(
688 | initial_state,
689 | config={
690 | "configurable": {
691 | "thread_id": session_id,
692 | "checkpoint_ns": "supervisor",
693 | }
694 | },
695 | )
696 |
697 | # Calculate total execution time
698 | execution_time = (datetime.now() - start_time).total_seconds() * 1000
699 | result["total_execution_time_ms"] = execution_time
700 |
701 | return self._format_supervisor_response(result)
702 |
703 | except Exception as e:
704 | logger.error(f"Error in supervisor coordination: {e}")
705 | return {
706 | "status": "error",
707 | "error": str(e),
708 | "total_execution_time_ms": (datetime.now() - start_time).total_seconds()
709 | * 1000,
710 | "agent_type": "supervisor",
711 | }
712 |
713 | def _format_supervisor_response(self, result: dict[str, Any]) -> dict[str, Any]:
714 | """Format supervisor response for consistent output."""
715 | return {
716 | "status": "success",
717 | "agent_type": "supervisor",
718 | "persona": result.get("persona"),
719 | "query_classification": result.get("query_classification", {}),
720 | "agents_used": result.get("active_agents", []),
721 | "synthesis": result.get("messages", [])[-1].content
722 | if result.get("messages")
723 | else "No synthesis available",
724 | "confidence_score": result.get("final_recommendation_confidence", 0.0),
725 | "execution_time_ms": result.get("total_execution_time_ms", 0.0),
726 | "conflicts_resolved": len(result.get("conflicts_detected", [])),
727 | "workflow_status": result.get("workflow_status", "completed"),
728 | }
729 |
730 | # Placeholder implementations for workflow nodes
731 | # These will be implemented based on the specific node logic
732 |
733 | async def _analyze_query(self, state: SupervisorState) -> Command:
734 | """Analyze query to determine routing strategy and requirements."""
735 | query = state["messages"][-1].content if state["messages"] else ""
736 |
737 | # Classify the query
738 | classification = await self.query_classifier.classify_query(
739 | query, state["persona"]
740 | )
741 |
742 | return Command(
743 | goto="create_execution_plan",
744 | update={
745 | "query_classification": classification,
746 | "workflow_status": "analyzing",
747 | },
748 | )
749 |
750 | async def _create_execution_plan(self, state: SupervisorState) -> Command:
751 | """Create execution plan based on query classification."""
752 | classification = state["query_classification"]
753 |
754 | # Create execution plan based on classification
755 | execution_plan = [
756 | {
757 | "task_id": "main_analysis",
758 | "agents": classification.get("required_agents", ["market"]),
759 | "parallel": classification.get("parallel_capable", False),
760 | "priority": 1,
761 | }
762 | ]
763 |
764 | return Command(
765 | goto="route_to_agents",
766 | update={"execution_plan": execution_plan, "workflow_status": "planning"},
767 | )
768 |
769 | async def _route_to_agents(self, state: SupervisorState) -> Command:
770 | """Route query to appropriate agents based on execution plan."""
771 | return Command(
772 | goto="parallel_execution", update={"workflow_status": "executing"}
773 | )
774 |
775 | async def _route_decision(self, state: SupervisorState) -> str:
776 | """Decide routing strategy based on state."""
777 | classification = state.get("query_classification", {})
778 | required_agents = classification.get("required_agents", ["market"])
779 | parallel = classification.get("parallel_capable", False)
780 |
781 | if len(required_agents) == 1:
782 | agent = required_agents[0]
783 | if agent == "market" and self.market_agent:
784 | return "market_only"
785 | elif agent == "technical" and self.technical_agent:
786 | return "technical_only"
787 | elif agent == "research" and self.research_agent:
788 | return "research_only"
789 | elif len(required_agents) > 1 and parallel:
790 | return "parallel_execution"
791 |
792 | return "synthesize"
793 |
794 | async def _parallel_coordinator(self, state: SupervisorState) -> Command:
795 | """Coordinate parallel execution of multiple agents."""
796 | # This would implement parallel agent coordination
797 | # For now, return to aggregation
798 | return Command(
799 | goto="aggregate_results", update={"workflow_status": "aggregating"}
800 | )
801 |
802 | async def _invoke_market_agent(self, state: SupervisorState) -> Command:
803 | """Invoke market analysis agent."""
804 | if not self.market_agent:
805 | return Command(
806 | goto="aggregate_results",
807 | update={"agent_errors": {"market": "Market agent not available"}},
808 | )
809 |
810 | try:
811 | query = state["messages"][-1].content if state["messages"] else ""
812 | result = await self.market_agent.analyze_market(
813 | query=query, session_id=state["session_id"]
814 | )
815 |
816 | return Command(
817 | goto="aggregate_results",
818 | update={
819 | "agent_results": {"market": result},
820 | "active_agents": ["market"],
821 | },
822 | )
823 |
824 | except Exception as e:
825 | return Command(
826 | goto="aggregate_results",
827 | update={
828 | "agent_errors": {"market": str(e)},
829 | "active_agents": ["market"],
830 | },
831 | )
832 |
833 | async def _invoke_technical_agent(self, state: SupervisorState) -> Command:
834 | """Invoke technical analysis agent."""
835 | if not self.technical_agent:
836 | return Command(
837 | goto="aggregate_results",
838 | update={"agent_errors": {"technical": "Technical agent not available"}},
839 | )
840 |
841 | # This would implement technical agent invocation
842 | return Command(
843 | goto="aggregate_results", update={"active_agents": ["technical"]}
844 | )
845 |
846 | async def _invoke_research_agent(self, state: SupervisorState) -> Command:
847 | """Invoke deep research agent (future implementation)."""
848 | if not self.research_agent:
849 | return Command(
850 | goto="aggregate_results",
851 | update={"agent_errors": {"research": "Research agent not available"}},
852 | )
853 |
854 | # Future implementation
855 | return Command(goto="aggregate_results", update={"active_agents": ["research"]})
856 |
857 | async def _aggregate_results(self, state: SupervisorState) -> Command:
858 | """Aggregate results from all agents."""
859 | return Command(
860 | goto="synthesize_response", update={"workflow_status": "synthesizing"}
861 | )
862 |
863 | def _check_conflicts(self, state: SupervisorState) -> str:
864 | """Check if there are conflicts between agent results."""
865 | conflicts = state.get("conflicts_detected", [])
866 | return "resolve" if conflicts else "synthesize"
867 |
868 | async def _resolve_conflicts(self, state: SupervisorState) -> Command:
869 | """Resolve conflicts between agent recommendations."""
870 | return Command(
871 | goto="synthesize_response",
872 | update={"conflict_resolution": {"strategy": "confidence_based"}},
873 | )
874 |
875 | async def _synthesize_response(self, state: SupervisorState) -> Command:
876 | """Synthesize final response from agent results."""
877 | agent_results = state.get("agent_results", {})
878 | conflicts = state.get("conflicts_detected", [])
879 | classification = state.get("query_classification", {})
880 |
881 | if agent_results:
882 | synthesis = await self.result_synthesizer.synthesize_results(
883 | agent_results=agent_results,
884 | query_type=classification.get("category", "stock_investment_decision"),
885 | conflicts=conflicts,
886 | )
887 |
888 | return Command(
889 | goto="__end__",
890 | update={
891 | "final_recommendation_confidence": synthesis["confidence_score"],
892 | "synthesis_weights": synthesis["weights_applied"],
893 | "workflow_status": "completed",
894 | "messages": state["messages"]
895 | + [HumanMessage(content=synthesis["synthesis"])],
896 | },
897 | )
898 | else:
899 | return Command(
900 | goto="__end__",
901 | update={
902 | "workflow_status": "completed",
903 | "messages": state["messages"]
904 | + [
905 | HumanMessage(content="No agent results available for synthesis")
906 | ],
907 | },
908 | )
909 |
```
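A minimal usage sketch for the supervisor above, under stated assumptions: `llm` is any LangChain `BaseChatModel`, and `market_agent` / `technical_agent` are already-constructed `PersonaAwareAgent` instances (for example the market and technical analysis agents in `maverick_mcp.agents`). The `agents` dict keys, the `persona` and `routing_strategy` values, and the `synthesis` / `confidence_score` response keys (success path of `_format_supervisor_response`) all come from the file above; nothing else is assumed.

```python
from maverick_mcp.agents.supervisor import SupervisorAgent


async def run_coordinated_analysis(llm, market_agent, technical_agent):
    """Route one query through the supervisor and return the synthesized answer."""
    supervisor = SupervisorAgent(
        llm=llm,
        agents={"market": market_agent, "technical": technical_agent},
        persona="moderate",
        routing_strategy="llm_powered",
    )

    # A query like this typically classifies as "stock_investment_decision", so the
    # routing matrix runs both agents in parallel and weights their results during
    # synthesis.
    response = await supervisor.coordinate_agents(
        query="Should I buy AAPL for a 6-month horizon?",
        session_id="demo-session-1",
    )
    return response["synthesis"], response["confidence_score"]
```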
--------------------------------------------------------------------------------
/maverick_mcp/backtesting/strategies/ml/regime_aware.py:
--------------------------------------------------------------------------------
```python
1 | """Market regime-aware trading strategies with automatic strategy switching."""
2 |
3 | import logging
4 | from typing import Any
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from pandas import DataFrame, Series
9 | from sklearn.cluster import KMeans
10 | from sklearn.mixture import GaussianMixture
11 | from sklearn.preprocessing import StandardScaler
12 |
13 | from maverick_mcp.backtesting.strategies.base import Strategy
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | class MarketRegimeDetector:
19 | """Detect market regimes using various statistical methods."""
20 |
21 | def __init__(
22 | self, method: str = "hmm", n_regimes: int = 3, lookback_period: int = 50
23 | ):
24 | """Initialize regime detector.
25 |
26 | Args:
27 | method: Detection method ('hmm', 'kmeans', 'threshold')
28 | n_regimes: Number of market regimes to detect
29 | lookback_period: Period for regime detection
30 | """
31 | self.method = method
32 | self.n_regimes = n_regimes
33 | self.lookback_period = lookback_period
34 | self.scaler = StandardScaler()
35 |
36 | # Initialize detection model
37 | self.model = None
38 | self.is_fitted = False
39 | self._initialize_model()
40 |
41 | def _initialize_model(self):
42 |         """Initialize the regime detection model with numerically stable defaults."""
43 | if self.method == "hmm":
44 | # Use GaussianMixture with more stable configuration
45 | self.model = GaussianMixture(
46 | n_components=self.n_regimes,
47 | covariance_type="diag", # Use diagonal covariance for stability
48 | random_state=42,
49 | max_iter=200,
50 | tol=1e-6,
51 | reg_covar=1e-6, # Regularization for numerical stability
52 | init_params="kmeans", # Better initialization
53 | warm_start=False,
54 | )
55 | elif self.method == "kmeans":
56 | self.model = KMeans(
57 | n_clusters=self.n_regimes,
58 | random_state=42,
59 | n_init=10,
60 | max_iter=500,
61 | tol=1e-6,
62 | algorithm="lloyd", # More stable algorithm
63 | )
64 | elif self.method == "threshold":
65 | # Threshold-based regime detection
66 | self.model = None
67 | else:
68 | raise ValueError(f"Unsupported regime detection method: {self.method}")
69 |
70 | def extract_regime_features(self, data: DataFrame) -> np.ndarray:
71 | """Extract robust features for regime detection.
72 |
73 | Args:
74 | data: Price data
75 |
76 | Returns:
77 |             Feature array with consistent dimensionality and finite values
78 | """
79 | try:
80 | # Validate input data
81 | if data is None or data.empty or len(data) < 10:
82 | logger.debug("Insufficient data for regime feature extraction")
83 | return np.array([])
84 |
85 | if "close" not in data.columns:
86 | logger.warning("Close price data not available for regime features")
87 | return np.array([])
88 |
89 | features = []
90 | returns = data["close"].pct_change().dropna()
91 |
92 | if len(returns) == 0:
93 | logger.debug("No valid returns data for regime features")
94 | return np.array([])
95 |
96 | # Rolling statistics with robust error handling
97 | for window in [5, 10, 20]:
98 | if len(returns) >= window:
99 | window_returns = returns.rolling(window)
100 |
101 | mean_return = window_returns.mean().iloc[-1]
102 | std_return = window_returns.std().iloc[-1]
103 |
104 | # Robust skewness and kurtosis
105 | if window >= 5:
106 | skew_return = window_returns.skew().iloc[-1]
107 | kurt_return = window_returns.kurt().iloc[-1]
108 | else:
109 | skew_return = 0.0
110 | kurt_return = 0.0
111 |
112 | # Replace NaN/inf values with sensible defaults
113 | features.extend(
114 | [
115 | mean_return if np.isfinite(mean_return) else 0.0,
116 | std_return if np.isfinite(std_return) else 0.01,
117 | skew_return if np.isfinite(skew_return) else 0.0,
118 | kurt_return if np.isfinite(kurt_return) else 0.0,
119 | ]
120 | )
121 | else:
122 | # Default values for insufficient data
123 | features.extend([0.0, 0.01, 0.0, 0.0])
124 |
125 | # Enhanced technical indicators for regime detection
126 | current_price = data["close"].iloc[-1]
127 |
128 | # Multiple timeframe trend strength
129 | if len(data) >= 20:
130 | # Short-term trend (20-day)
131 | sma_20 = data["close"].rolling(20).mean()
132 | sma_20_value = (
133 | float(sma_20.iloc[-1]) if not pd.isna(sma_20.iloc[-1]) else 0.0
134 | )
135 | if sma_20_value != 0.0:
136 | trend_strength_20 = (current_price - sma_20_value) / sma_20_value
137 | else:
138 | trend_strength_20 = 0.0
139 | features.append(
140 | trend_strength_20 if np.isfinite(trend_strength_20) else 0.0
141 | )
142 |
143 | # Price momentum (rate of change)
144 | prev_price = (
145 | float(data["close"].iloc[-20])
146 | if not pd.isna(data["close"].iloc[-20])
147 | else current_price
148 | )
149 | if prev_price != 0.0:
150 | momentum_20 = (current_price - prev_price) / prev_price
151 | else:
152 | momentum_20 = 0.0
153 | features.append(momentum_20 if np.isfinite(momentum_20) else 0.0)
154 | else:
155 | features.extend([0.0, 0.0])
156 |
157 | # Multi-timeframe volatility regime detection
158 | if len(returns) >= 20:
159 | vol_short = returns.rolling(20).std().iloc[-1] * np.sqrt(
160 | 252
161 | ) # Annualized
162 | vol_medium = (
163 | returns.rolling(60).std().iloc[-1] * np.sqrt(252)
164 | if len(returns) >= 60
165 | else vol_short
166 | )
167 |
168 | # Volatility regime indicator
169 | vol_regime = vol_short / vol_medium if vol_medium > 0 else 1.0
170 | features.append(vol_regime if np.isfinite(vol_regime) else 1.0)
171 |
172 | # Absolute volatility level (normalized)
173 |                 vol_level = min(vol_short / 0.3, 3.0)  # Normalize against a 30% volatility baseline, capped at 3x
174 | features.append(vol_level if np.isfinite(vol_level) else 1.0)
175 | else:
176 | features.extend([1.0, 1.0])
177 |
178 | # Market structure and volume features (if available)
179 | if "volume" in data.columns and len(data) >= 10:
180 | current_volume = data["volume"].iloc[-1]
181 |
182 | # Volume trend
183 | if len(data) >= 20:
184 | volume_ma_short = data["volume"].rolling(10).mean().iloc[-1]
185 | volume_ma_long = data["volume"].rolling(20).mean().iloc[-1]
186 |
187 | volume_trend = (
188 | volume_ma_short / volume_ma_long if volume_ma_long > 0 else 1.0
189 | )
190 | features.append(volume_trend if np.isfinite(volume_trend) else 1.0)
191 |
192 | # Volume surge indicator
193 | volume_surge = (
194 | current_volume / volume_ma_long if volume_ma_long > 0 else 1.0
195 | )
196 | features.append(
197 | min(volume_surge, 10.0) if np.isfinite(volume_surge) else 1.0
198 | )
199 | else:
200 | features.extend([1.0, 1.0])
201 | else:
202 | features.extend([1.0, 1.0])
203 |
204 | # Price dispersion (high-low range analysis)
205 | if "high" in data.columns and "low" in data.columns and len(data) >= 10:
206 | hl_range = (data["high"] - data["low"]) / data["close"]
207 | avg_range = (
208 | hl_range.rolling(20).mean().iloc[-1]
209 | if len(data) >= 20
210 | else hl_range.mean()
211 | )
212 | current_range = hl_range.iloc[-1]
213 |
214 | range_regime = current_range / avg_range if avg_range > 0 else 1.0
215 | features.append(range_regime if np.isfinite(range_regime) else 1.0)
216 | else:
217 | features.append(1.0)
218 |
219 | feature_array = np.array(features)
220 |
221 | # Final validation and cleaning
222 | if len(feature_array) == 0:
223 | return np.array([])
224 |
225 | # Replace any remaining NaN/inf values
226 | feature_array = np.nan_to_num(
227 | feature_array, nan=0.0, posinf=1.0, neginf=-1.0
228 | )
229 |
230 | return feature_array
231 |
232 | except Exception as e:
233 | logger.error(f"Error extracting regime features: {e}")
234 | return np.array([])
235 |
236 | def detect_regime_threshold(self, data: DataFrame) -> int:
237 | """Detect regime using threshold-based method.
238 |
239 | Args:
240 | data: Price data
241 |
242 | Returns:
243 | Regime label (0: bear/declining, 1: sideways, 2: bull/trending)
244 | """
245 | if len(data) < 20:
246 | return 1 # Default to sideways
247 |
248 | # Calculate trend and volatility measures
249 | returns = data["close"].pct_change()
250 |
251 | # Trend measure (20-day slope)
252 | x = np.arange(20)
253 | y = data["close"].iloc[-20:].values
254 | trend_slope = np.polyfit(x, y, 1)[0] / y[-1] # Normalized slope
255 |
256 | # Volatility measure
257 | vol_20 = returns.rolling(20).std().iloc[-1] * np.sqrt(252)
258 |
259 | # Define regime thresholds
260 | trend_threshold = 0.001 # 0.1% daily trend threshold
261 | vol_threshold = 0.25 # 25% annual volatility threshold
262 |
263 | # Classify regime
264 | if trend_slope > trend_threshold and vol_20 < vol_threshold:
265 | return 2 # Bull/trending market
266 | elif trend_slope < -trend_threshold and vol_20 > vol_threshold:
267 | return 0 # Bear/declining market
268 | else:
269 | return 1 # Sideways/uncertain market
270 |
271 | def fit_regimes(self, data: DataFrame) -> None:
272 | """Fit regime detection model to historical data with enhanced robustness.
273 |
274 | Args:
275 | data: Historical price data
276 | """
277 | if self.method == "threshold":
278 | self.is_fitted = True
279 | return
280 |
281 | try:
282 | # Need sufficient data for stable regime detection
283 | min_required_samples = max(50, self.n_regimes * 20)
284 | if len(data) < min_required_samples + self.lookback_period:
285 | logger.warning(
286 | f"Insufficient data for regime fitting: {len(data)} < {min_required_samples + self.lookback_period}"
287 | )
288 | self.is_fitted = True
289 | return
290 |
291 | # Extract features for regime detection with temporal consistency
292 | feature_list = []
293 | feature_consistency_count = None
294 |
295 | # Use overlapping windows for more stable regime detection
296 | step_size = max(1, self.lookback_period // 10)
297 |
298 | for i in range(self.lookback_period, len(data), step_size):
299 | window_data = data.iloc[max(0, i - self.lookback_period) : i + 1]
300 | features = self.extract_regime_features(window_data)
301 |
302 | if len(features) > 0 and np.all(np.isfinite(features)):
303 | # Check feature consistency
304 | if feature_consistency_count is None:
305 | feature_consistency_count = len(features)
306 | elif len(features) != feature_consistency_count:
307 | logger.warning(
308 | f"Feature dimension mismatch: expected {feature_consistency_count}, got {len(features)}"
309 | )
310 | continue
311 |
312 | feature_list.append(features)
313 |
314 | if len(feature_list) < min_required_samples:
315 | logger.warning(
316 | f"Insufficient valid samples for regime fitting: {len(feature_list)} < {min_required_samples}"
317 | )
318 | self.is_fitted = True
319 | return
320 |
321 | # Ensure we have valid feature_list before creating array
322 | if len(feature_list) == 0:
323 | logger.warning(
324 | "Empty feature list after filtering, cannot create feature matrix"
325 | )
326 | self.is_fitted = True
327 | return
328 |
329 | X = np.array(feature_list)
330 |
331 | # Additional data quality checks
332 | if X.size == 0:
333 | logger.warning("Empty feature matrix, cannot fit regime detector")
334 | self.is_fitted = True
335 | return
336 | elif np.any(np.isnan(X)) or np.any(np.isinf(X)):
337 | logger.warning("Found NaN or inf values in feature matrix, cleaning...")
338 | X = np.nan_to_num(X, nan=0.0, posinf=1.0, neginf=-1.0)
339 |
340 | # Check for zero variance features
341 | feature_std = np.std(X, axis=0)
342 | zero_variance_features = np.where(feature_std < 1e-8)[0]
343 | if len(zero_variance_features) > 0:
344 | logger.debug(
345 | f"Found {len(zero_variance_features)} zero-variance features"
346 | )
347 | # Add small noise to zero-variance features
348 | for idx in zero_variance_features:
349 | X[:, idx] += np.random.normal(0, 1e-6, X.shape[0])
350 |
351 | # Scale features with robust scaler
352 | X_scaled = self.scaler.fit_transform(X)
353 |
354 | # Fit model with better error handling
355 | try:
356 | if self.method == "hmm":
357 | # For GaussianMixture, ensure numerical stability
358 | self.model.fit(X_scaled)
359 |
360 | # Validate fitted model
361 | if (
362 | not hasattr(self.model, "weights_")
363 | or len(self.model.weights_) != self.n_regimes
364 | ):
365 | raise ValueError("Model fitting failed - invalid weights")
366 |
367 | # Check convergence
368 | if not self.model.converged_:
369 | logger.warning(
370 | "GaussianMixture did not converge, but will proceed"
371 | )
372 |
373 | elif self.method == "kmeans":
374 | self.model.fit(X_scaled)
375 |
376 | # Validate fitted model
377 | if (
378 | not hasattr(self.model, "cluster_centers_")
379 | or len(self.model.cluster_centers_) != self.n_regimes
380 | ):
381 | raise ValueError(
382 | "KMeans fitting failed - invalid cluster centers"
383 | )
384 |
385 | self.is_fitted = True
386 |
387 | # Log fitting success with model diagnostics
388 | if self.method == "hmm":
389 |                     avg_log_likelihood = self.model.score(X_scaled)  # score() is already the per-sample average
390 | logger.info(
391 | f"Fitted {self.method} regime detector with {len(X)} samples, avg log-likelihood: {avg_log_likelihood:.4f}"
392 | )
393 | else:
394 | inertia = (
395 | self.model.inertia_
396 | if hasattr(self.model, "inertia_")
397 | else "N/A"
398 | )
399 | logger.info(
400 | f"Fitted {self.method} regime detector with {len(X)} samples, inertia: {inertia}"
401 | )
402 |
403 | except Exception as model_error:
404 | logger.error(f"Model fitting failed: {model_error}")
405 | logger.info("Falling back to threshold method")
406 | self.method = "threshold" # Fallback to threshold method
407 | self.is_fitted = True
408 |
409 | except Exception as e:
410 | logger.error(f"Error fitting regime detector: {e}")
411 | self.is_fitted = True # Allow fallback to threshold method
412 |
413 | def detect_current_regime(self, data: DataFrame) -> int:
414 | """Detect current market regime with enhanced error handling.
415 |
416 | Args:
417 | data: Recent price data
418 |
419 | Returns:
420 | Regime label (0: bear, 1: sideways, 2: bull)
421 | """
422 | if not self.is_fitted:
423 | logger.debug("Regime detector not fitted, using threshold method")
424 | return self.detect_regime_threshold(data)
425 |
426 | try:
427 | if self.method == "threshold":
428 | return self.detect_regime_threshold(data)
429 |
430 | # Extract features for current regime
431 | features = self.extract_regime_features(data)
432 |
433 | if len(features) == 0:
434 | logger.debug("No features extracted, falling back to threshold method")
435 | return self.detect_regime_threshold(data)
436 |
437 | # Check for non-finite features only if features array is not empty
438 | if features.size > 0 and np.any(~np.isfinite(features)):
439 | logger.debug("Non-finite features detected, cleaning and proceeding")
440 | features = np.nan_to_num(features, nan=0.0, posinf=1.0, neginf=-1.0)
441 |
442 | # Validate feature consistency with training
443 | expected_features = (
444 | self.scaler.n_features_in_
445 | if hasattr(self.scaler, "n_features_in_")
446 | else None
447 | )
448 | if expected_features is not None and len(features) != expected_features:
449 | logger.warning(
450 | f"Feature count mismatch in prediction: expected {expected_features}, got {len(features)}"
451 | )
452 | return self.detect_regime_threshold(data)
453 |
454 | # Scale features and predict regime
455 | try:
456 | X = self.scaler.transform([features])
457 | regime = self.model.predict(X)[0]
458 |
459 | # Validate regime prediction
460 | if regime < 0 or regime >= self.n_regimes:
461 | logger.warning(
462 | f"Invalid regime prediction: {regime}, using threshold method"
463 | )
464 | return self.detect_regime_threshold(data)
465 |
466 | return int(regime)
467 |
468 | except Exception as pred_error:
469 | logger.debug(
470 | f"Prediction error: {pred_error}, falling back to threshold method"
471 | )
472 | return self.detect_regime_threshold(data)
473 |
474 | except Exception as e:
475 | logger.error(f"Error detecting current regime: {e}")
476 | return self.detect_regime_threshold(data) # Always fallback to threshold
477 |
478 | def get_regime_probabilities(self, data: DataFrame) -> np.ndarray:
479 | """Get probabilities for each regime.
480 |
481 | Args:
482 | data: Recent price data
483 |
484 | Returns:
485 | Array of regime probabilities
486 | """
487 | if not self.is_fitted or self.method == "threshold":
488 | # For threshold method, return deterministic probabilities
489 | regime = self.detect_current_regime(data)
490 | probs = np.zeros(self.n_regimes)
491 | probs[regime] = 1.0
492 | return probs
493 |
494 | try:
495 | features = self.extract_regime_features(data)
496 |
497 | if len(features) == 0:
498 | return np.ones(self.n_regimes) / self.n_regimes
499 | elif features.size > 0 and np.any(np.isnan(features)):
500 | return np.ones(self.n_regimes) / self.n_regimes
501 |
502 | X = self.scaler.transform([features])
503 |
504 | if hasattr(self.model, "predict_proba"):
505 | return self.model.predict_proba(X)[0]
506 | else:
507 | # For methods without probabilities, return one-hot encoding
508 | regime = self.model.predict(X)[0]
509 | probs = np.zeros(self.n_regimes)
510 | probs[regime] = 1.0
511 | return probs
512 |
513 | except Exception as e:
514 | logger.error(f"Error getting regime probabilities: {e}")
515 | return np.ones(self.n_regimes) / self.n_regimes
516 |
517 |
518 | class RegimeAwareStrategy(Strategy):
519 | """Strategy that switches between different strategies based on market regime."""
520 |
521 | def __init__(
522 | self,
523 | regime_strategies: dict[int, Strategy],
524 |         regime_detector: MarketRegimeDetector | None = None,
525 |         regime_names: dict[int, str] | None = None,
526 |         switch_threshold: float = 0.7,
527 |         min_regime_duration: int = 5,
528 |         parameters: dict[str, Any] | None = None,
529 | ):
530 | """Initialize regime-aware strategy.
531 |
532 | Args:
533 | regime_strategies: Dictionary mapping regime labels to strategies
534 | regime_detector: Market regime detector instance
535 | regime_names: Names for each regime
536 | switch_threshold: Probability threshold for regime switching
537 | min_regime_duration: Minimum duration before switching regimes
538 | parameters: Additional parameters
539 | """
540 | super().__init__(parameters)
541 | self.regime_strategies = regime_strategies
542 | self.regime_detector = regime_detector or MarketRegimeDetector()
543 | self.regime_names = regime_names or {0: "Bear", 1: "Sideways", 2: "Bull"}
544 | self.switch_threshold = switch_threshold
545 | self.min_regime_duration = min_regime_duration
546 |
547 | # Regime tracking
548 | self.current_regime = 1 # Start with sideways
549 | self.regime_history = []
550 | self.regime_duration = 0
551 | self.regime_switches = 0
552 |
553 | @property
554 | def name(self) -> str:
555 | """Get strategy name."""
556 | strategy_names = [s.name for s in self.regime_strategies.values()]
557 | return f"RegimeAware({','.join(strategy_names)})"
558 |
559 | @property
560 | def description(self) -> str:
561 | """Get strategy description."""
562 | return f"Regime-aware strategy switching between {len(self.regime_strategies)} strategies based on market conditions"
563 |
564 | def fit_regime_detector(self, data: DataFrame) -> None:
565 | """Fit regime detector to historical data.
566 |
567 | Args:
568 | data: Historical price data
569 | """
570 | self.regime_detector.fit_regimes(data)
571 |
572 | def update_current_regime(self, data: DataFrame, current_idx: int) -> bool:
573 | """Update current market regime.
574 |
575 | Args:
576 | data: Price data
577 | current_idx: Current index in data
578 |
579 | Returns:
580 | True if regime changed, False otherwise
581 | """
582 | # Get regime probabilities
583 | window_data = data.iloc[
584 | max(0, current_idx - self.regime_detector.lookback_period) : current_idx + 1
585 | ]
586 | regime_probs = self.regime_detector.get_regime_probabilities(window_data)
587 |
588 | # Find most likely regime
589 | most_likely_regime = np.argmax(regime_probs)
590 | max_prob = regime_probs[most_likely_regime]
591 |
592 | # Check if we should switch regimes
593 | regime_changed = False
594 |
595 | if (
596 | most_likely_regime != self.current_regime
597 | and max_prob >= self.switch_threshold
598 | and self.regime_duration >= self.min_regime_duration
599 | ):
600 | old_regime = self.current_regime
601 | self.current_regime = most_likely_regime
602 | self.regime_duration = 0
603 | self.regime_switches += 1
604 | regime_changed = True
605 |
606 | logger.info(
607 | f"Regime switch: {self.regime_names.get(old_regime, old_regime)} -> "
608 | f"{self.regime_names.get(self.current_regime, self.current_regime)} "
609 | f"(prob: {max_prob:.3f})"
610 | )
611 | else:
612 | self.regime_duration += 1
613 |
614 | # Track regime history
615 | self.regime_history.append(
616 | {
617 | "index": current_idx,
618 | "regime": self.current_regime,
619 | "probabilities": regime_probs.tolist(),
620 | "duration": self.regime_duration,
621 | "switched": regime_changed,
622 | }
623 | )
624 |
625 | return regime_changed
626 |
627 | def get_active_strategy(self) -> Strategy:
628 | """Get currently active strategy based on regime.
629 |
630 | Returns:
631 | Active strategy for current regime
632 | """
633 | if self.current_regime in self.regime_strategies:
634 | return self.regime_strategies[self.current_regime]
635 | else:
636 | # Fallback to first available strategy
637 | return next(iter(self.regime_strategies.values()))
638 |
639 | def generate_signals(self, data: DataFrame) -> tuple[Series, Series]:
640 | """Generate regime-aware trading signals.
641 |
642 | Args:
643 | data: Price data with OHLCV columns
644 |
645 | Returns:
646 | Tuple of (entry_signals, exit_signals) as boolean Series
647 | """
648 | try:
649 | # Validate input data
650 | if data is None or len(data) == 0:
651 | logger.warning("Empty or invalid data provided to generate_signals")
652 | # Create empty Series with a dummy index to avoid empty array issues
653 | dummy_index = pd.DatetimeIndex([pd.Timestamp.now()])
654 | return pd.Series(False, index=dummy_index), pd.Series(
655 | False, index=dummy_index
656 | )
657 |
658 | # Ensure minimum data requirements
659 | min_required_data = max(50, self.regime_detector.lookback_period)
660 | if len(data) < min_required_data:
661 | logger.warning(
662 | f"Insufficient data for regime-aware strategy: {len(data)} < {min_required_data}"
663 | )
664 | # Return all False signals but with valid data index
665 | return pd.Series(False, index=data.index), pd.Series(
666 | False, index=data.index
667 | )
668 |
669 | # Fit regime detector if not already done
670 | if not self.regime_detector.is_fitted:
671 | try:
672 | self.fit_regime_detector(data)
673 | except Exception as e:
674 | logger.error(
675 | f"Failed to fit regime detector: {e}, falling back to single strategy"
676 | )
677 | # Fallback to using first available strategy without regime switching
678 | fallback_strategy = next(iter(self.regime_strategies.values()))
679 | return fallback_strategy.generate_signals(data)
680 |
681 | entry_signals = pd.Series(False, index=data.index)
682 | exit_signals = pd.Series(False, index=data.index)
683 |
684 | # Generate signals with regime awareness
685 | current_strategy = None
686 |
687 | for idx in range(len(data)):
688 | # Update regime
689 | regime_changed = self.update_current_regime(data, idx)
690 |
691 | # Get active strategy
692 | active_strategy = self.get_active_strategy()
693 |
694 | # If regime changed, regenerate signals from new strategy
695 | if regime_changed or current_strategy != active_strategy:
696 | current_strategy = active_strategy
697 |
698 | # Generate signals for remaining data
699 | remaining_data = data.iloc[idx:]
700 | if len(remaining_data) > 0:
701 | strategy_entry, strategy_exit = (
702 | current_strategy.generate_signals(remaining_data)
703 | )
704 |
705 | # Update signals for remaining period
706 | end_idx = min(idx + len(strategy_entry), len(data))
707 | entry_signals.iloc[idx:end_idx] = strategy_entry.iloc[
708 | : end_idx - idx
709 | ]
710 | exit_signals.iloc[idx:end_idx] = strategy_exit.iloc[
711 | : end_idx - idx
712 | ]
713 |
714 | logger.info(
715 | f"Generated regime-aware signals with {self.regime_switches} regime switches"
716 | )
717 |
718 | return entry_signals, exit_signals
719 |
720 | except Exception as e:
721 | logger.error(f"Error generating regime-aware signals: {e}")
722 | # Ensure we always return valid series even on error
723 | if data is not None and len(data) > 0:
724 | return pd.Series(False, index=data.index), pd.Series(
725 | False, index=data.index
726 | )
727 | else:
728 | # Create dummy index to avoid empty array issues
729 | dummy_index = pd.DatetimeIndex([pd.Timestamp.now()])
730 | return pd.Series(False, index=dummy_index), pd.Series(
731 | False, index=dummy_index
732 | )
733 |
734 | def get_regime_analysis(self) -> dict[str, Any]:
735 | """Get analysis of regime detection and switching.
736 |
737 | Returns:
738 | Dictionary with regime analysis
739 | """
740 | if not self.regime_history:
741 | return {}
742 |
743 | regime_counts = {}
744 | regime_durations = {}
745 |
746 | for record in self.regime_history:
747 | regime = record["regime"]
748 | regime_name = self.regime_names.get(regime, f"Regime_{regime}")
749 |
750 | if regime_name not in regime_counts:
751 | regime_counts[regime_name] = 0
752 | regime_durations[regime_name] = []
753 |
754 | regime_counts[regime_name] += 1
755 |
756 | # Track regime durations
757 | if record["switched"] and len(self.regime_history) > 1:
758 | # Find duration of previous regime
759 | prev_regime_start = 0
760 | for i in range(len(self.regime_history) - 2, -1, -1):
761 | if (
762 | self.regime_history[i]["regime"]
763 | != self.regime_history[-1]["regime"]
764 | ):
765 | prev_regime_start = i + 1
766 | break
767 |
768 | duration = len(self.regime_history) - prev_regime_start - 1
769 | prev_regime = self.regime_history[prev_regime_start]["regime"]
770 | prev_regime_name = self.regime_names.get(
771 | prev_regime, f"Regime_{prev_regime}"
772 | )
773 |
774 | if prev_regime_name in regime_durations:
775 | regime_durations[prev_regime_name].append(duration)
776 |
777 | # Calculate average durations
778 | avg_durations = {}
779 | for regime_name, durations in regime_durations.items():
780 | if durations:
781 | avg_durations[regime_name] = np.mean(durations)
782 | else:
783 | avg_durations[regime_name] = 0
784 |
785 | return {
786 | "current_regime": self.regime_names.get(
787 | self.current_regime, self.current_regime
788 | ),
789 | "total_switches": self.regime_switches,
790 | "regime_counts": regime_counts,
791 | "average_regime_durations": avg_durations,
792 | "regime_history": self.regime_history[-50:], # Last 50 records
793 | "active_strategy": self.get_active_strategy().name,
794 | }
795 |
796 | def validate_parameters(self) -> bool:
797 | """Validate regime-aware strategy parameters.
798 |
799 | Returns:
800 | True if parameters are valid
801 | """
802 | if not self.regime_strategies:
803 | return False
804 |
805 | if self.switch_threshold < 0 or self.switch_threshold > 1:
806 | return False
807 |
808 | if self.min_regime_duration < 0:
809 | return False
810 |
811 | # Validate individual strategies
812 | for strategy in self.regime_strategies.values():
813 | if not strategy.validate_parameters():
814 | return False
815 |
816 | return True
817 |
818 | def get_default_parameters(self) -> dict[str, Any]:
819 | """Get default parameters for regime-aware strategy.
820 |
821 | Returns:
822 | Dictionary of default parameters
823 | """
824 | return {
825 | "switch_threshold": 0.7,
826 | "min_regime_duration": 5,
827 | "regime_detection_method": "hmm",
828 | "n_regimes": 3,
829 | "lookback_period": 50,
830 | }
831 |
832 |
833 | class AdaptiveRegimeStrategy(RegimeAwareStrategy):
834 | """Advanced regime-aware strategy with adaptive regime detection."""
835 |
836 | def __init__(
837 | self,
838 | regime_strategies: dict[int, Strategy],
839 | adaptation_frequency: int = 100,
840 | regime_confidence_threshold: float = 0.6,
841 | **kwargs,
842 | ):
843 | """Initialize adaptive regime strategy.
844 |
845 | Args:
846 | regime_strategies: Dictionary mapping regime labels to strategies
847 | adaptation_frequency: How often to re-fit regime detector
848 | regime_confidence_threshold: Minimum confidence for regime detection
849 | **kwargs: Additional parameters for RegimeAwareStrategy
850 | """
851 | super().__init__(regime_strategies, **kwargs)
852 | self.adaptation_frequency = adaptation_frequency
853 | self.regime_confidence_threshold = regime_confidence_threshold
854 | self.last_adaptation = 0
855 |
856 | @property
857 | def name(self) -> str:
858 | """Get strategy name."""
859 | return f"Adaptive{super().name}"
860 |
861 | def adapt_regime_detector(self, data: DataFrame, current_idx: int) -> None:
862 | """Re-fit regime detector with recent data.
863 |
864 | Args:
865 | data: Price data
866 | current_idx: Current index
867 | """
868 | if current_idx - self.last_adaptation < self.adaptation_frequency:
869 | return
870 |
871 | try:
872 | # Use recent data for adaptation
873 | adaptation_data = data.iloc[max(0, current_idx - 500) : current_idx]
874 |
875 | if len(adaptation_data) >= self.regime_detector.lookback_period:
876 | logger.info(f"Adapting regime detector at index {current_idx}")
877 | self.regime_detector.fit_regimes(adaptation_data)
878 | self.last_adaptation = current_idx
879 |
880 | except Exception as e:
881 | logger.error(f"Error adapting regime detector: {e}")
882 |
883 | def generate_signals(self, data: DataFrame) -> tuple[Series, Series]:
884 | """Generate adaptive regime-aware signals.
885 |
886 | Args:
887 | data: Price data with OHLCV columns
888 |
889 | Returns:
890 | Tuple of (entry_signals, exit_signals) as boolean Series
891 | """
892 | # Periodically adapt regime detector
893 | for idx in range(
894 | self.adaptation_frequency, len(data), self.adaptation_frequency
895 | ):
896 | self.adapt_regime_detector(data, idx)
897 |
898 | # Generate signals using parent method
899 | return super().generate_signals(data)
900 |
```
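The regime-aware classes above are designed to wrap existing `Strategy` implementations and switch between them as the detector's view of the market changes. The sketch below shows one plausible way to wire them together; `DummyTrendStrategy`, its moving-average rule, and the assumption that the base `Strategy` constructor accepts a parameters dict are illustrative guesses, not part of the module.

```python
# Illustrative sketch only. The placeholder strategy and the base-class
# constructor signature are assumptions made for demonstration purposes.
import pandas as pd


class DummyTrendStrategy(Strategy):  # assumes `Strategy` is the base class used above
    def __init__(self, fast: int = 10, slow: int = 30):
        super().__init__({"fast": fast, "slow": slow})  # assumes base takes a parameters dict
        self.fast = fast
        self.slow = slow

    @property
    def name(self) -> str:
        return f"DummyTrend({self.fast},{self.slow})"

    @property
    def description(self) -> str:
        return "Toy moving-average crossover used only to illustrate wiring."

    def generate_signals(self, data: pd.DataFrame) -> tuple[pd.Series, pd.Series]:
        fast_ma = data["close"].rolling(self.fast).mean()
        slow_ma = data["close"].rolling(self.slow).mean()
        return fast_ma > slow_ma, fast_ma < slow_ma

    def validate_parameters(self) -> bool:
        return self.fast < self.slow

    def get_default_parameters(self) -> dict:
        return {"fast": 10, "slow": 30}


# Map regime labels (0: bear, 1: sideways, 2: bull) to strategies. In practice each
# regime would get a genuinely different strategy; here one toy class stands in.
regime_strategy = RegimeAwareStrategy(
    regime_strategies={
        0: DummyTrendStrategy(20, 60),
        1: DummyTrendStrategy(10, 30),
        2: DummyTrendStrategy(5, 20),
    },
    regime_detector=MarketRegimeDetector(),  # default detector, as used in the module above
    switch_threshold=0.7,
    min_regime_duration=5,
)

# `prices` is assumed to be an OHLCV DataFrame with a DatetimeIndex and a "close" column.
# regime_strategy.fit_regime_detector(prices)
# entries, exits = regime_strategy.generate_signals(prices)
```

As the code above shows, calling `fit_regime_detector` explicitly is optional: `RegimeAwareStrategy.generate_signals` fits the detector itself on first use if it has not been fitted yet, and falls back to a single strategy if fitting fails.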
--------------------------------------------------------------------------------
/tests/integration/test_orchestration_complete.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Comprehensive Integration Test Suite for MaverickMCP Orchestration Capabilities
4 |
5 | This test suite thoroughly validates all orchestration features including:
6 | - agents_orchestrated_analysis with different personas and routing strategies
7 | - agents_deep_research_financial with various research depths and focus areas
8 | - agents_compare_multi_agent_analysis with different agent combinations
9 |
10 | The tests simulate real Claude Desktop usage patterns and validate end-to-end
11 | functionality with comprehensive error handling and performance monitoring.
12 | """
13 |
14 | import asyncio
15 | import json
16 | import logging
17 | import os
18 | import sys
19 | import time
20 | import tracemalloc
21 | from datetime import datetime
22 | from typing import Any
23 |
24 | # Add project root to Python path
25 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
26 |
27 | # Test configuration
28 | TEST_CONFIG = {
29 | "timeout_seconds": 300, # 5 minutes max per test
30 | "concurrent_limit": 3, # Maximum concurrent tests
31 | "performance_monitoring": True,
32 | "detailed_validation": True,
33 | "save_results": True,
34 | }
35 |
36 | # Test scenarios for each orchestration tool
37 | ORCHESTRATED_ANALYSIS_SCENARIOS = [
38 | {
39 | "name": "Conservative_LLM_Powered_Routing",
40 | "query": "Analyze AAPL for long-term investment",
41 | "persona": "conservative",
42 | "routing_strategy": "llm_powered",
43 | "max_agents": 2,
44 | "parallel_execution": False,
45 | },
46 | {
47 | "name": "Aggressive_Rule_Based_Routing",
48 | "query": "Find high momentum growth stocks in tech",
49 | "persona": "aggressive",
50 | "routing_strategy": "rule_based",
51 | "max_agents": 3,
52 | "parallel_execution": True,
53 | },
54 | {
55 | "name": "Moderate_Hybrid_Routing",
56 | "query": "Portfolio analysis for MSFT GOOGL NVDA",
57 | "persona": "moderate",
58 | "routing_strategy": "hybrid",
59 | "max_agents": 2,
60 | "parallel_execution": True,
61 | },
62 | {
63 | "name": "Day_Trader_Fast_Execution",
64 | "query": "Quick technical analysis on SPY options",
65 | "persona": "day_trader",
66 | "routing_strategy": "rule_based",
67 | "max_agents": 1,
68 | "parallel_execution": False,
69 | },
70 | ]
71 |
72 | DEEP_RESEARCH_SCENARIOS = [
73 | {
74 | "name": "Basic_Company_Research",
75 | "research_topic": "Tesla Inc stock analysis",
76 | "persona": "moderate",
77 | "research_depth": "basic",
78 | "focus_areas": ["fundamentals"],
79 | "timeframe": "7d",
80 | },
81 | {
82 | "name": "Standard_Sector_Research",
83 | "research_topic": "renewable energy sector trends",
84 | "persona": "conservative",
85 | "research_depth": "standard",
86 | "focus_areas": ["market_sentiment", "competitive_landscape"],
87 | "timeframe": "30d",
88 | },
89 | {
90 | "name": "Comprehensive_Market_Research",
91 | "research_topic": "AI and machine learning investment opportunities",
92 | "persona": "aggressive",
93 | "research_depth": "comprehensive",
94 | "focus_areas": ["fundamentals", "technicals", "market_sentiment"],
95 | "timeframe": "90d",
96 | },
97 | {
98 | "name": "Exhaustive_Crypto_Research",
99 | "research_topic": "Bitcoin and cryptocurrency market analysis",
100 | "persona": "day_trader",
101 | "research_depth": "exhaustive",
102 | "focus_areas": ["technicals", "market_sentiment", "competitive_landscape"],
103 | "timeframe": "1y",
104 | },
105 | ]
106 |
107 | MULTI_AGENT_COMPARISON_SCENARIOS = [
108 | {
109 | "name": "Market_vs_Supervisor_Stock_Analysis",
110 | "query": "Should I invest in Apple stock now?",
111 | "agent_types": ["market", "supervisor"],
112 | "persona": "moderate",
113 | },
114 | {
115 | "name": "Conservative_Multi_Agent_Portfolio",
116 | "query": "Build a balanced portfolio for retirement",
117 | "agent_types": ["market", "supervisor"],
118 | "persona": "conservative",
119 | },
120 | {
121 | "name": "Aggressive_Growth_Strategy",
122 | "query": "Find the best growth stocks for 2025",
123 | "agent_types": ["market", "supervisor"],
124 | "persona": "aggressive",
125 | },
126 | ]
127 |
128 | ERROR_HANDLING_SCENARIOS = [
129 | {
130 | "tool": "orchestrated_analysis",
131 | "params": {
132 | "query": "", # Empty query
133 | "persona": "invalid_persona",
134 | "routing_strategy": "unknown_strategy",
135 | },
136 | },
137 | {
138 | "tool": "deep_research_financial",
139 | "params": {
140 | "research_topic": "XYZ",
141 | "research_depth": "invalid_depth",
142 | "focus_areas": ["invalid_area"],
143 | },
144 | },
145 | {
146 | "tool": "compare_multi_agent_analysis",
147 | "params": {
148 | "query": "test",
149 | "agent_types": ["nonexistent_agent"],
150 | "persona": "unknown",
151 | },
152 | },
153 | ]
154 |
155 |
156 | class TestResult:
157 | """Container for individual test results."""
158 |
159 | def __init__(self, test_name: str, tool_name: str):
160 | self.test_name = test_name
161 | self.tool_name = tool_name
162 | self.start_time = time.time()
163 | self.end_time: float | None = None
164 | self.success = False
165 | self.error: str | None = None
166 | self.response: dict[str, Any] | None = None
167 | self.execution_time_ms: float | None = None
168 | self.memory_usage_mb: float | None = None
169 | self.validation_results: dict[str, bool] = {}
170 |
171 | def mark_completed(
172 | self, success: bool, response: dict | None = None, error: str | None = None
173 | ):
174 | """Mark test as completed with results."""
175 | self.end_time = time.time()
176 | self.execution_time_ms = (self.end_time - self.start_time) * 1000
177 | self.success = success
178 | self.response = response
179 | self.error = error
180 |
181 | def to_dict(self) -> dict[str, Any]:
182 | """Convert to dictionary for serialization."""
183 | return {
184 | "test_name": self.test_name,
185 | "tool_name": self.tool_name,
186 | "success": self.success,
187 | "execution_time_ms": self.execution_time_ms,
188 | "memory_usage_mb": self.memory_usage_mb,
189 | "error": self.error,
190 | "validation_results": self.validation_results,
191 | "response_summary": self._summarize_response() if self.response else None,
192 | }
193 |
194 | def _summarize_response(self) -> dict[str, Any]:
195 | """Create summary of response for reporting."""
196 | if not self.response:
197 | return {}
198 |
199 | summary = {
200 | "status": self.response.get("status"),
201 | "agent_type": self.response.get("agent_type"),
202 | "persona": self.response.get("persona"),
203 | }
204 |
205 | # Add tool-specific summary fields
206 | if "agents_used" in self.response:
207 | summary["agents_used"] = self.response["agents_used"]
208 | if "sources_analyzed" in self.response:
209 | summary["sources_analyzed"] = self.response["sources_analyzed"]
210 | if "agents_compared" in self.response:
211 | summary["agents_compared"] = self.response["agents_compared"]
212 |
213 | return summary
214 |
215 |
216 | class IntegrationTestSuite:
217 | """Comprehensive integration test suite for MCP orchestration tools."""
218 |
219 | def __init__(self):
220 | self.setup_logging()
221 | self.results: list[TestResult] = []
222 | self.start_time = time.time()
223 | self.session_id = f"test_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
224 |
225 | # Import MCP tools
226 | self._import_tools()
227 |
228 | def setup_logging(self):
229 | """Configure logging for test execution."""
230 | logging.basicConfig(
231 | level=logging.INFO,
232 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
233 | handlers=[
234 | logging.FileHandler(
235 | f"integration_test_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
236 | ),
237 | logging.StreamHandler(),
238 | ],
239 | )
240 | self.logger = logging.getLogger(__name__)
241 |
242 | def _import_tools(self):
243 | """Import and initialize MCP tools."""
244 | try:
245 | from maverick_mcp.api.routers.agents import (
246 | compare_multi_agent_analysis,
247 | deep_research_financial,
248 | orchestrated_analysis,
249 | )
250 |
251 | self.orchestrated_analysis = orchestrated_analysis
252 | self.deep_research_financial = deep_research_financial
253 | self.compare_multi_agent_analysis = compare_multi_agent_analysis
254 |
255 | self.logger.info("Successfully imported MCP orchestration tools")
256 |
257 | except ImportError as e:
258 | self.logger.error(f"Failed to import MCP tools: {e}")
259 |             raise RuntimeError(f"Cannot run tests without MCP tools: {e}") from e
260 |
261 | def print_header(self, title: str):
262 | """Print formatted test section header."""
263 | print(f"\n{'=' * 80}")
264 | print(f" {title}")
265 | print(f"{'=' * 80}")
266 |
267 | def print_progress(self, current: int, total: int, test_name: str):
268 | """Print progress indicator."""
269 | percentage = (current / total) * 100
270 | print(f"[{current:2d}/{total}] ({percentage:5.1f}%) {test_name}")
271 |
272 | async def test_orchestrated_analysis(self) -> list[TestResult]:
273 | """Test agents_orchestrated_analysis with various scenarios."""
274 | self.print_header("Testing Orchestrated Analysis Tool")
275 | results = []
276 |
277 | for i, scenario in enumerate(ORCHESTRATED_ANALYSIS_SCENARIOS):
278 | test_name = f"orchestrated_analysis_{scenario['name']}"
279 | self.print_progress(i + 1, len(ORCHESTRATED_ANALYSIS_SCENARIOS), test_name)
280 |
281 | result = TestResult(test_name, "agents_orchestrated_analysis")
282 |
283 | try:
284 | if TEST_CONFIG["performance_monitoring"]:
285 | tracemalloc.start()
286 |
287 | # Add unique session ID for each test
288 | scenario_params = scenario.copy()
289 | scenario_params["session_id"] = f"{self.session_id}_{test_name}"
290 |
291 | # Execute with timeout
292 | response = await asyncio.wait_for(
293 | self.orchestrated_analysis(**scenario_params),
294 | timeout=TEST_CONFIG["timeout_seconds"],
295 | )
296 |
297 | # Validate response
298 | validation_results = self._validate_orchestrated_response(response)
299 | result.validation_results = validation_results
300 |
301 | success = (
302 | all(validation_results.values())
303 | and response.get("status") == "success"
304 | )
305 | result.mark_completed(success, response)
306 |
307 | if TEST_CONFIG["performance_monitoring"]:
308 | current, peak = tracemalloc.get_traced_memory()
309 | result.memory_usage_mb = peak / 1024 / 1024
310 | tracemalloc.stop()
311 |
312 | self.logger.info(
313 | f"✓ {test_name}: {'PASS' if success else 'FAIL'} "
314 | f"({result.execution_time_ms:.0f}ms)"
315 | )
316 |
317 | except TimeoutError:
318 | result.mark_completed(False, error="Test timeout")
319 | self.logger.warning(f"✗ {test_name}: TIMEOUT")
320 |
321 | except Exception as e:
322 | result.mark_completed(False, error=str(e))
323 | self.logger.error(f"✗ {test_name}: ERROR - {e}")
324 |
325 | results.append(result)
326 |
327 | return results
328 |
329 | async def test_deep_research_financial(self) -> list[TestResult]:
330 | """Test agents_deep_research_financial with various scenarios."""
331 | self.print_header("Testing Deep Research Financial Tool")
332 | results = []
333 |
334 | for i, scenario in enumerate(DEEP_RESEARCH_SCENARIOS):
335 | test_name = f"deep_research_{scenario['name']}"
336 | self.print_progress(i + 1, len(DEEP_RESEARCH_SCENARIOS), test_name)
337 |
338 | result = TestResult(test_name, "agents_deep_research_financial")
339 |
340 | try:
341 | if TEST_CONFIG["performance_monitoring"]:
342 | tracemalloc.start()
343 |
344 | # Add unique session ID
345 | scenario_params = scenario.copy()
346 | scenario_params["session_id"] = f"{self.session_id}_{test_name}"
347 |
348 | response = await asyncio.wait_for(
349 | self.deep_research_financial(**scenario_params),
350 | timeout=TEST_CONFIG["timeout_seconds"],
351 | )
352 |
353 | validation_results = self._validate_research_response(response)
354 | result.validation_results = validation_results
355 |
356 | success = (
357 | all(validation_results.values())
358 | and response.get("status") == "success"
359 | )
360 | result.mark_completed(success, response)
361 |
362 | if TEST_CONFIG["performance_monitoring"]:
363 | current, peak = tracemalloc.get_traced_memory()
364 | result.memory_usage_mb = peak / 1024 / 1024
365 | tracemalloc.stop()
366 |
367 | self.logger.info(
368 | f"✓ {test_name}: {'PASS' if success else 'FAIL'} "
369 | f"({result.execution_time_ms:.0f}ms)"
370 | )
371 |
372 | except TimeoutError:
373 | result.mark_completed(False, error="Test timeout")
374 | self.logger.warning(f"✗ {test_name}: TIMEOUT")
375 |
376 | except Exception as e:
377 | result.mark_completed(False, error=str(e))
378 | self.logger.error(f"✗ {test_name}: ERROR - {e}")
379 |
380 | results.append(result)
381 |
382 | return results
383 |
384 | async def test_compare_multi_agent_analysis(self) -> list[TestResult]:
385 | """Test agents_compare_multi_agent_analysis with various scenarios."""
386 | self.print_header("Testing Multi-Agent Comparison Tool")
387 | results = []
388 |
389 | for i, scenario in enumerate(MULTI_AGENT_COMPARISON_SCENARIOS):
390 | test_name = f"multi_agent_{scenario['name']}"
391 | self.print_progress(i + 1, len(MULTI_AGENT_COMPARISON_SCENARIOS), test_name)
392 |
393 | result = TestResult(test_name, "agents_compare_multi_agent_analysis")
394 |
395 | try:
396 | if TEST_CONFIG["performance_monitoring"]:
397 | tracemalloc.start()
398 |
399 | scenario_params = scenario.copy()
400 | scenario_params["session_id"] = f"{self.session_id}_{test_name}"
401 |
402 | response = await asyncio.wait_for(
403 | self.compare_multi_agent_analysis(**scenario_params),
404 | timeout=TEST_CONFIG["timeout_seconds"],
405 | )
406 |
407 | validation_results = self._validate_comparison_response(response)
408 | result.validation_results = validation_results
409 |
410 | success = (
411 | all(validation_results.values())
412 | and response.get("status") == "success"
413 | )
414 | result.mark_completed(success, response)
415 |
416 | if TEST_CONFIG["performance_monitoring"]:
417 | current, peak = tracemalloc.get_traced_memory()
418 | result.memory_usage_mb = peak / 1024 / 1024
419 | tracemalloc.stop()
420 |
421 | self.logger.info(
422 | f"✓ {test_name}: {'PASS' if success else 'FAIL'} "
423 | f"({result.execution_time_ms:.0f}ms)"
424 | )
425 |
426 | except TimeoutError:
427 | result.mark_completed(False, error="Test timeout")
428 | self.logger.warning(f"✗ {test_name}: TIMEOUT")
429 |
430 | except Exception as e:
431 | result.mark_completed(False, error=str(e))
432 | self.logger.error(f"✗ {test_name}: ERROR - {e}")
433 |
434 | results.append(result)
435 |
436 | return results
437 |
438 | async def test_error_handling(self) -> list[TestResult]:
439 | """Test error handling with invalid inputs."""
440 | self.print_header("Testing Error Handling")
441 | results = []
442 |
443 | for i, scenario in enumerate(ERROR_HANDLING_SCENARIOS):
444 | test_name = f"error_handling_{scenario['tool']}"
445 | self.print_progress(i + 1, len(ERROR_HANDLING_SCENARIOS), test_name)
446 |
447 | result = TestResult(test_name, scenario["tool"])
448 |
449 | try:
450 | # Get the tool function
451 | tool_func = getattr(self, scenario["tool"])
452 |
453 | # Add session ID
454 | params = scenario["params"].copy()
455 | params["session_id"] = f"{self.session_id}_{test_name}"
456 |
457 | response = await asyncio.wait_for(
458 | tool_func(**params),
459 | timeout=60, # Shorter timeout for error cases
460 | )
461 |
462 | # For error handling tests, we expect graceful error handling
463 | # Success means the tool returned an error response without crashing
464 | has_error_field = (
465 | "error" in response or response.get("status") == "error"
466 | )
467 | success = has_error_field and isinstance(response, dict)
468 |
469 | result.validation_results = {"graceful_error_handling": success}
470 | result.mark_completed(success, response)
471 |
472 | self.logger.info(
473 | f"✓ {test_name}: {'PASS' if success else 'FAIL'} - "
474 | f"Graceful error handling: {has_error_field}"
475 | )
476 |
477 | except TimeoutError:
478 | result.mark_completed(False, error="Test timeout")
479 | self.logger.warning(f"✗ {test_name}: TIMEOUT")
480 |
481 | except Exception as e:
482 | # For error handling tests, exceptions are actually failures
483 | result.mark_completed(False, error=f"Unhandled exception: {str(e)}")
484 | self.logger.error(f"✗ {test_name}: UNHANDLED EXCEPTION - {e}")
485 |
486 | results.append(result)
487 |
488 | return results
489 |
490 | async def test_concurrent_execution(self) -> list[TestResult]:
491 | """Test concurrent execution of multiple tools."""
492 | self.print_header("Testing Concurrent Execution")
493 | results = []
494 |
495 | # Create concurrent test scenarios
496 | concurrent_tasks = [
497 | (
498 | "concurrent_orchestrated",
499 | self.orchestrated_analysis,
500 | {
501 | "query": "Analyze MSFT for investment",
502 | "persona": "moderate",
503 | "routing_strategy": "llm_powered",
504 | "session_id": f"{self.session_id}_concurrent_1",
505 | },
506 | ),
507 | (
508 | "concurrent_research",
509 | self.deep_research_financial,
510 | {
511 | "research_topic": "Amazon business model",
512 | "persona": "conservative",
513 | "research_depth": "standard",
514 | "session_id": f"{self.session_id}_concurrent_2",
515 | },
516 | ),
517 | (
518 | "concurrent_comparison",
519 | self.compare_multi_agent_analysis,
520 | {
521 | "query": "Best tech stocks for portfolio",
522 | "persona": "aggressive",
523 | "session_id": f"{self.session_id}_concurrent_3",
524 | },
525 | ),
526 | ]
527 |
528 | start_time = time.time()
529 |
530 | try:
531 | # Execute all tasks concurrently
532 | concurrent_results = await asyncio.gather(
533 | *[
534 | task_func(**task_params)
535 | for _, task_func, task_params in concurrent_tasks
536 | ],
537 | return_exceptions=True,
538 | )
539 |
540 | execution_time = (time.time() - start_time) * 1000
541 |
542 | # Process results
543 | for i, (task_name, _, _) in enumerate(concurrent_tasks):
544 | result = TestResult(task_name, "concurrent_execution")
545 |
546 | if i < len(concurrent_results):
547 | response = concurrent_results[i]
548 |
549 | if isinstance(response, Exception):
550 | result.mark_completed(False, error=str(response))
551 | success = False
552 | else:
553 | success = (
554 | isinstance(response, dict)
555 | and response.get("status") != "error"
556 | )
557 | result.mark_completed(success, response)
558 |
559 | result.validation_results = {"concurrent_execution": success}
560 | self.logger.info(f"✓ {task_name}: {'PASS' if success else 'FAIL'}")
561 | else:
562 | result.mark_completed(False, error="No result returned")
563 | self.logger.error(f"✗ {task_name}: No result returned")
564 |
565 | results.append(result)
566 |
567 | # Add overall concurrent test result
568 | concurrent_summary = TestResult(
569 | "concurrent_execution_summary", "performance"
570 | )
571 | concurrent_summary.execution_time_ms = execution_time
572 | concurrent_summary.validation_results = {
573 | "all_completed": len(concurrent_results) == len(concurrent_tasks),
574 | "no_crashes": all(
575 | not isinstance(r, Exception) for r in concurrent_results
576 | ),
577 | "reasonable_time": execution_time < 180000, # 3 minutes
578 | }
579 | concurrent_summary.mark_completed(
580 | all(concurrent_summary.validation_results.values()),
581 | {
582 | "concurrent_tasks": len(concurrent_tasks),
583 | "total_time_ms": execution_time,
584 | },
585 | )
586 | results.append(concurrent_summary)
587 |
588 | self.logger.info(
589 | f"Concurrent execution completed in {execution_time:.0f}ms"
590 | )
591 |
592 | except Exception as e:
593 | error_result = TestResult("concurrent_execution_error", "performance")
594 | error_result.mark_completed(False, error=str(e))
595 | results.append(error_result)
596 | self.logger.error(f"✗ Concurrent execution failed: {e}")
597 |
598 | return results
599 |
600 | def _validate_orchestrated_response(
601 | self, response: dict[str, Any]
602 | ) -> dict[str, bool]:
603 | """Validate orchestrated analysis response format."""
604 | validations = {
605 | "has_status": "status" in response,
606 | "has_agent_type": "agent_type" in response,
607 | "has_persona": "persona" in response,
608 | "has_session_id": "session_id" in response,
609 | "status_is_success": response.get("status") == "success",
610 | "has_routing_strategy": "routing_strategy" in response,
611 | "has_execution_time": "execution_time_ms" in response
612 | and isinstance(response.get("execution_time_ms"), int | float),
613 | }
614 |
615 | # Additional validations for successful responses
616 | if response.get("status") == "success":
617 | validations.update(
618 | {
619 | "has_agents_used": "agents_used" in response,
620 | "agents_used_is_list": isinstance(
621 | response.get("agents_used"), list
622 | ),
623 | "has_synthesis_confidence": "synthesis_confidence" in response,
624 | }
625 | )
626 |
627 | return validations
628 |
629 | def _validate_research_response(self, response: dict[str, Any]) -> dict[str, bool]:
630 | """Validate deep research response format."""
631 | validations = {
632 | "has_status": "status" in response,
633 | "has_agent_type": "agent_type" in response,
634 | "has_persona": "persona" in response,
635 | "has_research_topic": "research_topic" in response,
636 | "status_is_success": response.get("status") == "success",
637 | "has_research_depth": "research_depth" in response,
638 | "has_focus_areas": "focus_areas" in response,
639 | }
640 |
641 | if response.get("status") == "success":
642 | validations.update(
643 | {
644 | "has_sources_analyzed": "sources_analyzed" in response,
645 | "sources_analyzed_is_numeric": isinstance(
646 | response.get("sources_analyzed"), int | float
647 | ),
648 | "has_research_confidence": "research_confidence" in response,
649 | "has_validation_checks": "validation_checks_passed" in response,
650 | }
651 | )
652 |
653 | return validations
654 |
655 | def _validate_comparison_response(
656 | self, response: dict[str, Any]
657 | ) -> dict[str, bool]:
658 | """Validate multi-agent comparison response format."""
659 | validations = {
660 | "has_status": "status" in response,
661 | "has_query": "query" in response,
662 | "has_persona": "persona" in response,
663 | "status_is_success": response.get("status") == "success",
664 | "has_agents_compared": "agents_compared" in response,
665 | }
666 |
667 | if response.get("status") == "success":
668 | validations.update(
669 | {
670 | "agents_compared_is_list": isinstance(
671 | response.get("agents_compared"), list
672 | ),
673 | "has_comparison": "comparison" in response,
674 | "comparison_is_dict": isinstance(response.get("comparison"), dict),
675 | "has_execution_times": "execution_times_ms" in response,
676 | "has_insights": "insights" in response,
677 | }
678 | )
679 |
680 | return validations
681 |
682 | async def run_performance_benchmark(self):
683 | """Run performance benchmarks for all tools."""
684 | self.print_header("Performance Benchmarking")
685 |
686 | benchmark_scenarios = [
687 | (
688 | "orchestrated_fast",
689 | self.orchestrated_analysis,
690 | {
691 | "query": "Quick AAPL analysis",
692 | "persona": "moderate",
693 | "routing_strategy": "rule_based",
694 | "max_agents": 1,
695 | "parallel_execution": False,
696 | },
697 | ),
698 | (
699 | "research_basic",
700 | self.deep_research_financial,
701 | {
702 | "research_topic": "Microsoft",
703 | "research_depth": "basic",
704 | "persona": "moderate",
705 | },
706 | ),
707 | (
708 | "comparison_minimal",
709 | self.compare_multi_agent_analysis,
710 | {
711 | "query": "Compare AAPL vs MSFT",
712 | "agent_types": ["market"],
713 | "persona": "moderate",
714 | },
715 | ),
716 | ]
717 |
718 | performance_results = []
719 |
720 | for test_name, tool_func, params in benchmark_scenarios:
721 | print(f"Benchmarking {test_name}...")
722 |
723 | # Add session ID
724 | params["session_id"] = f"{self.session_id}_benchmark_{test_name}"
725 |
726 | # Run multiple iterations for average performance
727 | times = []
728 |
729 | for i in range(3): # 3 iterations for average
730 | start_time = time.time()
731 |
732 | try:
733 | await tool_func(**params)
734 | end_time = time.time()
735 | execution_time = (end_time - start_time) * 1000
736 | times.append(execution_time)
737 |
738 | except Exception as e:
739 | self.logger.error(
740 | f"Benchmark {test_name} iteration {i + 1} failed: {e}"
741 | )
742 | times.append(float("inf"))
743 |
744 | # Calculate performance metrics
745 | valid_times = [t for t in times if t != float("inf")]
746 | if valid_times:
747 | avg_time = sum(valid_times) / len(valid_times)
748 | min_time = min(valid_times)
749 | max_time = max(valid_times)
750 |
751 | performance_results.append(
752 | {
753 | "test": test_name,
754 | "avg_time_ms": avg_time,
755 | "min_time_ms": min_time,
756 | "max_time_ms": max_time,
757 | "successful_runs": len(valid_times),
758 | }
759 | )
760 |
761 | print(
762 | f" {test_name}: Avg={avg_time:.0f}ms, Min={min_time:.0f}ms, Max={max_time:.0f}ms"
763 | )
764 | else:
765 | print(f" {test_name}: All iterations failed")
766 |
767 | return performance_results
768 |
769 | def generate_test_report(self):
770 | """Generate comprehensive test report."""
771 | self.print_header("Test Results Summary")
772 |
773 | total_tests = len(self.results)
774 | passed_tests = sum(1 for r in self.results if r.success)
775 | failed_tests = total_tests - passed_tests
776 |
777 | total_time = time.time() - self.start_time
778 |
779 | print(f"Total Tests: {total_tests}")
780 | print(f"Passed: {passed_tests} ({passed_tests / total_tests * 100:.1f}%)")
781 | print(f"Failed: {failed_tests} ({failed_tests / total_tests * 100:.1f}%)")
782 | print(f"Total Execution Time: {total_time:.2f}s")
783 |
784 | # Group results by tool
785 | by_tool = {}
786 | for result in self.results:
787 | if result.tool_name not in by_tool:
788 | by_tool[result.tool_name] = []
789 | by_tool[result.tool_name].append(result)
790 |
791 | print("\nResults by Tool:")
792 | for tool_name, tool_results in by_tool.items():
793 | tool_passed = sum(1 for r in tool_results if r.success)
794 | tool_total = len(tool_results)
795 | print(
796 | f" {tool_name}: {tool_passed}/{tool_total} passed "
797 | f"({tool_passed / tool_total * 100:.1f}%)"
798 | )
799 |
800 | # Performance summary
801 | execution_times = [
802 | r.execution_time_ms for r in self.results if r.execution_time_ms
803 | ]
804 | if execution_times:
805 | avg_time = sum(execution_times) / len(execution_times)
806 | print("\nPerformance Summary:")
807 | print(f" Average execution time: {avg_time:.0f}ms")
808 | print(f" Fastest test: {min(execution_times):.0f}ms")
809 | print(f" Slowest test: {max(execution_times):.0f}ms")
810 |
811 | # Failed tests details
812 | failed_results = [r for r in self.results if not r.success]
813 | if failed_results:
814 | print("\nFailed Tests:")
815 | for result in failed_results:
816 | print(f" ✗ {result.test_name}: {result.error}")
817 |
818 | return {
819 | "summary": {
820 | "total_tests": total_tests,
821 | "passed_tests": passed_tests,
822 | "failed_tests": failed_tests,
823 | "pass_rate": passed_tests / total_tests if total_tests > 0 else 0,
824 | "total_execution_time_s": total_time,
825 | },
826 | "by_tool": {
827 | tool: {
828 | "total": len(results),
829 | "passed": sum(1 for r in results if r.success),
830 | "pass_rate": sum(1 for r in results if r.success) / len(results),
831 | }
832 | for tool, results in by_tool.items()
833 | },
834 | "performance": {
835 | "avg_execution_time_ms": avg_time if execution_times else None,
836 | "min_execution_time_ms": min(execution_times)
837 | if execution_times
838 | else None,
839 | "max_execution_time_ms": max(execution_times)
840 | if execution_times
841 | else None,
842 | },
843 | "failed_tests": [r.to_dict() for r in failed_results],
844 | }
845 |
846 | def save_results(self, report: dict[str, Any], performance_data: list[dict]):
847 | """Save detailed results to files."""
848 | if not TEST_CONFIG["save_results"]:
849 | return
850 |
851 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
852 |
853 | # Save detailed results
854 | detailed_results = {
855 | "test_session": self.session_id,
856 | "timestamp": timestamp,
857 | "config": TEST_CONFIG,
858 | "summary": report,
859 | "detailed_results": [r.to_dict() for r in self.results],
860 | "performance_benchmarks": performance_data,
861 | }
862 |
863 | with open(f"integration_test_results_{timestamp}.json", "w") as f:
864 | json.dump(detailed_results, f, indent=2, default=str)
865 |
866 | print(f"\nDetailed results saved to: integration_test_results_{timestamp}.json")
867 |
868 | async def run_all_tests(self):
869 | """Run the complete test suite."""
870 | self.print_header(
871 | f"MaverickMCP Orchestration Integration Test Suite - {self.session_id}"
872 | )
873 |
874 | print("Test Configuration:")
875 | for key, value in TEST_CONFIG.items():
876 | print(f" {key}: {value}")
877 |
878 | try:
879 | # Run all test categories
880 | orchestrated_results = await self.test_orchestrated_analysis()
881 | self.results.extend(orchestrated_results)
882 |
883 | research_results = await self.test_deep_research_financial()
884 | self.results.extend(research_results)
885 |
886 | comparison_results = await self.test_compare_multi_agent_analysis()
887 | self.results.extend(comparison_results)
888 |
889 | error_handling_results = await self.test_error_handling()
890 | self.results.extend(error_handling_results)
891 |
892 | concurrent_results = await self.test_concurrent_execution()
893 | self.results.extend(concurrent_results)
894 |
895 | # Performance benchmarks
896 | performance_data = await self.run_performance_benchmark()
897 |
898 | # Generate and save report
899 | report = self.generate_test_report()
900 | self.save_results(report, performance_data)
901 |
902 | # Final status
903 | total_passed = sum(1 for r in self.results if r.success)
904 | total_tests = len(self.results)
905 |
906 | if total_passed == total_tests:
907 | print(f"\n🎉 ALL TESTS PASSED! ({total_passed}/{total_tests})")
908 | return 0
909 | else:
910 | print(f"\n⚠️ SOME TESTS FAILED ({total_passed}/{total_tests} passed)")
911 | return 1
912 |
913 | except Exception as e:
914 | self.logger.error(f"Test suite execution failed: {e}")
915 | print(f"\n💥 TEST SUITE EXECUTION FAILED: {e}")
916 | return 2
917 |
918 |
919 | async def main():
920 | """Main test execution function."""
921 | # Set environment variables for testing if needed
922 | if not os.getenv("OPENAI_API_KEY"):
923 | print("⚠️ Warning: OPENAI_API_KEY not set - tests will use mock responses")
924 |
925 | test_suite = IntegrationTestSuite()
926 | exit_code = await test_suite.run_all_tests()
927 |
928 | return exit_code
929 |
930 |
931 | if __name__ == "__main__":
932 |     # Allow running this test suite directly as a script
933 | import sys
934 |
935 | try:
936 | exit_code = asyncio.run(main())
937 | sys.exit(exit_code)
938 | except KeyboardInterrupt:
939 | print("\n🛑 Tests interrupted by user")
940 | sys.exit(130) # SIGINT exit code
941 | except Exception as e:
942 | print(f"\n💥 Unexpected error: {e}")
943 | sys.exit(1)
944 |
```
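Because the suite defines its own `main()` entry point, it can be run directly with `python tests/integration/test_orchestration_complete.py`. For quicker iteration, a single test category can also be driven programmatically; the snippet below is a sketch under the assumption that the module is importable from the repository root using the path shown in its header.

```python
# Sketch: run only the orchestrated-analysis scenarios and print a one-line
# summary per test. Assumes the module path above resolves as a package import.
import asyncio

from tests.integration.test_orchestration_complete import IntegrationTestSuite


async def run_orchestrated_only() -> None:
    suite = IntegrationTestSuite()  # sets up logging and imports the MCP tools
    results = await suite.test_orchestrated_analysis()
    for result in results:
        status = "PASS" if result.success else f"FAIL ({result.error})"
        print(f"{result.test_name}: {status} in {result.execution_time_ms:.0f}ms")


if __name__ == "__main__":
    asyncio.run(run_orchestrated_only())
```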