This is page 33 of 47. Use http://codebase.md/doobidoo/mcp-memory-service?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .claude
│ ├── agents
│ │ ├── amp-bridge.md
│ │ ├── amp-pr-automator.md
│ │ ├── code-quality-guard.md
│ │ ├── gemini-pr-automator.md
│ │ └── github-release-manager.md
│ ├── settings.local.json.backup
│ └── settings.local.json.local
├── .commit-message
├── .dockerignore
├── .env.example
├── .env.sqlite.backup
├── .envnn#
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── ISSUE_TEMPLATE
│ │ ├── bug_report.yml
│ │ ├── config.yml
│ │ ├── feature_request.yml
│ │ └── performance_issue.yml
│ ├── pull_request_template.md
│ └── workflows
│ ├── bridge-tests.yml
│ ├── CACHE_FIX.md
│ ├── claude-code-review.yml
│ ├── claude.yml
│ ├── cleanup-images.yml.disabled
│ ├── dev-setup-validation.yml
│ ├── docker-publish.yml
│ ├── LATEST_FIXES.md
│ ├── main-optimized.yml.disabled
│ ├── main.yml
│ ├── publish-and-test.yml
│ ├── README_OPTIMIZATION.md
│ ├── release-tag.yml.disabled
│ ├── release.yml
│ ├── roadmap-review-reminder.yml
│ ├── SECRET_CONDITIONAL_FIX.md
│ └── WORKFLOW_FIXES.md
├── .gitignore
├── .mcp.json.backup
├── .mcp.json.template
├── .pyscn
│ ├── .gitignore
│ └── reports
│ └── analyze_20251123_214224.html
├── AGENTS.md
├── archive
│ ├── deployment
│ │ ├── deploy_fastmcp_fixed.sh
│ │ ├── deploy_http_with_mcp.sh
│ │ └── deploy_mcp_v4.sh
│ ├── deployment-configs
│ │ ├── empty_config.yml
│ │ └── smithery.yaml
│ ├── development
│ │ └── test_fastmcp.py
│ ├── docs-removed-2025-08-23
│ │ ├── authentication.md
│ │ ├── claude_integration.md
│ │ ├── claude-code-compatibility.md
│ │ ├── claude-code-integration.md
│ │ ├── claude-code-quickstart.md
│ │ ├── claude-desktop-setup.md
│ │ ├── complete-setup-guide.md
│ │ ├── database-synchronization.md
│ │ ├── development
│ │ │ ├── autonomous-memory-consolidation.md
│ │ │ ├── CLEANUP_PLAN.md
│ │ │ ├── CLEANUP_README.md
│ │ │ ├── CLEANUP_SUMMARY.md
│ │ │ ├── dream-inspired-memory-consolidation.md
│ │ │ ├── hybrid-slm-memory-consolidation.md
│ │ │ ├── mcp-milestone.md
│ │ │ ├── multi-client-architecture.md
│ │ │ ├── test-results.md
│ │ │ └── TIMESTAMP_FIX_SUMMARY.md
│ │ ├── distributed-sync.md
│ │ ├── invocation_guide.md
│ │ ├── macos-intel.md
│ │ ├── master-guide.md
│ │ ├── mcp-client-configuration.md
│ │ ├── multi-client-server.md
│ │ ├── service-installation.md
│ │ ├── sessions
│ │ │ └── MCP_ENHANCEMENT_SESSION_MEMORY_v4.1.0.md
│ │ ├── UBUNTU_SETUP.md
│ │ ├── ubuntu.md
│ │ ├── windows-setup.md
│ │ └── windows.md
│ ├── docs-root-cleanup-2025-08-23
│ │ ├── AWESOME_LIST_SUBMISSION.md
│ │ ├── CLOUDFLARE_IMPLEMENTATION.md
│ │ ├── DOCUMENTATION_ANALYSIS.md
│ │ ├── DOCUMENTATION_CLEANUP_PLAN.md
│ │ ├── DOCUMENTATION_CONSOLIDATION_COMPLETE.md
│ │ ├── LITESTREAM_SETUP_GUIDE.md
│ │ ├── lm_studio_system_prompt.md
│ │ ├── PYTORCH_DOWNLOAD_FIX.md
│ │ └── README-ORIGINAL-BACKUP.md
│ ├── investigations
│ │ └── MACOS_HOOKS_INVESTIGATION.md
│ ├── litestream-configs-v6.3.0
│ │ ├── install_service.sh
│ │ ├── litestream_master_config_fixed.yml
│ │ ├── litestream_master_config.yml
│ │ ├── litestream_replica_config_fixed.yml
│ │ ├── litestream_replica_config.yml
│ │ ├── litestream_replica_simple.yml
│ │ ├── litestream-http.service
│ │ ├── litestream.service
│ │ └── requirements-cloudflare.txt
│ ├── release-notes
│ │ └── release-notes-v7.1.4.md
│ └── setup-development
│ ├── README.md
│ ├── setup_consolidation_mdns.sh
│ ├── STARTUP_SETUP_GUIDE.md
│ └── test_service.sh
├── CHANGELOG-HISTORIC.md
├── CHANGELOG.md
├── claude_commands
│ ├── memory-context.md
│ ├── memory-health.md
│ ├── memory-ingest-dir.md
│ ├── memory-ingest.md
│ ├── memory-recall.md
│ ├── memory-search.md
│ ├── memory-store.md
│ ├── README.md
│ └── session-start.md
├── claude-hooks
│ ├── config.json
│ ├── config.template.json
│ ├── CONFIGURATION.md
│ ├── core
│ │ ├── memory-retrieval.js
│ │ ├── mid-conversation.js
│ │ ├── session-end.js
│ │ ├── session-start.js
│ │ └── topic-change.js
│ ├── debug-pattern-test.js
│ ├── install_claude_hooks_windows.ps1
│ ├── install_hooks.py
│ ├── memory-mode-controller.js
│ ├── MIGRATION.md
│ ├── README-NATURAL-TRIGGERS.md
│ ├── README-phase2.md
│ ├── README.md
│ ├── simple-test.js
│ ├── statusline.sh
│ ├── test-adaptive-weights.js
│ ├── test-dual-protocol-hook.js
│ ├── test-mcp-hook.js
│ ├── test-natural-triggers.js
│ ├── test-recency-scoring.js
│ ├── tests
│ │ ├── integration-test.js
│ │ ├── phase2-integration-test.js
│ │ ├── test-code-execution.js
│ │ ├── test-cross-session.json
│ │ ├── test-session-tracking.json
│ │ └── test-threading.json
│ ├── utilities
│ │ ├── adaptive-pattern-detector.js
│ │ ├── context-formatter.js
│ │ ├── context-shift-detector.js
│ │ ├── conversation-analyzer.js
│ │ ├── dynamic-context-updater.js
│ │ ├── git-analyzer.js
│ │ ├── mcp-client.js
│ │ ├── memory-client.js
│ │ ├── memory-scorer.js
│ │ ├── performance-manager.js
│ │ ├── project-detector.js
│ │ ├── session-tracker.js
│ │ ├── tiered-conversation-monitor.js
│ │ └── version-checker.js
│ └── WINDOWS-SESSIONSTART-BUG.md
├── CLAUDE.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Development-Sprint-November-2025.md
├── docs
│ ├── amp-cli-bridge.md
│ ├── api
│ │ ├── code-execution-interface.md
│ │ ├── memory-metadata-api.md
│ │ ├── PHASE1_IMPLEMENTATION_SUMMARY.md
│ │ ├── PHASE2_IMPLEMENTATION_SUMMARY.md
│ │ ├── PHASE2_REPORT.md
│ │ └── tag-standardization.md
│ ├── architecture
│ │ ├── search-enhancement-spec.md
│ │ └── search-examples.md
│ ├── architecture.md
│ ├── archive
│ │ └── obsolete-workflows
│ │ ├── load_memory_context.md
│ │ └── README.md
│ ├── assets
│ │ └── images
│ │ ├── dashboard-v3.3.0-preview.png
│ │ ├── memory-awareness-hooks-example.png
│ │ ├── project-infographic.svg
│ │ └── README.md
│ ├── CLAUDE_CODE_QUICK_REFERENCE.md
│ ├── cloudflare-setup.md
│ ├── deployment
│ │ ├── docker.md
│ │ ├── dual-service.md
│ │ ├── production-guide.md
│ │ └── systemd-service.md
│ ├── development
│ │ ├── ai-agent-instructions.md
│ │ ├── code-quality
│ │ │ ├── phase-2a-completion.md
│ │ │ ├── phase-2a-handle-get-prompt.md
│ │ │ ├── phase-2a-index.md
│ │ │ ├── phase-2a-install-package.md
│ │ │ └── phase-2b-session-summary.md
│ │ ├── code-quality-workflow.md
│ │ ├── dashboard-workflow.md
│ │ ├── issue-management.md
│ │ ├── pr-review-guide.md
│ │ ├── refactoring-notes.md
│ │ ├── release-checklist.md
│ │ └── todo-tracker.md
│ ├── docker-optimized-build.md
│ ├── document-ingestion.md
│ ├── DOCUMENTATION_AUDIT.md
│ ├── enhancement-roadmap-issue-14.md
│ ├── examples
│ │ ├── analysis-scripts.js
│ │ ├── maintenance-session-example.md
│ │ ├── memory-distribution-chart.jsx
│ │ └── tag-schema.json
│ ├── first-time-setup.md
│ ├── glama-deployment.md
│ ├── guides
│ │ ├── advanced-command-examples.md
│ │ ├── chromadb-migration.md
│ │ ├── commands-vs-mcp-server.md
│ │ ├── mcp-enhancements.md
│ │ ├── mdns-service-discovery.md
│ │ ├── memory-consolidation-guide.md
│ │ ├── migration.md
│ │ ├── scripts.md
│ │ └── STORAGE_BACKENDS.md
│ ├── HOOK_IMPROVEMENTS.md
│ ├── hooks
│ │ └── phase2-code-execution-migration.md
│ ├── http-server-management.md
│ ├── ide-compatability.md
│ ├── IMAGE_RETENTION_POLICY.md
│ ├── images
│ │ └── dashboard-placeholder.md
│ ├── implementation
│ │ ├── health_checks.md
│ │ └── performance.md
│ ├── IMPLEMENTATION_PLAN_HTTP_SSE.md
│ ├── integration
│ │ ├── homebrew.md
│ │ └── multi-client.md
│ ├── integrations
│ │ ├── gemini.md
│ │ ├── groq-bridge.md
│ │ ├── groq-integration-summary.md
│ │ └── groq-model-comparison.md
│ ├── integrations.md
│ ├── legacy
│ │ └── dual-protocol-hooks.md
│ ├── LM_STUDIO_COMPATIBILITY.md
│ ├── maintenance
│ │ └── memory-maintenance.md
│ ├── mastery
│ │ ├── api-reference.md
│ │ ├── architecture-overview.md
│ │ ├── configuration-guide.md
│ │ ├── local-setup-and-run.md
│ │ ├── testing-guide.md
│ │ └── troubleshooting.md
│ ├── migration
│ │ └── code-execution-api-quick-start.md
│ ├── natural-memory-triggers
│ │ ├── cli-reference.md
│ │ ├── installation-guide.md
│ │ └── performance-optimization.md
│ ├── oauth-setup.md
│ ├── pr-graphql-integration.md
│ ├── quick-setup-cloudflare-dual-environment.md
│ ├── README.md
│ ├── remote-configuration-wiki-section.md
│ ├── research
│ │ ├── code-execution-interface-implementation.md
│ │ └── code-execution-interface-summary.md
│ ├── ROADMAP.md
│ ├── sqlite-vec-backend.md
│ ├── statistics
│ │ ├── charts
│ │ │ ├── activity_patterns.png
│ │ │ ├── contributors.png
│ │ │ ├── growth_trajectory.png
│ │ │ ├── monthly_activity.png
│ │ │ └── october_sprint.png
│ │ ├── data
│ │ │ ├── activity_by_day.csv
│ │ │ ├── activity_by_hour.csv
│ │ │ ├── contributors.csv
│ │ │ └── monthly_activity.csv
│ │ ├── generate_charts.py
│ │ └── REPOSITORY_STATISTICS.md
│ ├── technical
│ │ ├── development.md
│ │ ├── memory-migration.md
│ │ ├── migration-log.md
│ │ ├── sqlite-vec-embedding-fixes.md
│ │ └── tag-storage.md
│ ├── testing
│ │ └── regression-tests.md
│ ├── testing-cloudflare-backend.md
│ ├── troubleshooting
│ │ ├── cloudflare-api-token-setup.md
│ │ ├── cloudflare-authentication.md
│ │ ├── general.md
│ │ ├── hooks-quick-reference.md
│ │ ├── pr162-schema-caching-issue.md
│ │ ├── session-end-hooks.md
│ │ └── sync-issues.md
│ └── tutorials
│ ├── advanced-techniques.md
│ ├── data-analysis.md
│ └── demo-session-walkthrough.md
├── examples
│ ├── claude_desktop_config_template.json
│ ├── claude_desktop_config_windows.json
│ ├── claude-desktop-http-config.json
│ ├── config
│ │ └── claude_desktop_config.json
│ ├── http-mcp-bridge.js
│ ├── memory_export_template.json
│ ├── README.md
│ ├── setup
│ │ └── setup_multi_client_complete.py
│ └── start_https_example.sh
├── install_service.py
├── install.py
├── LICENSE
├── NOTICE
├── pyproject.toml
├── pytest.ini
├── README.md
├── run_server.py
├── scripts
│ ├── .claude
│ │ └── settings.local.json
│ ├── archive
│ │ └── check_missing_timestamps.py
│ ├── backup
│ │ ├── backup_memories.py
│ │ ├── backup_sqlite_vec.sh
│ │ ├── export_distributable_memories.sh
│ │ └── restore_memories.py
│ ├── benchmarks
│ │ ├── benchmark_code_execution_api.py
│ │ ├── benchmark_hybrid_sync.py
│ │ └── benchmark_server_caching.py
│ ├── database
│ │ ├── analyze_sqlite_vec_db.py
│ │ ├── check_sqlite_vec_status.py
│ │ ├── db_health_check.py
│ │ └── simple_timestamp_check.py
│ ├── development
│ │ ├── debug_server_initialization.py
│ │ ├── find_orphaned_files.py
│ │ ├── fix_mdns.sh
│ │ ├── fix_sitecustomize.py
│ │ ├── remote_ingest.sh
│ │ ├── setup-git-merge-drivers.sh
│ │ ├── uv-lock-merge.sh
│ │ └── verify_hybrid_sync.py
│ ├── hooks
│ │ └── pre-commit
│ ├── installation
│ │ ├── install_linux_service.py
│ │ ├── install_macos_service.py
│ │ ├── install_uv.py
│ │ ├── install_windows_service.py
│ │ ├── install.py
│ │ ├── setup_backup_cron.sh
│ │ ├── setup_claude_mcp.sh
│ │ └── setup_cloudflare_resources.py
│ ├── linux
│ │ ├── service_status.sh
│ │ ├── start_service.sh
│ │ ├── stop_service.sh
│ │ ├── uninstall_service.sh
│ │ └── view_logs.sh
│ ├── maintenance
│ │ ├── assign_memory_types.py
│ │ ├── check_memory_types.py
│ │ ├── cleanup_corrupted_encoding.py
│ │ ├── cleanup_memories.py
│ │ ├── cleanup_organize.py
│ │ ├── consolidate_memory_types.py
│ │ ├── consolidation_mappings.json
│ │ ├── delete_orphaned_vectors_fixed.py
│ │ ├── fast_cleanup_duplicates_with_tracking.sh
│ │ ├── find_all_duplicates.py
│ │ ├── find_cloudflare_duplicates.py
│ │ ├── find_duplicates.py
│ │ ├── memory-types.md
│ │ ├── README.md
│ │ ├── recover_timestamps_from_cloudflare.py
│ │ ├── regenerate_embeddings.py
│ │ ├── repair_malformed_tags.py
│ │ ├── repair_memories.py
│ │ ├── repair_sqlite_vec_embeddings.py
│ │ ├── repair_zero_embeddings.py
│ │ ├── restore_from_json_export.py
│ │ └── scan_todos.sh
│ ├── migration
│ │ ├── cleanup_mcp_timestamps.py
│ │ ├── legacy
│ │ │ └── migrate_chroma_to_sqlite.py
│ │ ├── mcp-migration.py
│ │ ├── migrate_sqlite_vec_embeddings.py
│ │ ├── migrate_storage.py
│ │ ├── migrate_tags.py
│ │ ├── migrate_timestamps.py
│ │ ├── migrate_to_cloudflare.py
│ │ ├── migrate_to_sqlite_vec.py
│ │ ├── migrate_v5_enhanced.py
│ │ ├── TIMESTAMP_CLEANUP_README.md
│ │ └── verify_mcp_timestamps.py
│ ├── pr
│ │ ├── amp_collect_results.sh
│ │ ├── amp_detect_breaking_changes.sh
│ │ ├── amp_generate_tests.sh
│ │ ├── amp_pr_review.sh
│ │ ├── amp_quality_gate.sh
│ │ ├── amp_suggest_fixes.sh
│ │ ├── auto_review.sh
│ │ ├── detect_breaking_changes.sh
│ │ ├── generate_tests.sh
│ │ ├── lib
│ │ │ └── graphql_helpers.sh
│ │ ├── quality_gate.sh
│ │ ├── resolve_threads.sh
│ │ ├── run_pyscn_analysis.sh
│ │ ├── run_quality_checks.sh
│ │ ├── thread_status.sh
│ │ └── watch_reviews.sh
│ ├── quality
│ │ ├── fix_dead_code_install.sh
│ │ ├── phase1_dead_code_analysis.md
│ │ ├── phase2_complexity_analysis.md
│ │ ├── README_PHASE1.md
│ │ ├── README_PHASE2.md
│ │ ├── track_pyscn_metrics.sh
│ │ └── weekly_quality_review.sh
│ ├── README.md
│ ├── run
│ │ ├── run_mcp_memory.sh
│ │ ├── run-with-uv.sh
│ │ └── start_sqlite_vec.sh
│ ├── run_memory_server.py
│ ├── server
│ │ ├── check_http_server.py
│ │ ├── check_server_health.py
│ │ ├── memory_offline.py
│ │ ├── preload_models.py
│ │ ├── run_http_server.py
│ │ ├── run_memory_server.py
│ │ ├── start_http_server.bat
│ │ └── start_http_server.sh
│ ├── service
│ │ ├── deploy_dual_services.sh
│ │ ├── install_http_service.sh
│ │ ├── mcp-memory-http.service
│ │ ├── mcp-memory.service
│ │ ├── memory_service_manager.sh
│ │ ├── service_control.sh
│ │ ├── service_utils.py
│ │ └── update_service.sh
│ ├── sync
│ │ ├── check_drift.py
│ │ ├── claude_sync_commands.py
│ │ ├── export_memories.py
│ │ ├── import_memories.py
│ │ ├── litestream
│ │ │ ├── apply_local_changes.sh
│ │ │ ├── enhanced_memory_store.sh
│ │ │ ├── init_staging_db.sh
│ │ │ ├── io.litestream.replication.plist
│ │ │ ├── manual_sync.sh
│ │ │ ├── memory_sync.sh
│ │ │ ├── pull_remote_changes.sh
│ │ │ ├── push_to_remote.sh
│ │ │ ├── README.md
│ │ │ ├── resolve_conflicts.sh
│ │ │ ├── setup_local_litestream.sh
│ │ │ ├── setup_remote_litestream.sh
│ │ │ ├── staging_db_init.sql
│ │ │ ├── stash_local_changes.sh
│ │ │ ├── sync_from_remote_noconfig.sh
│ │ │ └── sync_from_remote.sh
│ │ ├── README.md
│ │ ├── safe_cloudflare_update.sh
│ │ ├── sync_memory_backends.py
│ │ └── sync_now.py
│ ├── testing
│ │ ├── run_complete_test.py
│ │ ├── run_memory_test.sh
│ │ ├── simple_test.py
│ │ ├── test_cleanup_logic.py
│ │ ├── test_cloudflare_backend.py
│ │ ├── test_docker_functionality.py
│ │ ├── test_installation.py
│ │ ├── test_mdns.py
│ │ ├── test_memory_api.py
│ │ ├── test_memory_simple.py
│ │ ├── test_migration.py
│ │ ├── test_search_api.py
│ │ ├── test_sqlite_vec_embeddings.py
│ │ ├── test_sse_events.py
│ │ ├── test-connection.py
│ │ └── test-hook.js
│ ├── utils
│ │ ├── claude_commands_utils.py
│ │ ├── generate_personalized_claude_md.sh
│ │ ├── groq
│ │ ├── groq_agent_bridge.py
│ │ ├── list-collections.py
│ │ ├── memory_wrapper_uv.py
│ │ ├── query_memories.py
│ │ ├── smithery_wrapper.py
│ │ ├── test_groq_bridge.sh
│ │ └── uv_wrapper.py
│ └── validation
│ ├── check_dev_setup.py
│ ├── check_documentation_links.py
│ ├── diagnose_backend_config.py
│ ├── validate_configuration_complete.py
│ ├── validate_memories.py
│ ├── validate_migration.py
│ ├── validate_timestamp_integrity.py
│ ├── verify_environment.py
│ ├── verify_pytorch_windows.py
│ └── verify_torch.py
├── SECURITY.md
├── selective_timestamp_recovery.py
├── SPONSORS.md
├── src
│ └── mcp_memory_service
│ ├── __init__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── client.py
│ │ ├── operations.py
│ │ ├── sync_wrapper.py
│ │ └── types.py
│ ├── backup
│ │ ├── __init__.py
│ │ └── scheduler.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── ingestion.py
│ │ ├── main.py
│ │ └── utils.py
│ ├── config.py
│ ├── consolidation
│ │ ├── __init__.py
│ │ ├── associations.py
│ │ ├── base.py
│ │ ├── clustering.py
│ │ ├── compression.py
│ │ ├── consolidator.py
│ │ ├── decay.py
│ │ ├── forgetting.py
│ │ ├── health.py
│ │ └── scheduler.py
│ ├── dependency_check.py
│ ├── discovery
│ │ ├── __init__.py
│ │ ├── client.py
│ │ └── mdns_service.py
│ ├── embeddings
│ │ ├── __init__.py
│ │ └── onnx_embeddings.py
│ ├── ingestion
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── chunker.py
│ │ ├── csv_loader.py
│ │ ├── json_loader.py
│ │ ├── pdf_loader.py
│ │ ├── registry.py
│ │ ├── semtools_loader.py
│ │ └── text_loader.py
│ ├── lm_studio_compat.py
│ ├── mcp_server.py
│ ├── models
│ │ ├── __init__.py
│ │ └── memory.py
│ ├── server.py
│ ├── services
│ │ ├── __init__.py
│ │ └── memory_service.py
│ ├── storage
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── cloudflare.py
│ │ ├── factory.py
│ │ ├── http_client.py
│ │ ├── hybrid.py
│ │ └── sqlite_vec.py
│ ├── sync
│ │ ├── __init__.py
│ │ ├── exporter.py
│ │ ├── importer.py
│ │ └── litestream_config.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── cache_manager.py
│ │ ├── content_splitter.py
│ │ ├── db_utils.py
│ │ ├── debug.py
│ │ ├── document_processing.py
│ │ ├── gpu_detection.py
│ │ ├── hashing.py
│ │ ├── http_server_manager.py
│ │ ├── port_detection.py
│ │ ├── system_detection.py
│ │ └── time_parser.py
│ └── web
│ ├── __init__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── analytics.py
│ │ ├── backup.py
│ │ ├── consolidation.py
│ │ ├── documents.py
│ │ ├── events.py
│ │ ├── health.py
│ │ ├── manage.py
│ │ ├── mcp.py
│ │ ├── memories.py
│ │ ├── search.py
│ │ └── sync.py
│ ├── app.py
│ ├── dependencies.py
│ ├── oauth
│ │ ├── __init__.py
│ │ ├── authorization.py
│ │ ├── discovery.py
│ │ ├── middleware.py
│ │ ├── models.py
│ │ ├── registration.py
│ │ └── storage.py
│ ├── sse.py
│ └── static
│ ├── app.js
│ ├── index.html
│ ├── README.md
│ ├── sse_test.html
│ └── style.css
├── start_http_debug.bat
├── start_http_server.sh
├── test_document.txt
├── test_version_checker.js
├── tests
│ ├── __init__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── test_compact_types.py
│ │ └── test_operations.py
│ ├── bridge
│ │ ├── mock_responses.js
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ └── test_http_mcp_bridge.js
│ ├── conftest.py
│ ├── consolidation
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── test_associations.py
│ │ ├── test_clustering.py
│ │ ├── test_compression.py
│ │ ├── test_consolidator.py
│ │ ├── test_decay.py
│ │ └── test_forgetting.py
│ ├── contracts
│ │ └── api-specification.yml
│ ├── integration
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── test_api_key_fallback.py
│ │ ├── test_api_memories_chronological.py
│ │ ├── test_api_tag_time_search.py
│ │ ├── test_api_with_memory_service.py
│ │ ├── test_bridge_integration.js
│ │ ├── test_cli_interfaces.py
│ │ ├── test_cloudflare_connection.py
│ │ ├── test_concurrent_clients.py
│ │ ├── test_data_serialization_consistency.py
│ │ ├── test_http_server_startup.py
│ │ ├── test_mcp_memory.py
│ │ ├── test_mdns_integration.py
│ │ ├── test_oauth_basic_auth.py
│ │ ├── test_oauth_flow.py
│ │ ├── test_server_handlers.py
│ │ └── test_store_memory.py
│ ├── performance
│ │ ├── test_background_sync.py
│ │ └── test_hybrid_live.py
│ ├── README.md
│ ├── smithery
│ │ └── test_smithery.py
│ ├── sqlite
│ │ └── simple_sqlite_vec_test.py
│ ├── test_client.py
│ ├── test_content_splitting.py
│ ├── test_database.py
│ ├── test_hybrid_cloudflare_limits.py
│ ├── test_hybrid_storage.py
│ ├── test_memory_ops.py
│ ├── test_semantic_search.py
│ ├── test_sqlite_vec_storage.py
│ ├── test_time_parser.py
│ ├── test_timestamp_preservation.py
│ ├── timestamp
│ │ ├── test_hook_vs_manual_storage.py
│ │ ├── test_issue99_final_validation.py
│ │ ├── test_search_retrieval_inconsistency.py
│ │ ├── test_timestamp_issue.py
│ │ └── test_timestamp_simple.py
│ └── unit
│ ├── conftest.py
│ ├── test_cloudflare_storage.py
│ ├── test_csv_loader.py
│ ├── test_fastapi_dependencies.py
│ ├── test_import.py
│ ├── test_json_loader.py
│ ├── test_mdns_simple.py
│ ├── test_mdns.py
│ ├── test_memory_service.py
│ ├── test_memory.py
│ ├── test_semtools_loader.py
│ ├── test_storage_interface_compatibility.py
│ └── test_tag_time_filtering.py
├── tools
│ ├── docker
│ │ ├── DEPRECATED.md
│ │ ├── docker-compose.http.yml
│ │ ├── docker-compose.pythonpath.yml
│ │ ├── docker-compose.standalone.yml
│ │ ├── docker-compose.uv.yml
│ │ ├── docker-compose.yml
│ │ ├── docker-entrypoint-persistent.sh
│ │ ├── docker-entrypoint-unified.sh
│ │ ├── docker-entrypoint.sh
│ │ ├── Dockerfile
│ │ ├── Dockerfile.glama
│ │ ├── Dockerfile.slim
│ │ ├── README.md
│ │ └── test-docker-modes.sh
│ └── README.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/src/mcp_memory_service/consolidation/consolidator.py:
--------------------------------------------------------------------------------
```python
1 | # Copyright 2024 Heinrich Krupp
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Main dream-inspired consolidation orchestrator."""
16 |
17 | import asyncio
18 | from typing import List, Dict, Any, Optional, Protocol, Tuple
19 | from datetime import datetime, timedelta, timezone
20 | import logging
21 | import time
22 |
23 | from .base import ConsolidationConfig, ConsolidationReport, ConsolidationError
24 | from .decay import ExponentialDecayCalculator
25 | from .associations import CreativeAssociationEngine
26 | from .clustering import SemanticClusteringEngine
27 | from .compression import SemanticCompressionEngine
28 | from .forgetting import ControlledForgettingEngine
29 | from .health import ConsolidationHealthMonitor
30 | from ..models.memory import Memory
31 |
32 | # Protocol for storage backend interface
33 | class StorageProtocol(Protocol):
34 | async def get_all_memories(self) -> List[Memory]: ...
35 | async def get_memories_by_time_range(self, start_time: float, end_time: float) -> List[Memory]: ...
36 | async def store(self, memory: Memory) -> Tuple[bool, str]: ...
37 | async def update_memory(self, memory: Memory) -> bool: ...
38 | async def delete_memory(self, content_hash: str) -> bool: ...
39 | async def get_memory_connections(self) -> Dict[str, int]: ...
40 | async def get_access_patterns(self) -> Dict[str, datetime]: ...
41 |
42 |
43 | class SyncPauseContext:
44 | """Context manager for pausing/resuming hybrid backend sync."""
45 |
46 | def __init__(self, storage, logger):
47 | self.storage = storage
48 | self.logger = logger
49 | self.is_hybrid = hasattr(storage, 'pause_sync') and hasattr(storage, 'resume_sync')
50 | self.sync_paused = False
51 |
52 | async def __aenter__(self):
53 | if self.is_hybrid:
54 | self.logger.info("Pausing hybrid backend sync during consolidation")
55 | await self.storage.pause_sync()
56 | self.sync_paused = True
57 | return self
58 |
59 | async def __aexit__(self, exc_type, exc_val, exc_tb):
60 | if self.sync_paused:
61 | try:
62 | self.logger.info("Resuming hybrid backend sync after consolidation")
63 | await self.storage.resume_sync()
64 | except Exception as e:
65 | self.logger.error(f"Failed to resume sync: {e}", exc_info=True)
66 |
67 |
68 | def check_horizon_requirements(time_horizon: str, phase_name: str,
69 | enabled_phases: Dict[str, List[str]]) -> bool:
70 | """Check if a consolidation phase should run for the given horizon.
71 |
72 | Args:
73 | time_horizon: Current time horizon (daily, weekly, etc.)
74 | phase_name: Phase identifier (clustering, associations, etc.)
75 | enabled_phases: Dict mapping phase names to applicable horizons
76 |
77 | Returns:
78 | bool: True if phase should run
79 | """
80 | applicable_horizons = enabled_phases.get(phase_name, [])
81 | return time_horizon in applicable_horizons
82 |
83 |
84 | # Horizon configuration
85 | HORIZON_CONFIGS = {
86 | 'daily': {'delta': timedelta(days=1), 'cutoff_days': 2},
87 | 'weekly': {'delta': timedelta(days=7), 'cutoff_days': None},
88 | 'monthly': {'delta': timedelta(days=30), 'cutoff_days': None},
89 | 'quarterly': {'delta': timedelta(days=90), 'cutoff_days': 90},
90 | 'yearly': {'delta': timedelta(days=365), 'cutoff_days': 365}
91 | }
92 |
93 |
94 | def filter_memories_by_age(memories: List[Memory], cutoff_date: datetime) -> List[Memory]:
95 | """Filter memories created before the cutoff date.
96 |
97 | Args:
98 | memories: List of Memory objects
99 | cutoff_date: Only keep memories older than this
100 |
101 | Returns:
102 | Filtered list of memories
103 | """
104 | return [
105 | m for m in memories
106 |         if m.created_at and datetime.fromtimestamp(m.created_at, tz=timezone.utc) < cutoff_date
107 | ]
108 |
109 | class DreamInspiredConsolidator:
110 | """
111 | Main consolidation engine with biologically-inspired processing.
112 |
113 | Orchestrates the full consolidation pipeline including:
114 | - Exponential decay scoring
115 | - Creative association discovery
116 | - Semantic clustering and compression
117 | - Controlled forgetting with archival
118 | """
119 |
120 | # Phase enablement configuration
121 | ENABLED_PHASES = {
122 | 'clustering': ['weekly', 'monthly', 'quarterly'],
123 | 'associations': ['weekly', 'monthly'],
124 | 'compression': ['weekly', 'monthly', 'quarterly'],
125 | 'forgetting': ['monthly', 'quarterly', 'yearly']
126 | }
127 |
128 | def __init__(self, storage: StorageProtocol, config: ConsolidationConfig):
129 | self.storage = storage
130 | self.config = config
131 | self.logger = logging.getLogger(__name__)
132 |
133 | # Initialize component engines
134 | self.decay_calculator = ExponentialDecayCalculator(config)
135 | self.association_engine = CreativeAssociationEngine(config)
136 | self.clustering_engine = SemanticClusteringEngine(config)
137 | self.compression_engine = SemanticCompressionEngine(config)
138 | self.forgetting_engine = ControlledForgettingEngine(config)
139 |
140 | # Initialize health monitoring
141 | self.health_monitor = ConsolidationHealthMonitor(config)
142 |
143 | # Performance tracking
144 | self.last_consolidation_times = {}
145 | self.consolidation_stats = {
146 | 'total_runs': 0,
147 | 'successful_runs': 0,
148 | 'total_memories_processed': 0,
149 | 'total_associations_created': 0,
150 | 'total_clusters_created': 0,
151 | 'total_memories_compressed': 0,
152 | 'total_memories_archived': 0
153 | }
154 |
155 | async def consolidate(self, time_horizon: str, **kwargs) -> ConsolidationReport:
156 | """
157 | Run full consolidation pipeline for given time horizon.
158 |
159 | Args:
160 | time_horizon: 'daily', 'weekly', 'monthly', 'quarterly', 'yearly'
161 | **kwargs: Additional parameters for consolidation
162 |
163 | Returns:
164 | ConsolidationReport with results and performance metrics
165 | """
166 | start_time = datetime.now()
167 | report = ConsolidationReport(
168 | time_horizon=time_horizon,
169 | start_time=start_time,
170 | end_time=start_time, # Will be updated at the end
171 | memories_processed=0
172 | )
173 |
174 | try:
175 | self.logger.info(f"Starting {time_horizon} consolidation - this may take several minutes depending on memory count...")
176 |
177 | # Use context manager for sync pause/resume
178 | async with SyncPauseContext(self.storage, self.logger):
179 | # 1. Retrieve memories for processing
180 | memories = await self._get_memories_for_horizon(time_horizon, **kwargs)
181 | report.memories_processed = len(memories)
182 |
183 | if not memories:
184 | self.logger.info(f"No memories to process for {time_horizon} consolidation")
185 | return self._finalize_report(report, [])
186 |
187 | self.logger.info(f"✓ Found {len(memories)} memories to process")
188 |
189 | # 2. Calculate/update relevance scores
190 | self.logger.info(f"📊 Phase 1/6: Calculating relevance scores for {len(memories)} memories...")
191 | performance_start = time.time()
192 | relevance_scores = await self._update_relevance_scores(memories, time_horizon)
193 | self.logger.info(f"✓ Relevance scoring completed in {time.time() - performance_start:.1f}s")
194 |
195 | # 3. Cluster by semantic similarity (if enabled and appropriate)
196 | clusters = []
197 | if self.config.clustering_enabled and check_horizon_requirements(
198 | time_horizon, 'clustering', self.ENABLED_PHASES
199 | ):
200 | self.logger.info(f"🔗 Phase 2/6: Clustering memories by semantic similarity...")
201 | performance_start = time.time()
202 | clusters = await self.clustering_engine.process(memories)
203 | report.clusters_created = len(clusters)
204 | self.logger.info(f"✓ Clustering completed in {time.time() - performance_start:.1f}s, created {len(clusters)} clusters")
205 |
206 | # 4. Run creative associations (if enabled and appropriate)
207 | associations = []
208 | if self.config.associations_enabled and check_horizon_requirements(
209 | time_horizon, 'associations', self.ENABLED_PHASES
210 | ):
211 | self.logger.info(f"🧠 Phase 3/6: Discovering creative associations...")
212 | performance_start = time.time()
213 | existing_associations = await self._get_existing_associations()
214 | associations = await self.association_engine.process(
215 | memories, existing_associations=existing_associations
216 | )
217 | report.associations_discovered = len(associations)
218 | self.logger.info(f"✓ Association discovery completed in {time.time() - performance_start:.1f}s, found {len(associations)} associations")
219 |
220 | # Store new associations as memories
221 | await self._store_associations_as_memories(associations)
222 |
223 | # 5. Compress clusters (if enabled and clusters exist)
224 | compression_results = []
225 | if self.config.compression_enabled and clusters and check_horizon_requirements(
226 | time_horizon, 'compression', self.ENABLED_PHASES
227 | ):
228 | self.logger.info(f"🗜️ Phase 4/6: Compressing memory clusters...")
229 | performance_start = time.time()
230 | compression_results = await self.compression_engine.process(clusters, memories)
231 | report.memories_compressed = len(compression_results)
232 | self.logger.info(f"✓ Compression completed in {time.time() - performance_start:.1f}s, compressed {len(compression_results)} clusters")
233 |
234 | # Store compressed memories and update originals
235 | await self._handle_compression_results(compression_results)
236 |
237 | # 6. Controlled forgetting (if enabled and appropriate)
238 | forgetting_results = []
239 | if self.config.forgetting_enabled and check_horizon_requirements(
240 | time_horizon, 'forgetting', self.ENABLED_PHASES
241 | ):
242 | self.logger.info(f"🗂️ Phase 5/6: Applying controlled forgetting...")
243 | performance_start = time.time()
244 | access_patterns = await self._get_access_patterns()
245 | forgetting_results = await self.forgetting_engine.process(
246 | memories, relevance_scores,
247 | access_patterns=access_patterns,
248 | time_horizon=time_horizon
249 | )
250 | report.memories_archived = len([r for r in forgetting_results if r.action_taken in ['archived', 'deleted']])
251 | self.logger.info(f"✓ Forgetting completed in {time.time() - performance_start:.1f}s, processed {len(forgetting_results)} candidates")
252 |
253 | # Apply forgetting results to storage
254 | await self._apply_forgetting_results(forgetting_results)
255 |
256 | # 7. Update consolidation statistics
257 | self._update_consolidation_stats(report)
258 |
259 | # 8. Track consolidation timestamp for incremental mode
260 | if self.config.incremental_mode:
261 | await self._update_consolidation_timestamps(memories)
262 |
263 | # 9. Finalize report
264 | return self._finalize_report(report, [])
265 |
266 | except ConsolidationError as e:
267 | # Re-raise configuration and validation errors
268 | self.logger.error(f"Configuration error during {time_horizon} consolidation: {e}")
269 | self.health_monitor.record_error('consolidator', e, {'time_horizon': time_horizon})
270 | raise
271 | except Exception as e:
272 | self.logger.error(f"Error during {time_horizon} consolidation: {e}")
273 | self.health_monitor.record_error('consolidator', e, {'time_horizon': time_horizon})
274 | report.errors.append(str(e))
275 | return self._finalize_report(report, [str(e)])
276 |
277 | async def _get_memories_for_horizon(self, time_horizon: str, **kwargs) -> List[Memory]:
278 | """Get memories appropriate for the given time horizon.
279 |
280 | With incremental mode enabled, returns oldest-first batch of memories
281 | that haven't been recently consolidated.
282 | """
283 | now = datetime.now(timezone.utc)
284 |
285 | # Validate time horizon
286 | if time_horizon not in HORIZON_CONFIGS:
287 | raise ConsolidationError(f"Unknown time horizon: {time_horizon}")
288 |
289 | config = HORIZON_CONFIGS[time_horizon]
290 |
291 | # For daily processing, get recent memories (no change - already efficient)
292 | if time_horizon == 'daily':
293 | cutoff_days = config.get('cutoff_days', 2)
294 | start_time = (now - timedelta(days=cutoff_days)).timestamp()
295 | end_time = now.timestamp()
296 | memories = await self.storage.get_memories_by_time_range(start_time, end_time)
297 | else:
298 | # For longer horizons: incremental oldest-first processing
299 | memories = await self.storage.get_all_memories()
300 |
301 | # Filter by relevance to time horizon (quarterly/yearly still focus on old memories)
302 | cutoff_days = config.get('cutoff_days')
303 | if cutoff_days is not None:
304 | cutoff_date = now - timedelta(days=cutoff_days)
305 | memories = filter_memories_by_age(memories, cutoff_date)
306 |
307 | # Incremental mode: Sort oldest-first and batch
308 | if self.config.incremental_mode:
309 | # Sort by last_consolidated_at (oldest first), fallback to created_at
310 | def get_consolidation_sort_key(memory: Memory) -> float:
311 | # Check metadata for last consolidation timestamp
312 | if memory.metadata and 'last_consolidated_at' in memory.metadata:
313 | return float(memory.metadata['last_consolidated_at'])
314 | # Fall back to created_at (treat never-consolidated as oldest)
315 | return memory.created_at if memory.created_at else 0.0
316 |
317 | memories.sort(key=get_consolidation_sort_key)
318 |
319 | # Limit to batch size
320 | batch_size = self.config.batch_size
321 | if len(memories) > batch_size:
322 | self.logger.info(f"Incremental mode: Processing {batch_size} oldest memories (out of {len(memories)} total)")
323 | memories = memories[:batch_size]
324 |
325 | return memories
326 |
327 | async def _update_relevance_scores(self, memories: List[Memory], time_horizon: str) -> List:
328 | """Calculate and update relevance scores for memories."""
329 | # Get connection and access data
330 | connections = await self._get_memory_connections()
331 | access_patterns = await self._get_access_patterns()
332 |
333 | # Calculate relevance scores
334 | relevance_scores = await self.decay_calculator.process(
335 | memories,
336 | connections=connections,
337 | access_patterns=access_patterns,
338 | reference_time=datetime.now()
339 | )
340 |
341 | # Update memory metadata with relevance scores
342 | for memory in memories:
343 | score = next((s for s in relevance_scores if s.memory_hash == memory.content_hash), None)
344 | if score:
345 | updated_memory = await self.decay_calculator.update_memory_relevance_metadata(memory, score)
346 | await self.storage.update_memory(updated_memory)
347 |
348 | return relevance_scores
349 |
350 | async def _get_memory_connections(self) -> Dict[str, int]:
351 | """Get memory connection counts from storage."""
352 | try:
353 | return await self.storage.get_memory_connections()
354 | except AttributeError:
355 | # Fallback if storage doesn't implement connection tracking
356 | self.logger.warning("Storage backend doesn't support connection tracking")
357 | return {}
358 |
359 | async def _get_access_patterns(self) -> Dict[str, datetime]:
360 | """Get memory access patterns from storage."""
361 | try:
362 | return await self.storage.get_access_patterns()
363 | except AttributeError:
364 | # Fallback if storage doesn't implement access tracking
365 | self.logger.warning("Storage backend doesn't support access pattern tracking")
366 | return {}
367 |
368 | async def _get_existing_associations(self) -> set:
369 | """Get existing memory associations to avoid duplicates."""
370 | try:
371 | # Look for existing association memories
372 | all_memories = await self.storage.get_all_memories()
373 | associations = set()
374 |
375 | for memory in all_memories:
376 | if memory.memory_type == 'association' and 'source_memory_hashes' in memory.metadata:
377 | source_hashes = memory.metadata['source_memory_hashes']
378 | if isinstance(source_hashes, list) and len(source_hashes) >= 2:
379 | # Create canonical pair representation
380 | pair_key = tuple(sorted(source_hashes[:2]))
381 | associations.add(pair_key)
382 |
383 | return associations
384 |
385 | except Exception as e:
386 | self.logger.warning(f"Error getting existing associations: {e}")
387 | return set()
388 |
389 | async def _store_associations_as_memories(self, associations) -> None:
390 | """Store discovered associations as first-class memories."""
391 | for association in associations:
392 | # Create memory content from association
393 | source_hashes = association.source_memory_hashes
394 | similarity = association.similarity_score
395 | connection_type = association.connection_type
396 |
397 | content = f"Association between memories {source_hashes[0][:8]} and {source_hashes[1][:8]}: {connection_type} (similarity: {similarity:.3f})"
398 |
399 | # Create association memory
400 | association_memory = Memory(
401 | content=content,
402 | content_hash=f"assoc_{source_hashes[0][:8]}_{source_hashes[1][:8]}",
403 | tags=['association', 'discovered'] + connection_type.split(', '),
404 | memory_type='association',
405 | metadata={
406 | 'source_memory_hashes': source_hashes,
407 | 'similarity_score': similarity,
408 | 'connection_type': connection_type,
409 | 'discovery_method': association.discovery_method,
410 | 'discovery_date': association.discovery_date.isoformat(),
411 | **association.metadata
412 | },
413 | created_at=datetime.now().timestamp(),
414 | created_at_iso=datetime.now().isoformat() + 'Z'
415 | )
416 |
417 | # Store the association memory
418 | success, _ = await self.storage.store(association_memory)
419 | if not success:
420 |                 self.logger.warning(f"Failed to store association memory for {source_hashes[0][:8]} <-> {source_hashes[1][:8]}")
421 |
422 | async def _handle_compression_results(self, compression_results) -> None:
423 | """Handle storage of compressed memories and linking to originals."""
424 | for result in compression_results:
425 | # Store compressed memory
426 | success, _ = await self.storage.store(result.compressed_memory)
427 | if not success:
428 |                 self.logger.warning("Failed to store compressed memory")
429 |
430 | # Update original memories with compression links
431 | # This could involve adding metadata pointing to the compressed version
432 | # Implementation depends on how the storage backend handles relationships
433 | pass
434 |
435 | async def _apply_forgetting_results(self, forgetting_results) -> None:
436 | """Apply forgetting results to the storage backend."""
437 | for result in forgetting_results:
438 | if result.action_taken == 'deleted':
439 | await self.storage.delete_memory(result.memory_hash)
440 | elif result.action_taken == 'compressed' and result.compressed_version:
441 | # Replace original with compressed version
442 | await self.storage.delete_memory(result.memory_hash)
443 | success, _ = await self.storage.store(result.compressed_version)
444 | if not success:
445 |                     self.logger.warning(f"Failed to store compressed version for {result.memory_hash}")
446 | # 'archived' memories are handled by the forgetting engine
447 |
448 | async def _update_consolidation_timestamps(self, memories: List[Memory]) -> None:
449 | """Mark memories with last_consolidated_at timestamp for incremental mode using batch updates."""
450 | consolidation_time = datetime.now().timestamp()
451 |
452 | self.logger.info(f"Marking {len(memories)} memories with consolidation timestamp (batch mode)")
453 |
454 | # Update all memories in-place
455 | for memory in memories:
456 | if memory.metadata is None:
457 | memory.metadata = {}
458 | memory.metadata['last_consolidated_at'] = consolidation_time
459 |
460 | # Use batch update for optimal performance
461 | try:
462 | results = await self.storage.update_memories_batch(memories)
463 | success_count = sum(results)
464 | self.logger.info(f"Consolidation timestamps updated: {success_count}/{len(memories)} memories")
465 |
466 | if success_count < len(memories):
467 | failed_count = len(memories) - success_count
468 | self.logger.warning(f"{failed_count} memories failed to update during timestamp marking")
469 |
470 | except Exception as e:
471 | self.logger.error(f"Batch timestamp update failed: {e}")
472 | # Fallback to individual updates if batch fails
473 | self.logger.info("Falling back to individual timestamp updates")
474 | success_count = 0
475 | for memory in memories:
476 | try:
477 | success = await self.storage.update_memory(memory)
478 | if success:
479 | success_count += 1
480 | except Exception as mem_error:
481 | self.logger.warning(f"Failed to update consolidation timestamp for {memory.content_hash}: {mem_error}")
482 |
483 | self.logger.info(f"Fallback completed: {success_count}/{len(memories)} memories updated")
484 |
485 | def _update_consolidation_stats(self, report: ConsolidationReport) -> None:
486 | """Update internal consolidation statistics."""
487 | self.consolidation_stats['total_runs'] += 1
488 | if not report.errors:
489 | self.consolidation_stats['successful_runs'] += 1
490 |
491 | self.consolidation_stats['total_memories_processed'] += report.memories_processed
492 | self.consolidation_stats['total_associations_created'] += report.associations_discovered
493 | self.consolidation_stats['total_clusters_created'] += report.clusters_created
494 | self.consolidation_stats['total_memories_compressed'] += report.memories_compressed
495 | self.consolidation_stats['total_memories_archived'] += report.memories_archived
496 |
497 | # Update last consolidation time
498 | self.last_consolidation_times[report.time_horizon] = report.start_time
499 |
500 | def _finalize_report(self, report: ConsolidationReport, errors: List[str]) -> ConsolidationReport:
501 | """Finalize the consolidation report."""
502 | report.end_time = datetime.now()
503 | report.errors.extend(errors)
504 |
505 | # Add performance metrics
506 | duration = (report.end_time - report.start_time).total_seconds()
507 | success = len(errors) == 0
508 | report.performance_metrics = {
509 | 'duration_seconds': duration,
510 | 'memories_per_second': report.memories_processed / duration if duration > 0 else 0,
511 | 'success': success
512 | }
513 |
514 | # Record performance in health monitor
515 | self.health_monitor.record_consolidation_performance(
516 | time_horizon=report.time_horizon,
517 | duration=duration,
518 | memories_processed=report.memories_processed,
519 | success=success,
520 | errors=errors
521 | )
522 |
523 | # Log summary
524 | if errors:
525 | self.logger.error(f"Consolidation {report.time_horizon} completed with errors: {errors}")
526 | else:
527 | self.logger.info(
528 | f"Consolidation {report.time_horizon} completed successfully: "
529 | f"{report.memories_processed} memories, {report.associations_discovered} associations, "
530 | f"{report.clusters_created} clusters, {report.memories_compressed} compressed, "
531 | f"{report.memories_archived} archived in {duration:.2f}s"
532 | )
533 |
534 | return report
535 |
536 | async def health_check(self) -> Dict[str, Any]:
537 | """Perform health check on the consolidation system."""
538 | return await self.health_monitor.check_overall_health()
539 |
540 | async def get_health_summary(self) -> Dict[str, Any]:
541 | """Get a summary of consolidation system health."""
542 | return await self.health_monitor.get_health_summary()
543 |
544 | def get_error_history(self, limit: int = 50) -> List[Dict[str, Any]]:
545 | """Get recent error history."""
546 | return self.health_monitor.error_history[-limit:]
547 |
548 | def get_performance_history(self, limit: int = 100) -> List[Dict[str, Any]]:
549 | """Get recent performance history."""
550 | return self.health_monitor.performance_history[-limit:]
551 |
552 | def resolve_health_alert(self, alert_id: str):
553 | """Resolve a health alert."""
554 | self.health_monitor.resolve_alert(alert_id)
555 |
556 | async def get_consolidation_recommendations(self, time_horizon: str) -> Dict[str, Any]:
557 | """Get recommendations for consolidation based on current memory state."""
558 | try:
559 | memories = await self._get_memories_for_horizon(time_horizon)
560 |
561 | if not memories:
562 | return {
563 | 'recommendation': 'no_action',
564 | 'reason': 'No memories to process',
565 | 'memory_count': 0
566 | }
567 |
568 | # Analyze memory distribution
569 | memory_types = {}
570 | total_size = 0
571 | old_memories = 0
572 | now = datetime.now()
573 |
574 | for memory in memories:
575 | memory_type = memory.memory_type or 'standard'
576 | memory_types[memory_type] = memory_types.get(memory_type, 0) + 1
577 | total_size += len(memory.content)
578 |
579 | if memory.created_at:
580 | age_days = (now - datetime.utcfromtimestamp(memory.created_at)).days
581 | if age_days > 30:
582 | old_memories += 1
583 |
584 | # Generate recommendations
585 | recommendations = []
586 |
587 | if len(memories) > 1000:
588 | recommendations.append("Consider running compression to reduce memory usage")
589 |
590 | if old_memories > len(memories) * 0.5:
591 | recommendations.append("Many old memories present - consider forgetting/archival")
592 |
593 | if len(memories) > 100 and time_horizon in ['weekly', 'monthly']:
594 | recommendations.append("Good candidate for association discovery")
595 |
596 | if not recommendations:
597 | recommendations.append("Memory state looks healthy")
598 |
599 | return {
600 | 'recommendation': 'consolidation_beneficial' if len(recommendations) > 1 else 'optional',
601 | 'reasons': recommendations,
602 | 'memory_count': len(memories),
603 | 'memory_types': memory_types,
604 | 'total_size_bytes': total_size,
605 | 'old_memory_percentage': (old_memories / len(memories)) * 100,
606 | 'estimated_duration_seconds': len(memories) * 0.01 # Rough estimate
607 | }
608 |
609 | except Exception as e:
610 | return {
611 | 'recommendation': 'error',
612 | 'error': str(e),
613 | 'memory_count': 0
614 | }
```
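For orientation, here is a minimal usage sketch of the consolidator above. It is not part of the repository: it assumes the package is installed, that `ConsolidationConfig()` can be constructed with defaults, and that the SQLite-vec backend (initialized the same way as in the integration tests below) satisfies enough of `StorageProtocol` for a run; the consolidator already degrades gracefully when connection or access-pattern tracking is unavailable.

```python
# Illustrative sketch only -- not a file from this repository.
import asyncio

from mcp_memory_service.consolidation.base import ConsolidationConfig
from mcp_memory_service.consolidation.consolidator import DreamInspiredConsolidator
from mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage


async def main() -> None:
    # Any backend satisfying StorageProtocol works; sqlite_vec is used here as an example.
    storage = SqliteVecMemoryStorage("memories.db")
    await storage.initialize()

    config = ConsolidationConfig()  # assumption: default settings are acceptable
    consolidator = DreamInspiredConsolidator(storage, config)

    # Optionally inspect recommendations before committing to a full run.
    advice = await consolidator.get_consolidation_recommendations("weekly")
    print(advice["recommendation"], advice["memory_count"])

    # Run the weekly pipeline: relevance scoring, clustering, associations, compression.
    report = await consolidator.consolidate("weekly")
    print(report.performance_metrics)


asyncio.run(main())
```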
--------------------------------------------------------------------------------
/tests/integration/test_api_with_memory_service.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Integration tests for API endpoints using MemoryService.
3 |
4 | These tests verify that the API layer correctly integrates with
5 | MemoryService for all memory operations and maintains consistent behavior.
6 | """
7 |
8 | import pytest
9 | import pytest_asyncio
10 | import tempfile
11 | import os
12 | from unittest.mock import AsyncMock, MagicMock, patch
13 | from fastapi.testclient import TestClient
14 |
15 | from mcp_memory_service.web.dependencies import set_storage, get_memory_service
16 | from mcp_memory_service.services.memory_service import MemoryService
17 | from mcp_memory_service.models.memory import Memory
18 | from mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
19 |
20 |
21 | # Test Fixtures
22 |
23 | @pytest.fixture
24 | def temp_db():
25 | """Create a temporary database for testing."""
26 | with tempfile.TemporaryDirectory() as tmpdir:
27 | db_path = os.path.join(tmpdir, "test.db")
28 | yield db_path
29 |
30 |
31 | @pytest_asyncio.fixture
32 | async def initialized_storage(temp_db):
33 | """Create and initialize a real SQLite storage backend."""
34 | storage = SqliteVecMemoryStorage(temp_db)
35 | await storage.initialize()
36 | yield storage
37 | storage.close()
38 |
39 |
40 | @pytest.fixture
41 | def test_app(initialized_storage):
42 | """Create a FastAPI test application with initialized storage."""
43 | # Import here to avoid circular dependencies
44 | from mcp_memory_service.web.server import app
45 |
46 | # Set storage for the app
47 | set_storage(initialized_storage)
48 |
49 | client = TestClient(app)
50 | yield client
51 |
52 |
53 | @pytest.fixture
54 | def mock_storage():
55 | """Create a mock storage for isolated testing."""
56 | storage = AsyncMock()
57 | return storage
58 |
59 |
60 | @pytest.fixture
61 | def mock_memory_service(mock_storage):
62 | """Create a MemoryService with mock storage."""
63 | return MemoryService(storage=mock_storage)
64 |
65 |
66 | @pytest.fixture
67 | def sample_memory():
68 | """Create a sample memory for testing."""
69 | return Memory(
70 | content="Integration test memory",
71 | content_hash="test_hash_123",
72 | tags=["integration", "test"],
73 | memory_type="note",
74 | metadata={"source": "test"},
75 | created_at=1698765432.0,
76 | updated_at=1698765432.0
77 | )
78 |
79 |
80 | # Test API Store Memory Endpoint
81 |
82 | @pytest.mark.asyncio
83 | async def test_api_store_memory_uses_service(mock_storage):
84 | """Test that POST /api/memories uses MemoryService."""
85 | mock_storage.store.return_value = None
86 |
87 | # Create service
88 | service = MemoryService(storage=mock_storage)
89 |
90 | # Simulate API call through service
91 | result = await service.store_memory(
92 | content="Test API storage",
93 | tags=["api", "test"],
94 | memory_type="note"
95 | )
96 |
97 | assert result["success"] is True
98 | assert "memory" in result
99 | mock_storage.store.assert_called_once()
100 |
101 |
102 | @pytest.mark.asyncio
103 | async def test_api_store_memory_hostname_from_header(mock_storage):
104 | """Test that X-Client-Hostname header is processed correctly."""
105 | mock_storage.store.return_value = None
106 |
107 | service = MemoryService(storage=mock_storage)
108 |
109 | # Simulate API call with hostname
110 | result = await service.store_memory(
111 | content="Test with hostname",
112 | tags=["test"],
113 | client_hostname="client-machine"
114 | )
115 |
116 | # Verify hostname tag was added
117 | stored_memory = mock_storage.store.call_args.args[0]
118 | assert "source:client-machine" in stored_memory.tags
119 | assert stored_memory.metadata["hostname"] == "client-machine"
120 |
121 |
122 | @pytest.mark.asyncio
123 | async def test_api_store_memory_hostname_from_request_body(mock_storage):
124 | """Test that client_hostname in request body works."""
125 | mock_storage.store.return_value = None
126 |
127 | service = MemoryService(storage=mock_storage)
128 |
129 | # Simulate API call with hostname in body
130 | result = await service.store_memory(
131 | content="Test",
132 | client_hostname="body-hostname"
133 | )
134 |
135 | stored_memory = mock_storage.store.call_args.args[0]
136 | assert "source:body-hostname" in stored_memory.tags
137 |
138 |
139 | # Test API List Memories Endpoint
140 |
141 | @pytest.mark.asyncio
142 | async def test_api_list_memories_uses_database_filtering(mock_storage):
143 | """Test that GET /api/memories uses database-level filtering."""
144 | # Setup mock to return limited results
145 | mock_storage.get_all_memories.return_value = []
146 | mock_storage.count_all_memories.return_value = 1000
147 |
148 | service = MemoryService(storage=mock_storage)
149 |
150 | # Request page 1 with 10 items from 1000 total
151 | result = await service.list_memories(page=1, page_size=10)
152 |
153 | # CRITICAL: Verify only 10 items requested, not all 1000
154 | # This proves database-level filtering, not O(n) loading
155 | call_kwargs = mock_storage.get_all_memories.call_args.kwargs
156 | assert call_kwargs["limit"] == 10
157 | assert call_kwargs["offset"] == 0
158 | assert result["total"] == 1000
159 | assert result["has_more"] is True
160 |
161 |
162 | @pytest.mark.asyncio
163 | async def test_api_list_memories_pagination_through_service(mock_storage):
164 | """Test end-to-end pagination workflow."""
165 | # Create mock memories
166 | memories = [
167 | Memory(
168 | content=f"Memory {i}",
169 | content_hash=f"hash_{i}",
170 | tags=["test"],
171 | memory_type="note",
172 | metadata={},
173 | created_at=1698765432.0 + i,
174 | updated_at=1698765432.0 + i
175 | )
176 | for i in range(25)
177 | ]
178 |
179 | # Page 1: First 10 memories
180 | mock_storage.get_all_memories.return_value = memories[:10]
181 | mock_storage.count_all_memories.return_value = 25
182 |
183 | service = MemoryService(storage=mock_storage)
184 | page1 = await service.list_memories(page=1, page_size=10)
185 |
186 | assert page1["page"] == 1
187 | assert page1["page_size"] == 10
188 | assert page1["total"] == 25
189 | assert page1["has_more"] is True
190 | assert len(page1["memories"]) == 10
191 |
192 | # Page 2: Next 10 memories
193 | mock_storage.get_all_memories.return_value = memories[10:20]
194 | page2 = await service.list_memories(page=2, page_size=10)
195 |
196 | assert page2["page"] == 2
197 | assert page2["has_more"] is True
198 |
199 | # Page 3: Last 5 memories
200 | mock_storage.get_all_memories.return_value = memories[20:25]
201 | page3 = await service.list_memories(page=3, page_size=10)
202 |
203 | assert page3["page"] == 3
204 | assert page3["has_more"] is False
205 | assert len(page3["memories"]) == 5
206 |
207 |
208 | @pytest.mark.asyncio
209 | async def test_api_list_memories_tag_filter(mock_storage):
210 | """Test filtering by tag through API."""
211 | mock_storage.get_all_memories.return_value = []
212 | mock_storage.count_all_memories.return_value = 0
213 |
214 | service = MemoryService(storage=mock_storage)
215 |
216 | result = await service.list_memories(page=1, page_size=10, tag="important")
217 |
218 | # Verify tag passed to storage as list
219 | call_kwargs = mock_storage.get_all_memories.call_args.kwargs
220 | assert call_kwargs["tags"] == ["important"]
221 |
222 |
223 | @pytest.mark.asyncio
224 | async def test_api_list_memories_type_filter(mock_storage):
225 | """Test filtering by memory type through API."""
226 | mock_storage.get_all_memories.return_value = []
227 | mock_storage.count_all_memories.return_value = 0
228 |
229 | service = MemoryService(storage=mock_storage)
230 |
231 | result = await service.list_memories(page=1, page_size=10, memory_type="reference")
232 |
233 | call_kwargs = mock_storage.get_all_memories.call_args.kwargs
234 | assert call_kwargs["memory_type"] == "reference"
235 |
236 |
237 | @pytest.mark.asyncio
238 | async def test_api_list_memories_combined_filters(mock_storage):
239 | """Test combining tag and type filters."""
240 | mock_storage.get_all_memories.return_value = []
241 | mock_storage.count_all_memories.return_value = 0
242 |
243 | service = MemoryService(storage=mock_storage)
244 |
245 | result = await service.list_memories(
246 | page=1,
247 | page_size=10,
248 | tag="work",
249 | memory_type="task"
250 | )
251 |
252 | call_kwargs = mock_storage.get_all_memories.call_args.kwargs
253 | assert call_kwargs["tags"] == ["work"]
254 | assert call_kwargs["memory_type"] == "task"
255 |
256 |
257 | # Test API Search Endpoints
258 |
259 | @pytest.mark.asyncio
260 | async def test_api_semantic_search_uses_service(mock_storage, sample_memory):
261 | """Test POST /api/search uses MemoryService."""
262 | mock_storage.retrieve.return_value = [sample_memory]
263 |
264 | service = MemoryService(storage=mock_storage)
265 |
266 | result = await service.retrieve_memories(query="test query", n_results=5)
267 |
268 | assert result["query"] == "test query"
269 | assert result["count"] == 1
270 | mock_storage.retrieve.assert_called_once()
271 |
272 |
273 | @pytest.mark.asyncio
274 | async def test_api_tag_search_uses_service(mock_storage, sample_memory):
275 | """Test POST /api/search/by-tag uses MemoryService."""
276 | mock_storage.search_by_tags.return_value = [sample_memory]
277 |
278 | service = MemoryService(storage=mock_storage)
279 |
280 | result = await service.search_by_tag(tags=["test"], match_all=False)
281 |
282 | assert result["tags"] == ["test"]
283 | assert result["match_type"] == "ANY"
284 | assert result["count"] == 1
285 |
286 |
287 | @pytest.mark.asyncio
288 | async def test_api_time_search_uses_service(mock_storage, sample_memory):
289 | """Test POST /api/search/by-time flow (if applicable)."""
290 | # Note: Time search might use retrieve_memories with time filters
291 | mock_storage.retrieve.return_value = [sample_memory]
292 |
293 | service = MemoryService(storage=mock_storage)
294 |
295 | # Simulate time-based search
296 | result = await service.retrieve_memories(query="last week", n_results=10)
297 |
298 | assert "memories" in result
299 |
300 |
301 | # Test API Delete Endpoint
302 |
303 | @pytest.mark.asyncio
304 | async def test_api_delete_memory_uses_service(mock_storage):
305 | """Test DELETE /api/memories/{hash} uses MemoryService."""
306 | mock_storage.delete_memory.return_value = True
307 |
308 | service = MemoryService(storage=mock_storage)
309 |
310 | result = await service.delete_memory("test_hash_123")
311 |
312 | assert result["success"] is True
313 | assert result["content_hash"] == "test_hash_123"
314 | mock_storage.delete_memory.assert_called_once_with("test_hash_123")
315 |
316 |
317 | @pytest.mark.asyncio
318 | async def test_api_delete_memory_not_found(mock_storage):
319 | """Test deleting non-existent memory returns proper response."""
320 | mock_storage.delete_memory.return_value = False
321 |
322 | service = MemoryService(storage=mock_storage)
323 |
324 | result = await service.delete_memory("nonexistent")
325 |
326 | assert result["success"] is False
327 |
328 |
329 | # Test API Get Memory Endpoint
330 |
331 | @pytest.mark.asyncio
332 | async def test_api_get_memory_by_hash_uses_service(mock_storage, sample_memory):
333 | """Test GET /api/memories/{hash} uses MemoryService."""
334 | mock_storage.get_by_hash.return_value = sample_memory
335 |
336 | service = MemoryService(storage=mock_storage)
337 |
338 | result = await service.get_memory_by_hash("test_hash_123")
339 |
340 | assert result["found"] is True
341 | assert result["memory"]["content_hash"] == "test_hash_123"
342 | mock_storage.get_by_hash.assert_called_once_with("test_hash_123")
343 |
344 |
345 | # Test Dependency Injection
346 |
347 | def test_get_memory_service_dependency_injection():
348 | """Test that get_memory_service creates service with correct storage."""
349 | from mcp_memory_service.web.dependencies import get_memory_service
350 |
351 | # Create mock storage
352 | mock_storage = MagicMock()
353 |
354 | # Override dependency
355 | def override_get_storage():
356 | return mock_storage
357 |
358 | # Get service
359 | service = get_memory_service(storage=mock_storage)
360 |
361 | assert isinstance(service, MemoryService)
362 | assert service.storage == mock_storage
363 |
364 |
365 | # Performance and Scaling Tests
366 |
367 | @pytest.mark.asyncio
368 | async def test_list_memories_performance_with_large_dataset(mock_storage):
369 | """
370 | Test that list_memories remains efficient with large datasets.
371 |
372 |     This verifies the fix for the O(n) memory-loading anti-pattern.
373 | """
374 | # Simulate 10,000 memories in database
375 | mock_storage.get_all_memories.return_value = []
376 | mock_storage.count_all_memories.return_value = 10000
377 |
378 | service = MemoryService(storage=mock_storage)
379 |
380 | # Request just 20 items
381 | result = await service.list_memories(page=1, page_size=20)
382 |
383 | # CRITICAL: Verify we only queried for 20 items, not all 10,000
384 | call_kwargs = mock_storage.get_all_memories.call_args.kwargs
385 | assert call_kwargs["limit"] == 20
386 | assert call_kwargs["offset"] == 0
387 |
388 | # This proves database-level filtering prevents loading 10,000 records
389 | assert result["total"] == 10000
390 | assert result["has_more"] is True
391 |
392 |
393 | @pytest.mark.asyncio
394 | async def test_tag_filter_performance(mock_storage):
395 | """Test that tag filtering happens at database level."""
396 | mock_storage.get_all_memories.return_value = []
397 | mock_storage.count_all_memories.return_value = 50
398 |
399 | service = MemoryService(storage=mock_storage)
400 |
401 | result = await service.list_memories(page=1, page_size=10, tag="important")
402 |
403 | # Verify tag filter passed to database query
404 | call_kwargs = mock_storage.get_all_memories.call_args.kwargs
405 | assert call_kwargs["tags"] == ["important"]
406 |
407 | # Result should only reflect filtered count
408 | assert result["total"] == 50 # Only memories matching tag
409 |
410 |
411 | # Error Handling Tests
412 |
413 | @pytest.mark.asyncio
414 | async def test_api_handles_storage_errors_gracefully(mock_storage):
415 | """Test that API returns proper errors when storage fails."""
416 | mock_storage.get_all_memories.side_effect = Exception("Database connection lost")
417 |
418 | service = MemoryService(storage=mock_storage)
419 |
420 | result = await service.list_memories(page=1, page_size=10)
421 |
422 | assert result["success"] is False
423 | assert "error" in result
424 | assert "Database connection lost" in result["error"]
425 |
426 |
427 | @pytest.mark.asyncio
428 | async def test_api_validates_input_through_service(mock_storage):
429 | """Test that validation errors from storage are handled."""
430 | mock_storage.store.side_effect = ValueError("Invalid content format")
431 |
432 | service = MemoryService(storage=mock_storage)
433 |
434 | result = await service.store_memory(content="invalid")
435 |
436 | assert result["success"] is False
437 | assert "Invalid memory data" in result["error"]
438 |
439 |
440 | # Consistency Tests
441 |
442 | @pytest.mark.asyncio
443 | async def test_api_and_mcp_use_same_service_logic(mock_storage):
444 | """
445 | Test that API and MCP tools use the same MemoryService logic.
446 |
447 | This verifies the DRY principle - both interfaces share the same
448 | business logic through MemoryService.
449 | """
450 | service = MemoryService(storage=mock_storage)
451 |
452 | # Store through service (used by both API and MCP)
453 | mock_storage.store.return_value = None
454 | result1 = await service.store_memory(content="Test", tags=["shared"])
455 |
456 | # Retrieve through service (used by both API and MCP)
457 | mock_storage.retrieve.return_value = []
458 | result2 = await service.retrieve_memories(query="test")
459 |
460 | # Both operations used the same service
461 | assert result1["success"] is True
462 | assert "memories" in result2
463 |
464 |
465 | @pytest.mark.asyncio
466 | async def test_response_format_consistency(mock_storage, sample_memory):
467 | """Test that all service methods return consistently formatted responses."""
468 | mock_storage.get_all_memories.return_value = [sample_memory]
469 | mock_storage.count_all_memories.return_value = 1
470 | mock_storage.retrieve.return_value = [sample_memory]
471 | mock_storage.search_by_tags.return_value = [sample_memory]
472 |
473 | service = MemoryService(storage=mock_storage)
474 |
475 | # Get responses from different methods
476 | list_result = await service.list_memories(page=1, page_size=10)
477 | retrieve_result = await service.retrieve_memories(query="test")
478 | tag_result = await service.search_by_tag(tags="test")
479 |
480 | # All should have consistently formatted memories
481 | list_memory = list_result["memories"][0]
482 | retrieve_memory = retrieve_result["memories"][0]
483 | tag_memory = tag_result["memories"][0]
484 |
485 | # Verify all have same format
486 | required_fields = ["content", "content_hash", "tags", "memory_type", "created_at"]
487 | for field in required_fields:
488 | assert field in list_memory
489 | assert field in retrieve_memory
490 | assert field in tag_memory
491 |
492 |
493 | # Real Storage Integration Test (End-to-End)
494 |
495 | @pytest.mark.asyncio
496 | @pytest.mark.integration
497 | async def test_end_to_end_workflow_with_real_storage(temp_db):
498 | """
499 | End-to-end test with real SQLite storage (not mocked).
500 |
501 | This verifies the complete integration stack works correctly.
502 | """
503 | # Create real storage
504 | storage = SqliteVecMemoryStorage(temp_db)
505 | await storage.initialize()
506 |
507 | try:
508 | # Create service with real storage
509 | service = MemoryService(storage=storage)
510 |
511 | # Store a memory
512 | store_result = await service.store_memory(
513 | content="End-to-end test memory",
514 | tags=["e2e", "integration"],
515 | memory_type="test"
516 | )
517 | assert store_result["success"] is True
518 |
519 | # List memories
520 | list_result = await service.list_memories(page=1, page_size=10)
521 | assert len(list_result["memories"]) > 0
522 |
523 | # Search by tag
524 | tag_result = await service.search_by_tag(tags="e2e")
525 | assert len(tag_result["memories"]) > 0
526 |
527 | # Get specific memory
528 | content_hash = store_result["memory"]["content_hash"]
529 | get_result = await service.get_memory_by_hash(content_hash)
530 | assert get_result["found"] is True
531 |
532 | # Delete memory
533 | delete_result = await service.delete_memory(content_hash)
534 | assert delete_result["success"] is True
535 |
536 | # Verify deleted
537 | get_after_delete = await service.get_memory_by_hash(content_hash)
538 | assert get_after_delete["found"] is False
539 |
540 | finally:
541 | storage.close()
542 |
543 |
544 | # Real HTTP API Integration Tests with TestClient
545 |
546 | @pytest.mark.asyncio
547 | @pytest.mark.integration
548 | async def test_http_api_store_memory_endpoint(temp_db):
549 | """
550 | Test POST /api/memories endpoint with real HTTP request.
551 |
552 | Uses TestClient to make actual HTTP request to FastAPI app.
553 | """
554 | # Create real storage
555 | storage = SqliteVecMemoryStorage(temp_db)
556 | await storage.initialize()
557 |
558 | try:
559 | # Import app and set storage
560 | from mcp_memory_service.web.app import app
561 | set_storage(storage)
562 |
563 | # Create TestClient
564 | client = TestClient(app)
565 |
566 | # Make HTTP POST request
567 | response = client.post(
568 | "/api/memories",
569 | json={
570 | "content": "HTTP API test memory",
571 | "tags": ["http", "api", "test"],
572 | "memory_type": "note"
573 | }
574 | )
575 |
576 | # Verify response
577 | assert response.status_code == 200
578 | data = response.json()
579 | assert data["success"] is True
580 | assert "memory" in data
581 | assert data["memory"]["content"] == "HTTP API test memory"
582 | assert "http" in data["memory"]["tags"]
583 |
584 | finally:
585 | storage.close()
586 |
587 |
588 | @pytest.mark.asyncio
589 | @pytest.mark.integration
590 | async def test_http_api_list_memories_endpoint(temp_db):
591 | """
592 | Test GET /api/memories endpoint with real HTTP request.
593 |
594 | Verifies pagination and filtering work through HTTP API.
595 | """
596 | storage = SqliteVecMemoryStorage(temp_db)
597 | await storage.initialize()
598 |
599 | try:
600 | from mcp_memory_service.web.app import app
601 | set_storage(storage)
602 |
603 | # Store test memories first
604 | service = MemoryService(storage=storage)
605 | for i in range(5):
606 | await service.store_memory(
607 | content=f"Test memory {i}",
608 | tags=["test"],
609 | memory_type="note"
610 | )
611 |
612 | # Make HTTP GET request
613 | client = TestClient(app)
614 | response = client.get("/api/memories?page=1&page_size=10")
615 |
616 | # Verify response
617 | assert response.status_code == 200
618 | data = response.json()
619 | assert "memories" in data
620 | assert len(data["memories"]) == 5
621 | assert data["total"] == 5
622 | assert data["page"] == 1
623 |
624 | finally:
625 | storage.close()
626 |
627 |
628 | @pytest.mark.asyncio
629 | @pytest.mark.integration
630 | async def test_http_api_search_endpoint(temp_db):
631 | """
632 | Test POST /api/search endpoint with real HTTP request.
633 |
634 | Verifies semantic search works through HTTP API.
635 | """
636 | storage = SqliteVecMemoryStorage(temp_db)
637 | await storage.initialize()
638 |
639 | try:
640 | from mcp_memory_service.web.app import app
641 | set_storage(storage)
642 |
643 | # Store searchable memory
644 | service = MemoryService(storage=storage)
645 | await service.store_memory(
646 | content="Python programming language tutorial",
647 | tags=["python", "tutorial"],
648 | memory_type="reference"
649 | )
650 |
651 | # Make HTTP POST request for search
652 | client = TestClient(app)
653 | response = client.post(
654 | "/api/search",
655 | json={"query": "python tutorial", "limit": 5}
656 | )
657 |
658 | # Verify response
659 | assert response.status_code == 200
660 | data = response.json()
661 | assert "memories" in data
662 | assert data["query"] == "python tutorial"
663 |
664 | finally:
665 | storage.close()
666 |
667 |
668 | @pytest.mark.asyncio
669 | @pytest.mark.integration
670 | async def test_http_api_search_by_tag_endpoint(temp_db):
671 | """
672 | Test POST /api/search/by-tag endpoint with real HTTP request.
673 |
674 | Verifies tag search works through HTTP API.
675 | """
676 | storage = SqliteVecMemoryStorage(temp_db)
677 | await storage.initialize()
678 |
679 | try:
680 | from mcp_memory_service.web.app import app
681 | set_storage(storage)
682 |
683 | # Store memories with tags
684 | service = MemoryService(storage=storage)
685 | await service.store_memory(
686 | content="Important work item",
687 | tags=["important", "work"],
688 | memory_type="task"
689 | )
690 | await service.store_memory(
691 | content="Personal note",
692 | tags=["personal"],
693 | memory_type="note"
694 | )
695 |
696 | # Search by tag via HTTP
697 | client = TestClient(app)
698 | response = client.post(
699 | "/api/search/by-tag",
700 | json={"tags": ["important"], "limit": 10}
701 | )
702 |
703 | # Verify response
704 | assert response.status_code == 200
705 | data = response.json()
706 | assert len(data["memories"]) == 1
707 | assert "important" in data["memories"][0]["tags"]
708 |
709 | finally:
710 | storage.close()
711 |
712 |
713 | @pytest.mark.asyncio
714 | @pytest.mark.integration
715 | async def test_http_api_get_memory_by_hash_endpoint(temp_db):
716 | """
717 | Test GET /api/memories/{hash} endpoint with real HTTP request.
718 |
719 | Verifies retrieving specific memory by hash works.
720 | """
721 | storage = SqliteVecMemoryStorage(temp_db)
722 | await storage.initialize()
723 |
724 | try:
725 | from mcp_memory_service.web.app import app
726 | set_storage(storage)
727 |
728 | # Store a memory
729 | service = MemoryService(storage=storage)
730 | store_result = await service.store_memory(
731 | content="Memory to retrieve",
732 | tags=["test"],
733 | memory_type="note"
734 | )
735 | content_hash = store_result["memory"]["content_hash"]
736 |
737 | # Retrieve via HTTP
738 | client = TestClient(app)
739 | response = client.get(f"/api/memories/{content_hash}")
740 |
741 | # Verify response
742 | assert response.status_code == 200
743 | data = response.json()
744 | assert data["content"] == "Memory to retrieve"
745 | assert data["content_hash"] == content_hash
746 |
747 | finally:
748 | storage.close()
749 |
750 |
751 | @pytest.mark.asyncio
752 | @pytest.mark.integration
753 | async def test_http_api_delete_memory_endpoint(temp_db):
754 | """
755 | Test DELETE /api/memories/{hash} endpoint with real HTTP request.
756 |
757 | Verifies deletion works through HTTP API.
758 | """
759 | storage = SqliteVecMemoryStorage(temp_db)
760 | await storage.initialize()
761 |
762 | try:
763 | from mcp_memory_service.web.app import app
764 | set_storage(storage)
765 |
766 | # Store a memory
767 | service = MemoryService(storage=storage)
768 | store_result = await service.store_memory(
769 | content="Memory to delete",
770 | tags=["test"],
771 | memory_type="note"
772 | )
773 | content_hash = store_result["memory"]["content_hash"]
774 |
775 | # Delete via HTTP
776 | client = TestClient(app)
777 | response = client.delete(f"/api/memories/{content_hash}")
778 |
779 | # Verify response
780 | assert response.status_code == 200
781 | data = response.json()
782 | assert data["success"] is True
783 |
784 | # Verify memory is gone
785 | get_response = client.get(f"/api/memories/{content_hash}")
786 | assert get_response.status_code == 404
787 |
788 | finally:
789 | storage.close()
790 |
791 |
792 | @pytest.mark.asyncio
793 | @pytest.mark.integration
794 | async def test_http_api_pagination_with_real_data(temp_db):
795 | """
796 | Test pagination through HTTP API with multiple pages.
797 |
798 | Verifies database-level pagination prevents O(n) loading.
799 | """
800 | storage = SqliteVecMemoryStorage(temp_db)
801 | await storage.initialize()
802 |
803 | try:
804 | from mcp_memory_service.web.app import app
805 | set_storage(storage)
806 |
807 | # Store 25 memories
808 | service = MemoryService(storage=storage)
809 | for i in range(25):
810 | await service.store_memory(
811 | content=f"Pagination test {i}",
812 | tags=["pagination"],
813 | memory_type="note"
814 | )
815 |
816 | client = TestClient(app)
817 |
818 | # Page 1: First 10
819 | response1 = client.get("/api/memories?page=1&page_size=10")
820 | assert response1.status_code == 200
821 | data1 = response1.json()
822 | assert len(data1["memories"]) == 10
823 | assert data1["total"] == 25
824 | assert data1["has_more"] is True
825 |
826 | # Page 2: Next 10
827 | response2 = client.get("/api/memories?page=2&page_size=10")
828 | data2 = response2.json()
829 | assert len(data2["memories"]) == 10
830 | assert data2["has_more"] is True
831 |
832 | # Page 3: Last 5
833 | response3 = client.get("/api/memories?page=3&page_size=10")
834 | data3 = response3.json()
835 | assert len(data3["memories"]) == 5
836 | assert data3["has_more"] is False
837 |
838 | finally:
839 | storage.close()
840 |
841 |
842 | @pytest.mark.asyncio
843 | @pytest.mark.integration
844 | async def test_http_api_error_handling_invalid_json(temp_db):
845 | """
846 | Test that HTTP API handles malformed JSON gracefully.
847 |
848 | This would have caught v8.12.0 syntax errors.
849 | """
850 | storage = SqliteVecMemoryStorage(temp_db)
851 | await storage.initialize()
852 |
853 | try:
854 | from mcp_memory_service.web.app import app
855 | set_storage(storage)
856 |
857 | client = TestClient(app)
858 |
859 | # Send malformed JSON
860 | response = client.post(
861 | "/api/memories",
862 | data="{'this': 'is not valid json}", # Missing quote
863 | headers={"Content-Type": "application/json"}
864 | )
865 |
866 | # Should return 400 or 422, not 500
867 | assert response.status_code in [400, 422]
868 |
869 | finally:
870 | storage.close()
871 |
872 |
873 | @pytest.mark.asyncio
874 | @pytest.mark.integration
875 | async def test_http_api_client_hostname_header(temp_db):
876 | """
877 | Test that X-Client-Hostname header is processed correctly.
878 |
879 | Verifies hostname tagging works through real HTTP request.
880 | """
881 | storage = SqliteVecMemoryStorage(temp_db)
882 | await storage.initialize()
883 |
884 | try:
885 | from mcp_memory_service.web.app import app
886 | set_storage(storage)
887 |
888 | client = TestClient(app)
889 |
890 | # Send request with hostname header
891 | response = client.post(
892 | "/api/memories",
893 | json={
894 | "content": "Test with hostname",
895 | "tags": ["test"]
896 | },
897 | headers={"X-Client-Hostname": "test-machine"}
898 | )
899 |
900 | # Verify hostname tag added
901 | assert response.status_code == 200
902 | data = response.json()
903 | assert "source:test-machine" in data["memory"]["tags"]
904 | assert data["memory"]["metadata"]["hostname"] == "test-machine"
905 |
906 | finally:
907 | storage.close()
908 |
909 |
910 | @pytest.mark.asyncio
911 | @pytest.mark.integration
912 | async def test_http_api_complete_crud_workflow(temp_db):
913 | """
914 | Complete end-to-end CRUD workflow through real HTTP API.
915 |
916 | This verifies the entire HTTP API stack works correctly.
917 | """
918 | storage = SqliteVecMemoryStorage(temp_db)
919 | await storage.initialize()
920 |
921 | try:
922 | from mcp_memory_service.web.app import app
923 | set_storage(storage)
924 |
925 | client = TestClient(app)
926 |
927 | # CREATE: Store a memory
928 | create_response = client.post(
929 | "/api/memories",
930 | json={
931 | "content": "CRUD test memory",
932 | "tags": ["crud", "test"],
933 | "memory_type": "note"
934 | }
935 | )
936 | assert create_response.status_code == 200
937 | content_hash = create_response.json()["memory"]["content_hash"]
938 |
939 | # READ: List all memories
940 | list_response = client.get("/api/memories")
941 | assert list_response.status_code == 200
942 | assert len(list_response.json()["memories"]) > 0
943 |
944 | # READ: Get specific memory
945 | get_response = client.get(f"/api/memories/{content_hash}")
946 | assert get_response.status_code == 200
947 | assert get_response.json()["content"] == "CRUD test memory"
948 |
949 |         # SEARCH: Find the memory via semantic search (no update endpoint in this workflow)
950 | search_response = client.post(
951 | "/api/search",
952 | json={"query": "CRUD test", "limit": 5}
953 | )
954 | assert search_response.status_code == 200
955 | assert len(search_response.json()["memories"]) > 0
956 |
957 | # DELETE: Remove memory
958 | delete_response = client.delete(f"/api/memories/{content_hash}")
959 | assert delete_response.status_code == 200
960 | assert delete_response.json()["success"] is True
961 |
962 | # VERIFY: Memory is gone
963 | verify_response = client.get(f"/api/memories/{content_hash}")
964 | assert verify_response.status_code == 404
965 |
966 | finally:
967 | storage.close()
968 |
```
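The pagination and performance tests above pin down a simple contract: the service asks storage for exactly one page (limit equals page_size, offset equals (page - 1) * page_size) and derives has_more from the total count, so a 10,000-row table is never loaded for a 20-item page. The sketch below restates that arithmetic with a hypothetical helper name; it only illustrates what the assertions check and is not code from the repository.

```python
# Minimal sketch (assumed helper, not part of the test suite): the pagination
# arithmetic asserted by test_list_memories_performance_with_large_dataset
# and test_http_api_pagination_with_real_data.
def paginate_params(page: int, page_size: int, total: int) -> dict:
    """Compute the limit/offset passed to storage and the has_more flag."""
    offset = (page - 1) * page_size           # page 1 -> offset 0
    return {
        "limit": page_size,                   # never fetch more than one page
        "offset": offset,
        "has_more": offset + page_size < total,
    }

# Values matching the assertions above:
assert paginate_params(1, 20, 10_000) == {"limit": 20, "offset": 0, "has_more": True}
assert paginate_params(3, 10, 25)["has_more"] is False   # last page holds 5 of 25
```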
--------------------------------------------------------------------------------
/src/mcp_memory_service/web/api/documents.py:
--------------------------------------------------------------------------------
```python
1 | # Copyright 2024 Heinrich Krupp
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Document Upload API Endpoints
17 |
18 | Provides REST API endpoints for document ingestion through the web dashboard.
19 | Supports single file upload, batch upload, progress tracking, and upload history.
20 | """
21 |
22 | import os
23 | import re
24 | import uuid
25 | import asyncio
26 | import logging
27 | import tempfile
28 | from typing import List, Dict, Any, Optional
29 | from datetime import datetime
30 | from pathlib import Path
31 | from urllib.parse import urlparse, unquote
32 |
33 | from fastapi import APIRouter, UploadFile, File, Form, HTTPException, BackgroundTasks
34 | from fastapi.responses import JSONResponse
35 | from pydantic import BaseModel
36 |
37 | from ...ingestion import get_loader_for_file, SUPPORTED_FORMATS
38 | from ...models.memory import Memory
39 | from ...utils import create_memory_from_chunk, _process_and_store_chunk
40 | from ..dependencies import get_storage
41 | from ...utils.hashing import generate_content_hash  # used by process_single_file_upload
42 | logger = logging.getLogger(__name__)
43 |
44 | router = APIRouter()
45 |
46 | # Constants
47 | MAX_TAG_LENGTH = 100
48 |
49 |
50 | def parse_and_validate_tags(tags: str) -> List[str]:
51 | """
52 | Parse and validate tags from user input.
53 |
54 | Handles comma-separated and space-separated tags, removes file:// prefixes,
55 | sanitizes path separators, and validates tag lengths.
56 |
57 | Args:
58 | tags: Raw tag string (comma or space separated)
59 |
60 | Returns:
61 | List of cleaned and validated tags
62 |
63 | Raises:
64 | HTTPException: If any tag exceeds MAX_TAG_LENGTH
65 | """
66 | if not tags or not tags.strip():
67 | return []
68 |
69 | # Split by comma OR space
70 | raw_tags = tags.replace(',', ' ').split()
71 | tag_list = []
72 |
73 | for tag in raw_tags:
74 | clean_tag = tag.strip()
75 | if not clean_tag:
76 | continue
77 |
78 | # Remove file:// protocol prefixes and extract filename
79 | # Uses urllib.parse for robust handling of URL-encoded chars and different path formats
80 | if clean_tag.startswith('file://'):
81 | path_str = unquote(urlparse(clean_tag).path)
82 | # On Windows, urlparse may add a leading slash (e.g., /C:/...), which needs to be removed
83 | if os.name == 'nt' and path_str.startswith('/') and len(path_str) > 2 and path_str[2] == ':':
84 | path_str = path_str[1:]
85 | clean_tag = Path(path_str).name
86 |
87 | # Remove common path separators to create clean tag names
88 | clean_tag = re.sub(r'[/\\]', '_', clean_tag)
89 |
90 | # Validate tag length - raise error instead of silently dropping
91 | if len(clean_tag) > MAX_TAG_LENGTH:
92 | raise HTTPException(
93 | status_code=400,
94 | detail=f"Tag '{clean_tag[:100]}...' exceeds maximum length of {MAX_TAG_LENGTH} characters. "
95 | f"Please use shorter, more descriptive tags."
96 | )
97 |
98 | tag_list.append(clean_tag)
99 |
100 | return tag_list
101 |
102 |
103 | async def ensure_storage_initialized():
104 | """Ensure storage is initialized for web API usage."""
105 | logger.info("🔍 Checking storage availability...")
106 | try:
107 | # Try to get storage
108 | storage = get_storage()
109 | logger.info("✅ Storage already available")
110 | return storage
111 | except Exception as e:
112 | logger.warning(f"⚠️ Storage not available ({e}), attempting to initialize...")
113 | try:
114 | # Import and initialize storage
115 | from ..dependencies import create_storage_backend, set_storage
116 | logger.info("🏗️ Creating storage backend...")
117 | storage = await create_storage_backend()
118 | set_storage(storage)
119 | logger.info("✅ Storage initialized successfully in API context")
120 | return storage
121 | except Exception as init_error:
122 | logger.error(f"❌ Failed to initialize storage: {init_error}")
123 | logger.error(f"Full error: {str(init_error)}")
124 | import traceback
125 | logger.error(f"Traceback: {traceback.format_exc()}")
126 | # Don't raise HTTPException here since this is called from background tasks
127 | raise init_error
128 |
129 | # In-memory storage for upload tracking (in production, use database)
130 | upload_sessions = {}
131 |
132 | # Note: UploadRequest and BatchUploadRequest models removed - not used
133 | # Endpoints read parameters directly from form data
134 |
135 | class UploadStatus(BaseModel):
136 | upload_id: str
137 | status: str # queued, processing, completed, failed
138 | filename: str = ""
139 | file_size: int = 0
140 | chunks_processed: int = 0
141 | chunks_stored: int = 0
142 | total_chunks: int = 0
143 | progress: float = 0.0
144 | errors: List[str] = []
145 | created_at: datetime
146 | completed_at: Optional[datetime] = None
147 |
148 | @router.post("/upload", response_model=Dict[str, Any])
149 | async def upload_document(
150 | background_tasks: BackgroundTasks,
151 | file: UploadFile = File(...),
152 | tags: str = Form(""),
153 | chunk_size: int = Form(1000),
154 | chunk_overlap: int = Form(200),
155 | memory_type: str = Form("document")
156 | ):
157 | """
158 | Upload and ingest a single document.
159 |
160 | Args:
161 | file: The document file to upload
162 | tags: Comma-separated list of tags
163 | chunk_size: Target chunk size in characters
164 | chunk_overlap: Chunk overlap in characters
165 | memory_type: Type label for memories
166 |
167 | Returns:
168 | Upload session information with ID for tracking
169 |
170 | Uses FastAPI BackgroundTasks for proper async processing.
171 | """
172 | logger.info(f"🚀 Document upload endpoint called with file: {file.filename}")
173 | try:
174 | # Read file content
175 | file_content = await file.read()
176 | file_size = len(file_content)
177 | logger.info(f"File content length: {file_size} bytes")
178 |
179 | # Validate file type
180 | file_ext = Path(file.filename).suffix.lower().lstrip('.')
181 | if file_ext not in SUPPORTED_FORMATS:
182 | supported = ", ".join(f".{ext}" for ext in SUPPORTED_FORMATS.keys())
183 | raise HTTPException(
184 | status_code=400,
185 | detail=f"Unsupported file type: .{file_ext}. Supported: {supported}"
186 | )
187 |
188 | # Parse and validate tags
189 | tag_list = parse_and_validate_tags(tags)
190 |
191 | # Create upload session
192 | upload_id = str(uuid.uuid4())
193 |
194 | # Create secure temporary file (avoids path traversal vulnerability)
195 | # Extract safe file extension for suffix
196 | file_ext = Path(file.filename).suffix if file.filename else ""
197 | temp_file = tempfile.NamedTemporaryFile(
198 | delete=False,
199 | prefix=f"{upload_id}_",
200 | suffix=file_ext
201 | )
202 | temp_path = temp_file.name
203 |
204 | # Save uploaded file temporarily
205 | with temp_file:
206 | temp_file.write(file_content)
207 |
208 | # Initialize upload session
209 | session = UploadStatus(
210 | upload_id=upload_id,
211 | status="queued",
212 | filename=file.filename,
213 | file_size=file_size,
214 | created_at=datetime.now()
215 | )
216 | upload_sessions[upload_id] = session
217 |
218 | # Start background processing
219 | background_tasks.add_task(
220 | process_single_file_upload,
221 | upload_id,
222 | temp_path,
223 | file.filename,
224 | tag_list,
225 | chunk_size,
226 | chunk_overlap,
227 | memory_type
228 | )
229 |
230 | return {
231 | "upload_id": upload_id,
232 | "status": "queued",
233 | "message": f"Document {file.filename} queued for processing"
234 | }
235 |
236 | except Exception as e:
237 | logger.error(f"Upload error: {str(e)}")
238 | raise HTTPException(status_code=500, detail=str(e))
239 |
240 | @router.post("/batch-upload", response_model=Dict[str, Any])
241 | async def batch_upload_documents(
242 | background_tasks: BackgroundTasks,
243 | files: List[UploadFile] = File(...),
244 | tags: str = Form(""),
245 | chunk_size: int = Form(1000),
246 | chunk_overlap: int = Form(200),
247 | memory_type: str = Form("document")
248 | ):
249 | """
250 | Upload and ingest multiple documents in batch.
251 |
252 | Args:
253 | files: List of document files to upload
254 | tags: Comma-separated list of tags
255 | chunk_size: Target chunk size in characters
256 | chunk_overlap: Chunk overlap in characters
257 | memory_type: Type label for memories
258 |
259 | Returns:
260 | Batch upload session information
261 | """
262 | try:
263 | if not files:
264 | raise HTTPException(status_code=400, detail="No files provided")
265 |
266 | # Parse and validate tags
267 | tag_list = parse_and_validate_tags(tags)
268 |
269 | # Create batch upload session
270 | batch_id = str(uuid.uuid4())
271 | temp_paths = []
272 |
273 | # Validate and save all files
274 | for file in files:
275 | file_ext = Path(file.filename).suffix.lower().lstrip('.')
276 | if file_ext not in SUPPORTED_FORMATS:
277 | supported = ", ".join(f".{ext}" for ext in SUPPORTED_FORMATS.keys())
278 | raise HTTPException(
279 | status_code=400,
280 | detail=f"Unsupported file type for {file.filename}: {file_ext}. Supported: {supported}"
281 | )
282 |
283 | # Create secure temporary file (avoids path traversal vulnerability)
284 | content = await file.read()
285 | safe_ext = Path(file.filename).suffix.lower() if file.filename else ""
286 | temp_file = tempfile.NamedTemporaryFile(
287 | delete=False,
288 | prefix=f"{batch_id}_",
289 | suffix=safe_ext
290 | )
291 | temp_path = temp_file.name
292 | with temp_file:
293 | temp_file.write(content)
294 | temp_paths.append((file.filename, temp_path, len(content)))
295 |
296 | # Calculate total file size for the batch
297 | total_file_size = sum(file_size for _, _, file_size in temp_paths)
298 |
299 | # Initialize batch session
300 | session = UploadStatus(
301 | upload_id=batch_id,
302 | status="queued",
303 | filename=f"Batch ({len(files)} files)",
304 | file_size=total_file_size,
305 | created_at=datetime.now()
306 | )
307 | upload_sessions[batch_id] = session
308 |
309 | # Start background processing
310 | background_tasks.add_task(
311 | process_batch_upload,
312 | batch_id,
313 | temp_paths,
314 | tag_list,
315 | chunk_size,
316 | chunk_overlap,
317 | memory_type
318 | )
319 |
320 | return {
321 | "upload_id": batch_id,
322 | "status": "queued",
323 | "message": f"Batch of {len(files)} documents queued for processing"
324 | }
325 |
326 | except Exception as e:
327 | logger.error(f"Batch upload error: {str(e)}")
328 | raise HTTPException(status_code=500, detail=str(e))
329 |
330 | @router.get("/status/{upload_id}", response_model=UploadStatus)
331 | async def get_upload_status(upload_id: str):
332 | """
333 | Get the status of an upload session.
334 |
335 | Args:
336 | upload_id: The upload session ID
337 |
338 | Returns:
339 | Current upload status
340 | """
341 | if upload_id not in upload_sessions:
342 | raise HTTPException(status_code=404, detail="Upload session not found")
343 |
344 | return upload_sessions[upload_id]
345 |
346 | @router.get("/history", response_model=Dict[str, List[Dict[str, Any]]])
347 | async def get_upload_history():
348 | """
349 | Get the history of all uploads.
350 |
351 | Returns:
352 | List of completed uploads with metadata
353 | """
354 | logger.info("Documents history endpoint called")
355 | try:
356 |         # History is built from the in-memory upload_sessions dict, so it resets on restart;
357 |         # in production this would query a persistent database instead
358 | history = []
359 | for session in upload_sessions.values():
360 | if session.status in ["completed", "failed"]:
361 | history.append({
362 | "upload_id": session.upload_id,
363 | "filename": session.filename,
364 | "file_size": session.file_size,
365 | "status": session.status,
366 | "chunks_processed": session.chunks_processed,
367 | "chunks_stored": session.chunks_stored,
368 | "progress": session.progress,
369 | "errors": session.errors,
370 | "created_at": session.created_at.isoformat(),
371 | "completed_at": session.completed_at.isoformat() if session.completed_at else None
372 | })
373 |
374 | # Sort by creation time, most recent first
375 | history.sort(key=lambda x: x["created_at"], reverse=True)
376 |
377 | return {"uploads": history}
378 | except Exception as e:
379 | logger.error(f"Error in get_upload_history: {e}")
380 | # Return empty history on error so the UI doesn't break
381 | return {"uploads": []}
382 |
383 | async def process_single_file_upload(
384 | upload_id: str,
385 | file_path: str,
386 | filename: str,
387 | tags: List[str],
388 | chunk_size: int,
389 | chunk_overlap: int,
390 | memory_type: str
391 | ):
392 | """Background task to process a single document upload."""
393 | try:
394 | logger.info(f"Starting document processing: {upload_id} - {filename}")
395 | session = upload_sessions[upload_id]
396 | session.status = "processing"
397 |
398 | # Get storage
399 | storage = await ensure_storage_initialized()
400 |
401 | # Get appropriate loader
402 | file_path_obj = Path(file_path)
403 | loader = get_loader_for_file(file_path_obj)
404 | if loader is None:
405 | raise ValueError(f"No loader available for file: {filename}")
406 |
407 | # Configure loader
408 | loader.chunk_size = chunk_size
409 | loader.chunk_overlap = chunk_overlap
410 |
411 | chunks_processed = 0
412 | chunks_stored = 0
413 |
414 | # Process chunks from the file
415 | async for chunk in loader.extract_chunks(file_path_obj):
416 | chunks_processed += 1
417 |
418 | try:
419 | # Add file-specific tags
420 | all_tags = tags.copy()
421 | all_tags.append(f"source_file:{filename}")
422 | all_tags.append(f"file_type:{file_path_obj.suffix.lstrip('.')}")
423 | all_tags.append(f"upload_id:{upload_id}")
424 |
425 | if chunk.metadata.get('tags'):
426 | # Handle tags from chunk metadata (can be string or list)
427 | chunk_tags = chunk.metadata['tags']
428 | if isinstance(chunk_tags, str):
429 | # Split comma-separated string into list
430 | chunk_tags = [tag.strip() for tag in chunk_tags.split(',') if tag.strip()]
431 | all_tags.extend(chunk_tags)
432 |
433 | # Add upload_id to metadata
434 | chunk_metadata = chunk.metadata.copy() if chunk.metadata else {}
435 | chunk_metadata['upload_id'] = upload_id
436 | chunk_metadata['source_file'] = filename
437 |
438 | # Create memory object
439 | memory = Memory(
440 | content=chunk.content,
441 | content_hash=generate_content_hash(chunk.content, chunk_metadata),
442 | tags=list(set(all_tags)), # Remove duplicates
443 | memory_type=memory_type,
444 | metadata=chunk_metadata
445 | )
446 |
447 | # Store the memory
448 | success, error = await storage.store(memory)
449 | if success:
450 | chunks_stored += 1
451 | else:
452 | session.errors.append(f"Chunk {chunk.chunk_index}: {error}")
453 |
454 | except Exception as e:
455 | session.errors.append(f"Chunk {chunk.chunk_index}: {str(e)}")
456 |
457 | # Update progress
458 | session.chunks_processed = chunks_processed
459 | session.chunks_stored = chunks_stored
460 |             session.progress = (chunks_processed / max(chunks_processed, 1)) * 100  # always 100% once a chunk is processed; total chunk count is unknown while streaming
461 |
462 | # Mark as completed
463 | session.status = "completed"
464 | session.completed_at = datetime.now()
465 | session.progress = 100.0
466 |
467 | logger.info(f"Document processing completed: {upload_id}, {chunks_stored}/{chunks_processed} chunks")
468 | return {"chunks_processed": chunks_processed, "chunks_stored": chunks_stored}
469 |
470 | except Exception as e:
471 | logger.error(f"Document processing error: {str(e)}")
472 | session = upload_sessions.get(upload_id)
473 | if session:
474 | session.status = "failed"
475 | session.errors.append(str(e))
476 | session.completed_at = datetime.now()
477 | finally:
478 | # Clean up temp file (always executed)
479 | try:
480 | os.unlink(file_path)
481 | except Exception as cleanup_error:
482 | logger.debug(f"Could not delete temp file {file_path}: {cleanup_error}")
483 |
484 |
485 | async def process_batch_upload(
486 | batch_id: str,
487 | file_info: List[tuple], # (filename, temp_path, size)
488 | tags: List[str],
489 | chunk_size: int,
490 | chunk_overlap: int,
491 | memory_type: str
492 | ):
493 | """Background task to process a batch document upload."""
494 | try:
495 | logger.info(f"Starting batch processing: {batch_id}")
496 | session = upload_sessions[batch_id]
497 | session.status = "processing"
498 |
499 | # Get storage
500 | storage = await ensure_storage_initialized()
501 |
502 | total_files = len(file_info)
503 | processed_files = 0
504 | total_chunks_processed = 0
505 | total_chunks_stored = 0
506 | all_errors = []
507 |
508 | for filename, temp_path, file_size in file_info:
509 | try:
510 | # Get appropriate loader
511 | file_path_obj = Path(temp_path)
512 | loader = get_loader_for_file(file_path_obj)
513 | if loader is None:
514 | all_errors.append(f"{filename}: No loader available")
515 | processed_files += 1
516 | continue
517 |
518 | # Configure loader
519 | loader.chunk_size = chunk_size
520 | loader.chunk_overlap = chunk_overlap
521 |
522 | file_chunks_processed = 0
523 | file_chunks_stored = 0
524 |
525 | # Process chunks from this file
526 | async for chunk in loader.extract_chunks(file_path_obj):
527 | file_chunks_processed += 1
528 | total_chunks_processed += 1
529 |
530 | # Process and store the chunk
531 | success, error = await _process_and_store_chunk(
532 | chunk,
533 | storage,
534 | filename,
535 | base_tags=tags.copy(),
536 | memory_type=memory_type,
537 | context_tags={
538 | "source_file": filename,
539 | "file_type": file_path_obj.suffix.lstrip('.'),
540 | "upload_id": batch_id
541 | },
542 | extra_metadata={
543 | "upload_id": batch_id,
544 | "source_file": filename
545 | }
546 | )
547 |
548 | if success:
549 | file_chunks_stored += 1
550 | total_chunks_stored += 1
551 | else:
552 | all_errors.append(error)
553 |
554 | processed_files += 1
555 |
556 | except Exception as e:
557 | all_errors.append(f"{filename}: {str(e)}")
558 | processed_files += 1
559 |
560 | finally:
561 | # Clean up temp file (always executed)
562 | try:
563 | os.unlink(temp_path)
564 | except Exception as cleanup_error:
565 | logger.debug(f"Could not delete temp file {temp_path}: {cleanup_error}")
566 |
567 | # Finalize batch
568 | session.status = "completed" if total_chunks_stored > 0 else "failed"
569 | session.completed_at = datetime.now()
570 | session.chunks_processed = total_chunks_processed
571 | session.chunks_stored = total_chunks_stored
572 | session.progress = 100.0
573 | session.errors = all_errors
574 |
575 | logger.info(f"Batch processing completed: {batch_id}, {total_chunks_stored}/{total_chunks_processed} chunks")
576 |
577 | except Exception as e:
578 | logger.error(f"Batch processing error: {str(e)}")
579 | session = upload_sessions.get(batch_id)
580 | if session:
581 | session.status = "failed"
582 | session.errors.append(str(e))
583 | session.completed_at = datetime.now()
584 | # Note: send_progress_update removed - progress tracking via polling instead
585 |
586 | # Clean up old completed sessions periodically
587 | @router.on_event("startup") # TODO: Migrate to lifespan context manager in app.py (FastAPI 0.109+)
588 | async def cleanup_old_sessions():
589 | """Clean up old completed upload sessions."""
590 | async def cleanup():
591 | while True:
592 | await asyncio.sleep(3600) # Clean up every hour
593 | current_time = datetime.now()
594 | to_remove = []
595 |
596 | for upload_id, session in upload_sessions.items():
597 | if session.status in ["completed", "failed"]:
598 | # Keep sessions for 24 hours after completion
599 | if session.completed_at and (current_time - session.completed_at).total_seconds() > 86400:
600 | to_remove.append(upload_id)
601 |
602 | for upload_id in to_remove:
603 | del upload_sessions[upload_id]
604 | logger.debug(f"Cleaned up old upload session: {upload_id}")
605 |
606 | asyncio.create_task(cleanup())
607 |
608 | @router.delete("/remove/{upload_id}")
609 | async def remove_document(upload_id: str, remove_from_memory: bool = True):
610 | """
611 | Remove a document and optionally its memories.
612 |
613 | Args:
614 | upload_id: The upload session ID
615 | remove_from_memory: Whether to delete associated memories (default: True)
616 |
617 | Returns:
618 | Removal status with count of memories deleted
619 | """
620 | logger.info(f"Remove document request for upload_id: {upload_id}, remove_from_memory: {remove_from_memory}")
621 |
622 | # Get session info if available (may not exist after server restart)
623 | session = upload_sessions.get(upload_id)
624 | filename = session.filename if session else "Unknown file"
625 | memories_deleted = 0
626 |
627 | try:
628 | if remove_from_memory:
629 | # Get storage
630 | storage = get_storage()
631 |
632 | # Search by tag pattern: upload_id:{upload_id}
633 | upload_tag = f"upload_id:{upload_id}"
634 | logger.info(f"Searching for memories with tag: {upload_tag}")
635 |
636 | try:
637 | # Delete all memories with this upload_id tag
638 | count, _ = await storage.delete_by_tags([upload_tag])
639 | memories_deleted = count
640 | logger.info(f"Deleted {memories_deleted} memories with tag {upload_tag}")
641 |
642 |                 # Without session info the original filename cannot be recovered
643 |                 if memories_deleted > 0 and not session:
644 |                     # The matching memories were already deleted above, so fall
645 |                     # back to a generic label derived from the upload ID
646 |                     filename = f"Document (upload_id: {upload_id[:8]}...)"
647 |
648 | except Exception as e:
649 | logger.warning(f"Could not delete memories by tag: {e}")
650 | # If deletion fails and we don't know about this upload, return 404
651 | if not session:
652 | raise HTTPException(
653 | status_code=404,
654 | detail=f"Upload ID not found and no memories with tag '{upload_tag}'"
655 | )
656 | memories_deleted = 0
657 |
658 | # Remove upload session if it exists
659 | if session:
660 | del upload_sessions[upload_id]
661 |
662 | return {
663 | "status": "success",
664 | "upload_id": upload_id,
665 | "filename": filename,
666 | "memories_deleted": memories_deleted,
667 | "message": f"Document '{filename}' removed successfully"
668 | }
669 |
670 | except HTTPException:
671 | raise
672 | except Exception as e:
673 | logger.error(f"Error removing document: {str(e)}")
674 | import traceback
675 | logger.error(f"Traceback: {traceback.format_exc()}")
676 | raise HTTPException(status_code=500, detail=f"Failed to remove document: {str(e)}")
677 |
678 | @router.delete("/remove-by-tags")
679 | async def remove_documents_by_tags(tags: List[str]):
680 | """
681 | Remove documents by their tags.
682 |
683 | Args:
684 | tags: List of tags to search for
685 |
686 | Returns:
687 | Removal status with affected upload IDs and memory counts
688 | """
689 | logger.info(f"Remove documents by tags request: {tags}")
690 |
691 | try:
692 | # Get storage
693 | storage = get_storage()
694 |
695 | # Delete memories by tags
696 |         # delete_by_tags returns a (count, message) tuple, matching its use in remove_document above
697 |         memories_deleted, _ = await storage.delete_by_tags(tags)
698 |
699 | # Find and remove affected upload sessions
700 | affected_sessions = []
701 | to_remove = []
702 |
703 | for upload_id, session in upload_sessions.items():
704 | # Check if any of the document's tags match
705 | # This requires storing tags in the session object
706 | # For now, just track all sessions (placeholder)
707 | pass
708 |
709 | return {
710 | "status": "success",
711 | "tags": tags,
712 | "memories_deleted": memories_deleted,
713 | "affected_uploads": affected_sessions,
714 | "message": f"Deleted {memories_deleted} memories matching tags"
715 | }
716 |
717 | except Exception as e:
718 | logger.error(f"Error removing documents by tags: {str(e)}")
719 | raise HTTPException(status_code=500, detail=f"Failed to remove documents: {str(e)}")
720 |
721 | @router.get("/search-content/{upload_id}")
722 | async def search_document_content(upload_id: str, limit: int = 1000):
723 | """
724 | Search for all memories associated with an upload.
725 |
726 | Args:
727 | upload_id: The upload session ID
728 | limit: Maximum number of results to return (default: 1000)
729 |
730 | Returns:
731 | List of memories with their content and metadata
732 | """
733 | logger.info(f"Search document content for upload_id: {upload_id}, limit: {limit}")
734 |
735 | # Get session info if available (may not exist after server restart)
736 | session = upload_sessions.get(upload_id)
737 |
738 | # If no session, we'll still try to find memories by upload_id tag
739 | if not session:
740 | logger.info(f"No upload session found for {upload_id}, searching by tag only")
741 |
742 | try:
743 | # Get storage
744 | storage = get_storage()
745 |
746 | # Search for memories with upload_id tag
747 | upload_tag = f"upload_id:{upload_id}"
748 | logger.info(f"Searching for memories with tag: {upload_tag}")
749 |
750 | # Use tag search (search_by_tags doesn't support limit parameter)
751 | all_memories = await storage.search_by_tags([upload_tag])
752 |
753 | # If no memories found and no session, this upload_id doesn't exist
754 | if not all_memories and not session:
755 | raise HTTPException(status_code=404, detail=f"No memories found for upload_id: {upload_id}")
756 |
757 | # Apply limit after retrieval
758 | memories = all_memories[:limit] if limit and limit > 0 else all_memories
759 |
760 | # Format results
761 | results = []
762 | for memory in memories:
763 | # Handle created_at (stored as float timestamp)
764 | created_at_str = None
765 | if memory.created_at:
766 | if isinstance(memory.created_at, float):
767 | created_at_str = datetime.fromtimestamp(memory.created_at).isoformat()
768 | elif hasattr(memory.created_at, 'isoformat'):
769 | created_at_str = memory.created_at.isoformat()
770 |
771 | results.append({
772 | "content_hash": memory.content_hash,
773 | "content": memory.content,
774 | "tags": memory.tags,
775 | "metadata": memory.metadata,
776 | "created_at": created_at_str,
777 | "chunk_index": memory.metadata.get('chunk_index', 0) if memory.metadata else 0,
778 | "page": memory.metadata.get('page', None) if memory.metadata else None
779 | })
780 |
781 | # Sort by chunk index
782 | results.sort(key=lambda x: x.get('chunk_index', 0))
783 |
784 | # Get filename from session or from first memory's metadata
785 | filename = session.filename if session else None
786 | if not filename and results:
787 | # Try to get from first memory's metadata
788 | first_memory_metadata = results[0].get('metadata', {})
789 | filename = first_memory_metadata.get('source_file', f"Document (upload_id: {upload_id[:8]}...)")
790 |
791 | return {
792 | "status": "success",
793 | "upload_id": upload_id,
794 | "filename": filename or "Unknown Document",
795 | "total_found": len(results),
796 | "memories": results
797 | }
798 |
799 | except Exception as e:
800 | logger.error(f"Error searching document content: {str(e)}")
801 | # Get filename from session if available
802 | filename = session.filename if session else f"Document (upload_id: {upload_id[:8]}...)"
803 | # Return empty results instead of error to avoid breaking UI
804 | return {
805 | "status": "partial",
806 | "upload_id": upload_id,
807 | "filename": filename,
808 | "total_found": 0,
809 | "memories": [],
810 | "error": str(e)
811 | }
812 |
```
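The document API above follows a simple upload lifecycle: queue a file, let the background task chunk and store it, poll /status/{upload_id}, then inspect or remove the stored chunks via the upload_id tag. The sketch below shows that flow from the client side using the same TestClient pattern as the integration tests; the /api/documents prefix is an assumption (the real mount prefix is set where app.py includes this router and is not visible in this file), and the example file type must be one of SUPPORTED_FORMATS.

```python
# Minimal client-side sketch of the upload lifecycle; route prefix is assumed.
from fastapi.testclient import TestClient
from mcp_memory_service.web.app import app

client = TestClient(app)

# 1. Upload a document (multipart form, matching the File/Form parameters above).
with open("notes.md", "rb") as fh:  # file extension must be in SUPPORTED_FORMATS
    resp = client.post(
        "/api/documents/upload",
        files={"file": ("notes.md", fh, "text/markdown")},
        data={"tags": "docs,example", "chunk_size": "1000", "memory_type": "document"},
    )
upload_id = resp.json()["upload_id"]

# 2. Poll the background task (status: queued -> processing -> completed/failed).
status = client.get(f"/api/documents/status/{upload_id}").json()

# 3. Inspect the stored chunks, then remove the document and its memories.
chunks = client.get(f"/api/documents/search-content/{upload_id}").json()
client.delete(f"/api/documents/remove/{upload_id}")
```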
--------------------------------------------------------------------------------
/tests/test_sqlite_vec_storage.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Comprehensive tests for SQLite-vec storage backend.
3 | """
4 |
5 | import pytest
6 | import pytest_asyncio
7 | import asyncio
8 | import tempfile
9 | import os
10 | import shutil
11 | import json
12 | from unittest.mock import Mock, patch
13 | import time
14 |
15 | # Skip tests if sqlite-vec is not available
16 | try:
17 | import sqlite_vec
18 | SQLITE_VEC_AVAILABLE = True
19 | except ImportError:
20 | SQLITE_VEC_AVAILABLE = False
21 |
22 | from src.mcp_memory_service.models.memory import Memory, MemoryQueryResult
23 | from src.mcp_memory_service.utils.hashing import generate_content_hash
24 |
25 | if SQLITE_VEC_AVAILABLE:
26 | from src.mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
27 |
28 | # Skip all tests if sqlite-vec is not available
29 | pytestmark = pytest.mark.skipif(not SQLITE_VEC_AVAILABLE, reason="sqlite-vec not available")
30 |
31 |
32 | class TestSqliteVecStorage:
33 | """Test suite for SQLite-vec storage functionality."""
34 |
35 | @pytest_asyncio.fixture
36 | async def storage(self):
37 | """Create a test storage instance."""
38 | temp_dir = tempfile.mkdtemp()
39 | db_path = os.path.join(temp_dir, "test_memory.db")
40 |
41 | storage = SqliteVecMemoryStorage(db_path)
42 | await storage.initialize()
43 |
44 | yield storage
45 |
46 | # Cleanup
47 | if storage.conn:
48 | storage.conn.close()
49 | shutil.rmtree(temp_dir, ignore_errors=True)
50 |
51 | @pytest.fixture
52 | def sample_memory(self):
53 | """Create a sample memory for testing."""
54 | content = "This is a test memory for SQLite-vec storage"
55 | return Memory(
56 | content=content,
57 | content_hash=generate_content_hash(content),
58 | tags=["test", "sqlite-vec"],
59 | memory_type="note",
60 | metadata={"priority": "medium", "category": "testing"}
61 | )
62 |
63 | @pytest.mark.asyncio
64 | async def test_initialization(self):
65 | """Test storage initialization."""
66 | temp_dir = tempfile.mkdtemp()
67 | db_path = os.path.join(temp_dir, "test_init.db")
68 |
69 | try:
70 | storage = SqliteVecMemoryStorage(db_path)
71 | await storage.initialize()
72 |
73 | # Check that database file was created
74 | assert os.path.exists(db_path)
75 |
76 | # Check that connection is established
77 | assert storage.conn is not None
78 |
79 | # Check that table was created
80 | cursor = storage.conn.execute('''
81 | SELECT name FROM sqlite_master
82 | WHERE type='table' AND name='memories'
83 | ''')
84 | assert cursor.fetchone() is not None
85 |
86 | storage.close()
87 |
88 | finally:
89 | shutil.rmtree(temp_dir, ignore_errors=True)
90 |
91 | @pytest.mark.asyncio
92 | async def test_store_memory(self, storage, sample_memory):
93 | """Test storing a memory."""
94 | success, message = await storage.store(sample_memory)
95 |
96 | assert success
97 | assert "successfully" in message.lower()
98 |
99 | # Verify memory was stored
100 | cursor = storage.conn.execute(
101 | 'SELECT content_hash FROM memories WHERE content_hash = ?',
102 | (sample_memory.content_hash,)
103 | )
104 | result = cursor.fetchone()
105 | assert result is not None
106 | assert result[0] == sample_memory.content_hash
107 |
108 | @pytest.mark.asyncio
109 | async def test_store_duplicate_memory(self, storage, sample_memory):
110 | """Test that duplicate memories are rejected."""
111 | # Store the memory first time
112 | success, message = await storage.store(sample_memory)
113 | assert success
114 |
115 | # Try to store the same memory again
116 | success, message = await storage.store(sample_memory)
117 | assert not success
118 | assert "duplicate" in message.lower()
119 |
120 | @pytest.mark.asyncio
121 | async def test_retrieve_memory(self, storage, sample_memory):
122 | """Test retrieving memories using semantic search."""
123 | # Store the memory
124 | await storage.store(sample_memory)
125 |
126 | # Retrieve using semantic search
127 | results = await storage.retrieve("test memory sqlite", n_results=5)
128 |
129 | assert len(results) > 0
130 | assert isinstance(results[0], MemoryQueryResult)
131 | assert results[0].memory.content_hash == sample_memory.content_hash
132 | assert results[0].relevance_score >= 0.0
133 | assert results[0].debug_info["backend"] == "sqlite-vec"
134 |
135 | @pytest.mark.asyncio
136 | async def test_retrieve_no_results(self, storage):
137 | """Test retrieving when no memories match."""
138 | results = await storage.retrieve("nonexistent query", n_results=5)
139 | assert len(results) == 0
140 |
141 | @pytest.mark.asyncio
142 | async def test_search_by_tag(self, storage, sample_memory):
143 | """Test searching memories by tags."""
144 | # Store the memory
145 | await storage.store(sample_memory)
146 |
147 | # Search by existing tag
148 | results = await storage.search_by_tag(["test"])
149 | assert len(results) == 1
150 | assert results[0].content_hash == sample_memory.content_hash
151 |
152 | # Search by non-existent tag
153 | results = await storage.search_by_tag(["nonexistent"])
154 | assert len(results) == 0
155 |
156 | # Search by multiple tags
157 | results = await storage.search_by_tag(["test", "sqlite-vec"])
158 | assert len(results) == 1
159 |
160 | @pytest.mark.asyncio
161 | async def test_search_by_empty_tags(self, storage):
162 | """Test searching with empty tags list."""
163 | results = await storage.search_by_tag([])
164 | assert len(results) == 0
165 |
166 | @pytest.mark.asyncio
167 | async def test_delete_memory(self, storage, sample_memory):
168 | """Test deleting a memory by content hash."""
169 | # Store the memory
170 | await storage.store(sample_memory)
171 |
172 | # Delete the memory
173 | success, message = await storage.delete(sample_memory.content_hash)
174 | assert success
175 | assert sample_memory.content_hash in message
176 |
177 | # Verify memory was deleted
178 | cursor = storage.conn.execute(
179 | 'SELECT content_hash FROM memories WHERE content_hash = ?',
180 | (sample_memory.content_hash,)
181 | )
182 | assert cursor.fetchone() is None
183 |
184 | @pytest.mark.asyncio
185 | async def test_delete_nonexistent_memory(self, storage):
186 | """Test deleting a non-existent memory."""
187 | nonexistent_hash = "nonexistent123456789"
188 | success, message = await storage.delete(nonexistent_hash)
189 | assert not success
190 | assert "not found" in message.lower()
191 |
192 | @pytest.mark.asyncio
193 | async def test_delete_by_tag(self, storage):
194 | """Test deleting memories by tag."""
195 | # Store multiple memories with different tags
196 | memory1 = Memory(
197 | content="Memory 1",
198 | content_hash=generate_content_hash("Memory 1"),
199 | tags=["tag1", "shared"]
200 | )
201 | memory2 = Memory(
202 | content="Memory 2",
203 | content_hash=generate_content_hash("Memory 2"),
204 | tags=["tag2", "shared"]
205 | )
206 | memory3 = Memory(
207 | content="Memory 3",
208 | content_hash=generate_content_hash("Memory 3"),
209 | tags=["tag3"]
210 | )
211 |
212 | await storage.store(memory1)
213 | await storage.store(memory2)
214 | await storage.store(memory3)
215 |
216 | # Delete by shared tag
217 | count, message = await storage.delete_by_tag("shared")
218 | assert count == 2
219 | assert "deleted 2 memories" in message.lower()
220 |
221 | # Verify correct memories were deleted
222 | remaining = await storage.search_by_tag(["tag3"])
223 | assert len(remaining) == 1
224 | assert remaining[0].content_hash == memory3.content_hash
225 |
226 | @pytest.mark.asyncio
227 | async def test_delete_by_nonexistent_tag(self, storage):
228 | """Test deleting by a non-existent tag."""
229 | count, message = await storage.delete_by_tag("nonexistent")
230 | assert count == 0
231 | assert "no memories found" in message.lower()
232 |
233 | @pytest.mark.asyncio
234 | async def test_cleanup_duplicates(self, storage):
235 | """Test cleaning up duplicate memories."""
236 | # Create memory
237 | content = "Duplicate test memory"
238 | memory = Memory(
239 | content=content,
240 | content_hash=generate_content_hash(content),
241 | tags=["duplicate"]
242 | )
243 |
244 | # Store the memory
245 | await storage.store(memory)
246 |
247 | # Manually insert a duplicate (bypassing duplicate check)
248 | embedding = storage._generate_embedding(content)
249 | storage.conn.execute('''
250 | INSERT INTO memories (
251 | content_embedding, content_hash, content, tags, memory_type,
252 | metadata, created_at, updated_at, created_at_iso, updated_at_iso
253 | ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
254 | ''', (
255 | sqlite_vec.serialize_float32(embedding),
256 | memory.content_hash,
257 | content,
258 | "duplicate",
259 | None,
260 | "{}",
261 | time.time(),
262 | time.time(),
263 | "2024-01-01T00:00:00Z",
264 | "2024-01-01T00:00:00Z"
265 | ))
266 | storage.conn.commit()
267 |
268 | # Clean up duplicates
269 | count, message = await storage.cleanup_duplicates()
270 | assert count == 1
271 | assert "removed 1 duplicate" in message.lower()
272 |
273 | # Verify only one copy remains
274 | cursor = storage.conn.execute(
275 | 'SELECT COUNT(*) FROM memories WHERE content_hash = ?',
276 | (memory.content_hash,)
277 | )
278 | assert cursor.fetchone()[0] == 1
279 |
280 | @pytest.mark.asyncio
281 | async def test_cleanup_no_duplicates(self, storage, sample_memory):
282 | """Test cleanup when no duplicates exist."""
283 | await storage.store(sample_memory)
284 |
285 | count, message = await storage.cleanup_duplicates()
286 | assert count == 0
287 | assert "no duplicate memories found" in message.lower()
288 |
289 | @pytest.mark.asyncio
290 | async def test_update_memory_metadata(self, storage, sample_memory):
291 | """Test updating memory metadata."""
292 | # Store the memory
293 | await storage.store(sample_memory)
294 |
295 | # Update metadata
296 | updates = {
297 | "tags": ["updated", "test"],
298 | "memory_type": "reminder",
299 | "metadata": {"priority": "high", "due_date": "2024-01-15"},
300 | "status": "active"
301 | }
302 |
303 | success, message = await storage.update_memory_metadata(
304 | content_hash=sample_memory.content_hash,
305 | updates=updates
306 | )
307 |
308 | assert success
309 | assert "updated fields" in message.lower()
310 |
311 | # Verify updates
312 | cursor = storage.conn.execute('''
313 | SELECT tags, memory_type, metadata
314 | FROM memories WHERE content_hash = ?
315 | ''', (sample_memory.content_hash,))
316 |
317 | row = cursor.fetchone()
318 | assert row is not None
319 |
320 | tags_str, memory_type, metadata_str = row
321 | metadata = json.loads(metadata_str)
322 |
323 | assert tags_str == "updated,test"
324 | assert memory_type == "reminder"
325 | assert metadata["priority"] == "high"
326 | assert metadata["due_date"] == "2024-01-15"
327 | assert metadata["status"] == "active"
328 |
329 | @pytest.mark.asyncio
330 | async def test_update_nonexistent_memory(self, storage):
331 | """Test updating metadata for non-existent memory."""
332 | nonexistent_hash = "nonexistent123456789"
333 | success, message = await storage.update_memory_metadata(
334 | content_hash=nonexistent_hash,
335 | updates={"tags": ["test"]}
336 | )
337 |
338 | assert not success
339 | assert "not found" in message.lower()
340 |
341 | @pytest.mark.asyncio
342 | async def test_update_memory_with_invalid_tags(self, storage, sample_memory):
343 | """Test updating memory with invalid tags format."""
344 | await storage.store(sample_memory)
345 |
346 | success, message = await storage.update_memory_metadata(
347 | content_hash=sample_memory.content_hash,
348 | updates={"tags": "not_a_list"}
349 | )
350 |
351 | assert not success
352 | assert "list of strings" in message.lower()
353 |
354 | @pytest.mark.asyncio
355 | async def test_update_memory_with_invalid_metadata(self, storage, sample_memory):
356 | """Test updating memory with invalid metadata format."""
357 | await storage.store(sample_memory)
358 |
359 | success, message = await storage.update_memory_metadata(
360 | content_hash=sample_memory.content_hash,
361 | updates={"metadata": "not_a_dict"}
362 | )
363 |
364 | assert not success
365 | assert "dictionary" in message.lower()
366 |
367 | @pytest.mark.asyncio
368 | async def test_update_memory_preserve_timestamps(self, storage, sample_memory):
369 | """Test updating memory while preserving timestamps."""
370 | await storage.store(sample_memory)
371 |
372 | # Get original timestamps
373 | cursor = storage.conn.execute('''
374 | SELECT created_at, created_at_iso FROM memories WHERE content_hash = ?
375 | ''', (sample_memory.content_hash,))
376 | original_created_at, original_created_at_iso = cursor.fetchone()
377 |
378 | # Wait a moment
379 | await asyncio.sleep(0.1)
380 |
381 | # Update with preserve_timestamps=True
382 | success, message = await storage.update_memory_metadata(
383 | content_hash=sample_memory.content_hash,
384 | updates={"tags": ["updated"]},
385 | preserve_timestamps=True
386 | )
387 |
388 | assert success
389 |
390 | # Check timestamps
391 | cursor = storage.conn.execute('''
392 | SELECT created_at, created_at_iso, updated_at FROM memories WHERE content_hash = ?
393 | ''', (sample_memory.content_hash,))
394 | created_at, created_at_iso, updated_at = cursor.fetchone()
395 |
396 | # created_at should be preserved
397 | assert abs(created_at - original_created_at) < 0.01
398 | assert created_at_iso == original_created_at_iso
399 |
400 | # updated_at should be newer
401 | assert updated_at > original_created_at
402 |
403 | @pytest.mark.asyncio
404 | async def test_update_memory_reset_timestamps(self, storage, sample_memory):
405 | """Test updating memory with timestamp reset."""
406 | await storage.store(sample_memory)
407 |
408 | # Get original timestamps
409 | cursor = storage.conn.execute('''
410 | SELECT created_at FROM memories WHERE content_hash = ?
411 | ''', (sample_memory.content_hash,))
412 | original_created_at = cursor.fetchone()[0]
413 |
414 | # Wait a moment
415 | await asyncio.sleep(0.1)
416 |
417 | # Update with preserve_timestamps=False
418 | success, message = await storage.update_memory_metadata(
419 | content_hash=sample_memory.content_hash,
420 | updates={"tags": ["updated"]},
421 | preserve_timestamps=False
422 | )
423 |
424 | assert success
425 |
426 | # Check timestamps
427 | cursor = storage.conn.execute('''
428 | SELECT created_at FROM memories WHERE content_hash = ?
429 | ''', (sample_memory.content_hash,))
430 | created_at = cursor.fetchone()[0]
431 |
432 | # created_at should be updated (newer)
433 | assert created_at > original_created_at
434 |
435 | def test_get_stats(self, storage):
436 | """Test getting storage statistics."""
437 | stats = storage.get_stats()
438 |
439 | assert isinstance(stats, dict)
440 | assert stats["backend"] == "sqlite-vec"
441 | assert "total_memories" in stats
442 | assert "database_size_bytes" in stats
443 | assert "embedding_model" in stats
444 | assert "embedding_dimension" in stats
445 |
446 | @pytest.mark.asyncio
447 | async def test_get_stats_with_data(self, storage, sample_memory):
448 | """Test getting statistics with data."""
449 | await storage.store(sample_memory)
450 |
451 | stats = storage.get_stats()
452 |
453 | assert stats["total_memories"] >= 1
454 | assert stats["database_size_bytes"] > 0
455 | assert stats["embedding_dimension"] == storage.embedding_dimension
456 |
457 | def test_close_connection(self, storage):
458 | """Test closing the database connection."""
459 | assert storage.conn is not None
460 |
461 | storage.close()
462 |
463 | assert storage.conn is None
464 |
465 | @pytest.mark.asyncio
466 | async def test_multiple_memories_retrieval(self, storage):
467 | """Test retrieving multiple memories with ranking."""
468 | # Store multiple memories
469 | memories = []
470 | for i in range(5):
471 | content = f"Test memory {i} with different content and keywords"
472 | memory = Memory(
473 | content=content,
474 | content_hash=generate_content_hash(content),
475 | tags=[f"tag{i}"],
476 | memory_type="note"
477 | )
478 | memories.append(memory)
479 | await storage.store(memory)
480 |
481 | # Retrieve memories
482 | results = await storage.retrieve("test memory content", n_results=3)
483 |
484 | assert len(results) <= 3
485 | assert len(results) > 0
486 |
487 | # Check that results are properly ranked (higher relevance first)
488 | for i in range(len(results) - 1):
489 | assert results[i].relevance_score >= results[i + 1].relevance_score
490 |
491 | @pytest.mark.asyncio
492 | async def test_embedding_generation(self, storage):
493 | """Test embedding generation functionality."""
494 | test_text = "This is a test for embedding generation"
495 |
496 | embedding = storage._generate_embedding(test_text)
497 |
498 | assert isinstance(embedding, list)
499 | assert len(embedding) == storage.embedding_dimension
500 | assert all(isinstance(x, float) for x in embedding)
501 |
502 | @pytest.mark.asyncio
503 | async def test_memory_with_complex_metadata(self, storage):
504 | """Test storing and retrieving memory with complex metadata."""
505 | content = "Memory with complex metadata"
506 | memory = Memory(
507 | content=content,
508 | content_hash=generate_content_hash(content),
509 | tags=["complex", "metadata", "test"],
510 | memory_type="structured",
511 | metadata={
512 | "nested": {"level1": {"level2": "value"}},
513 | "array": [1, 2, 3, "four"],
514 | "boolean": True,
515 | "null_value": None,
516 | "unicode": "测试中文 🚀"
517 | }
518 | )
519 |
520 | # Store the memory
521 | success, message = await storage.store(memory)
522 | assert success
523 |
524 | # Retrieve and verify
525 | results = await storage.retrieve("complex metadata", n_results=1)
526 | assert len(results) == 1
527 |
528 | retrieved_memory = results[0].memory
529 | assert retrieved_memory.metadata["nested"]["level1"]["level2"] == "value"
530 | assert retrieved_memory.metadata["array"] == [1, 2, 3, "four"]
531 | assert retrieved_memory.metadata["boolean"] is True
532 | assert retrieved_memory.metadata["unicode"] == "测试中文 🚀"
533 |
534 | @pytest.mark.asyncio
535 | async def test_concurrent_operations(self, storage):
536 | """Test concurrent storage operations."""
537 | # Create multiple memories
538 | memories = []
539 | for i in range(10):
540 | content = f"Concurrent test memory {i}"
541 | memory = Memory(
542 | content=content,
543 | content_hash=generate_content_hash(content),
544 | tags=[f"concurrent{i}"]
545 | )
546 | memories.append(memory)
547 |
548 | # Store memories concurrently
549 | tasks = [storage.store(memory) for memory in memories]
550 | results = await asyncio.gather(*tasks)
551 |
552 | # All should succeed
553 | assert all(success for success, _ in results)
554 |
555 | # Verify all were stored
556 | for memory in memories:
557 | cursor = storage.conn.execute(
558 | 'SELECT content_hash FROM memories WHERE content_hash = ?',
559 | (memory.content_hash,)
560 | )
561 | assert cursor.fetchone() is not None
562 |
563 | @pytest.mark.asyncio
564 | async def test_get_memories_by_time_range_basic(self, storage):
565 | """Test basic time range filtering."""
566 | # Store memories at different times
567 | now = time.time()
568 |
569 | # Memory from 1 hour ago
570 | memory1 = Memory(
571 | content="Memory from 1 hour ago",
572 | content_hash=generate_content_hash("Memory from 1 hour ago"),
573 | tags=["timerange"],
574 | created_at=now - 3600
575 | )
576 |
577 | # Memory from 30 minutes ago
578 | memory2 = Memory(
579 | content="Memory from 30 minutes ago",
580 | content_hash=generate_content_hash("Memory from 30 minutes ago"),
581 | tags=["timerange"],
582 | created_at=now - 1800
583 | )
584 |
585 | # Memory from now
586 | memory3 = Memory(
587 | content="Memory from now",
588 | content_hash=generate_content_hash("Memory from now"),
589 | tags=["timerange"],
590 | created_at=now
591 | )
592 |
593 | await storage.store(memory1)
594 | await storage.store(memory2)
595 | await storage.store(memory3)
596 |
597 | # Get memories from last 45 minutes (should get memory2 and memory3)
598 | results = await storage.get_memories_by_time_range(now - 2700, now + 100)
599 | assert len(results) == 2
600 | contents = [m.content for m in results]
601 | assert "Memory from 30 minutes ago" in contents
602 | assert "Memory from now" in contents
603 | assert "Memory from 1 hour ago" not in contents
604 |
605 | @pytest.mark.asyncio
606 | async def test_get_memories_by_time_range_empty(self, storage):
607 | """Test time range with no matching memories."""
608 | # Store one memory now
609 | memory = Memory(
610 | content="Current memory",
611 | content_hash=generate_content_hash("Current memory"),
612 | tags=["test"]
613 | )
614 | await storage.store(memory)
615 |
616 | # Query for memories from far in the past
617 | now = time.time()
618 | results = await storage.get_memories_by_time_range(now - 86400, now - 7200)
619 | assert len(results) == 0
620 |
621 | @pytest.mark.asyncio
622 | async def test_get_memories_by_time_range_boundaries(self, storage):
623 | """Test inclusive boundaries of time range."""
624 | now = time.time()
625 |
626 | # Memory exactly at start boundary
627 | memory_start = Memory(
628 | content="At start boundary",
629 | content_hash=generate_content_hash("At start boundary"),
630 | tags=["boundary"],
631 | created_at=now - 1000
632 | )
633 |
634 | # Memory exactly at end boundary
635 | memory_end = Memory(
636 | content="At end boundary",
637 | content_hash=generate_content_hash("At end boundary"),
638 | tags=["boundary"],
639 | created_at=now
640 | )
641 |
642 | # Memory just before start
643 | memory_before = Memory(
644 | content="Before start",
645 | content_hash=generate_content_hash("Before start"),
646 | tags=["boundary"],
647 | created_at=now - 1001
648 | )
649 |
650 | # Memory just after end
651 | memory_after = Memory(
652 | content="After end",
653 | content_hash=generate_content_hash("After end"),
654 | tags=["boundary"],
655 | created_at=now + 1
656 | )
657 |
658 | await storage.store(memory_start)
659 | await storage.store(memory_end)
660 | await storage.store(memory_before)
661 | await storage.store(memory_after)
662 |
663 | # Query with inclusive boundaries
664 | results = await storage.get_memories_by_time_range(now - 1000, now)
665 | assert len(results) == 2
666 | contents = [m.content for m in results]
667 | assert "At start boundary" in contents
668 | assert "At end boundary" in contents
669 | assert "Before start" not in contents
670 | assert "After end" not in contents
671 |
672 | @pytest.mark.asyncio
673 | async def test_get_memories_by_time_range_ordering(self, storage):
674 | """Test that results are ordered by created_at DESC."""
675 | now = time.time()
676 |
677 | # Store three memories in random order
678 | memory1 = Memory(
679 | content="First",
680 | content_hash=generate_content_hash("First"),
681 | tags=["order"],
682 | created_at=now - 300
683 | )
684 | memory2 = Memory(
685 | content="Second",
686 | content_hash=generate_content_hash("Second"),
687 | tags=["order"],
688 | created_at=now - 200
689 | )
690 | memory3 = Memory(
691 | content="Third",
692 | content_hash=generate_content_hash("Third"),
693 | tags=["order"],
694 | created_at=now - 100
695 | )
696 |
697 | await storage.store(memory3) # Store in non-chronological order
698 | await storage.store(memory1)
699 | await storage.store(memory2)
700 |
701 | # Get all three
702 | results = await storage.get_memories_by_time_range(now - 400, now)
703 | assert len(results) == 3
704 |
705 | # Should be ordered newest first (DESC)
706 | assert results[0].content == "Third"
707 | assert results[1].content == "Second"
708 | assert results[2].content == "First"
709 |
710 |
711 | class TestSqliteVecStorageWithoutEmbeddings:
712 | """Test SQLite-vec storage when sentence transformers is not available."""
713 |
714 | @pytest.mark.asyncio
715 | async def test_initialization_without_embeddings(self):
716 | """Test that storage can initialize without sentence transformers."""
717 | temp_dir = tempfile.mkdtemp()
718 | db_path = os.path.join(temp_dir, "test_no_embeddings.db")
719 |
720 | try:
721 | with patch('src.mcp_memory_service.storage.sqlite_vec.SENTENCE_TRANSFORMERS_AVAILABLE', False):
722 | storage = SqliteVecMemoryStorage(db_path)
723 | await storage.initialize()
724 |
725 | assert storage.conn is not None
726 | assert storage.embedding_model is None
727 |
728 | storage.close()
729 |
730 | finally:
731 | shutil.rmtree(temp_dir, ignore_errors=True)
732 |
733 | @pytest.mark.asyncio
734 | async def test_operations_without_embeddings(self):
735 | """Test basic operations without embeddings."""
736 | temp_dir = tempfile.mkdtemp()
737 | db_path = os.path.join(temp_dir, "test_no_embeddings.db")
738 |
739 | try:
740 | with patch('src.mcp_memory_service.storage.sqlite_vec.SENTENCE_TRANSFORMERS_AVAILABLE', False):
741 | storage = SqliteVecMemoryStorage(db_path)
742 | await storage.initialize()
743 |
744 | # Store should work (with zero embeddings)
745 | content = "Test without embeddings"
746 | memory = Memory(
747 | content=content,
748 | content_hash=generate_content_hash(content),
749 | tags=["no-embeddings"]
750 | )
751 |
752 | success, message = await storage.store(memory)
753 | assert success
754 |
755 | # Tag search should work
756 | results = await storage.search_by_tag(["no-embeddings"])
757 | assert len(results) == 1
758 |
759 | # Semantic search won't work well but shouldn't crash
760 | results = await storage.retrieve("test", n_results=1)
761 | # May or may not return results, but shouldn't crash
762 |
763 | storage.close()
764 |
765 | finally:
766 | shutil.rmtree(temp_dir, ignore_errors=True)
767 |
768 |
769 | if __name__ == "__main__":
770 | # Run basic tests when executed directly
771 | async def run_basic_tests():
772 | """Run basic tests to verify functionality."""
773 | if not SQLITE_VEC_AVAILABLE:
774 | print("⚠️ sqlite-vec not available, skipping tests")
775 | return
776 |
777 | print("Running basic SQLite-vec storage tests...")
778 |
779 | temp_dir = tempfile.mkdtemp()
780 | db_path = os.path.join(temp_dir, "test_basic.db")
781 |
782 | try:
783 | # Test basic functionality
784 | storage = SqliteVecMemoryStorage(db_path)
785 | await storage.initialize()
786 |
787 | # Store a memory
788 | content = "Test memory for basic validation"
789 | memory = Memory(
790 | content=content,
791 | content_hash=generate_content_hash(content),
792 | tags=["test", "basic"]
793 | )
794 |
795 | success, message = await storage.store(memory)
796 | print(f"Store: {success}, {message}")
797 |
798 | # Retrieve the memory
799 | results = await storage.retrieve("test memory", n_results=1)
800 | print(f"Retrieve: Found {len(results)} results")
801 |
802 | if results:
803 | print(f"Content: {results[0].memory.content}")
804 | print(f"Relevance: {results[0].relevance_score}")
805 |
806 | # Search by tag
807 | tag_results = await storage.search_by_tag(["test"])
808 | print(f"Tag search: Found {len(tag_results)} results")
809 |
810 | # Get stats
811 | stats = storage.get_stats()
812 | print(f"Stats: {stats['total_memories']} memories, {stats['database_size_mb']} MB")
813 |
814 | storage.close()
815 | print("✅ Basic tests passed!")
816 |
817 | except Exception as e:
818 | print(f"❌ Basic tests failed: {e}")
819 |
820 | finally:
821 | shutil.rmtree(temp_dir, ignore_errors=True)
822 |
823 | # Run the basic tests
824 | asyncio.run(run_basic_tests())
```
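The suite above relies on the `@pytest.mark.asyncio` marker, so it is normally driven through pytest with an asyncio plugin rather than the ad-hoc `run_basic_tests()` harness at the bottom of the file. A minimal runner sketch, assuming the file is saved as `tests/test_sqlite_vec_storage.py` and `pytest-asyncio` is installed (both are assumptions, not confirmed by this listing):

```python
# Hypothetical helper for running the sqlite-vec storage tests.
# The test path and the presence of pytest-asyncio are assumptions.
import sys

import pytest

if __name__ == "__main__":
    # -v prints one line per test; pytest-asyncio executes the
    # @pytest.mark.asyncio coroutines on an event loop.
    sys.exit(pytest.main(["-v", "tests/test_sqlite_vec_storage.py"]))
```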
--------------------------------------------------------------------------------
/src/mcp_memory_service/utils/time_parser.py:
--------------------------------------------------------------------------------
```python
1 | # Copyright 2024 Heinrich Krupp
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Natural language time expression parser for MCP Memory Service.
17 |
18 | This module provides utilities to parse and understand various time expressions
19 | for retrieving memories based on when they were stored.
20 | """
21 | import re
22 | import logging
23 | from datetime import datetime, timedelta, date, time
24 | from typing import Tuple, Optional, Dict, Any, List
25 |
26 | logger = logging.getLogger(__name__)
27 |
28 | # Named time periods and their approximate date ranges
29 | NAMED_PERIODS = {
30 | # Holidays (US/Western-centric, would need localization for global use)
31 | "christmas": {"month": 12, "day": 25, "window": 3},
32 | "new year": {"month": 1, "day": 1, "window": 3},
33 | "valentine": {"month": 2, "day": 14, "window": 1},
34 | "halloween": {"month": 10, "day": 31, "window": 3},
35 | "thanksgiving": {"month": 11, "day": -1, "window": 3}, # -1 means fourth Thursday
36 |
37 | # Seasons (Northern Hemisphere)
38 | "spring": {"start_month": 3, "start_day": 20, "end_month": 6, "end_day": 20},
39 | "summer": {"start_month": 6, "start_day": 21, "end_month": 9, "end_day": 22},
40 | "fall": {"start_month": 9, "start_day": 23, "end_month": 12, "end_day": 20},
41 | "autumn": {"start_month": 9, "start_day": 23, "end_month": 12, "end_day": 20},
42 | "winter": {"start_month": 12, "start_day": 21, "end_month": 3, "end_day": 19},
43 | }
44 |
45 | # Time of day mappings (24-hour format)
46 | TIME_OF_DAY = {
47 | "morning": (5, 11), # 5:00 AM - 11:59 AM
48 | "noon": (12, 12), # 12:00 PM
49 | "afternoon": (13, 17), # 1:00 PM - 5:59 PM
50 | "evening": (18, 21), # 6:00 PM - 9:59 PM
51 | "night": (22, 4), # 10:00 PM - 4:59 AM (wraps around midnight)
52 | "midnight": (0, 0), # 12:00 AM
53 | }
54 |
55 | # Regular expressions for various time patterns
56 | PATTERNS = {
57 | "relative_days": re.compile(r'(?:(\d+)\s+days?\s+ago)|(?:yesterday)|(?:today)'),
58 | "relative_weeks": re.compile(r'(\d+)\s+weeks?\s+ago'),
59 | "relative_months": re.compile(r'(\d+)\s+months?\s+ago'),
60 | "relative_years": re.compile(r'(\d+)\s+years?\s+ago'),
61 | "last_period": re.compile(r'last\s+(day|week|month|year|summer|spring|winter|fall|autumn)'),
62 | "this_period": re.compile(r'this\s+(day|week|month|year|summer|spring|winter|fall|autumn)'),
63 | "month_name": re.compile(r'(january|february|march|april|may|june|july|august|september|october|november|december)'),
64 | "date_range": re.compile(r'between\s+(.+?)\s+and\s+(.+?)(?:\s|$)'),
65 | "time_of_day": re.compile(r'(morning|afternoon|evening|night|noon|midnight)'),
66 | "recent": re.compile(r'recent|lately|recently'),
67 | "specific_date": re.compile(r'(\d{1,2})[/-](\d{1,2})(?:[/-](\d{2,4}))?'),
68 | "full_date": re.compile(r'(\d{4})-(\d{1,2})-(\d{1,2})'),
69 | "named_period": re.compile(r'(spring|summer|winter|fall|autumn|christmas|new\s*year|valentine|halloween|thanksgiving|spring\s*break|summer\s*break|winter\s*break)'), "half_year": re.compile(r'(first|second)\s+half\s+of\s+(\d{4})'),
70 | "quarter": re.compile(r'(first|second|third|fourth|1st|2nd|3rd|4th)\s+quarter(?:\s+of\s+(\d{4}))?'),
71 | }
72 |
73 | def _calculate_season_date_range(
74 | period: str,
75 | season_info: Dict[str, int],
76 | base_year: int,
77 | current_month: Optional[int] = None
78 | ) -> Tuple[datetime, datetime]:
79 | """
80 | Calculate start and end dates for a season, handling winter's year boundary.
81 |
82 | Args:
83 | period: Season name ("winter", "spring", "summer", "fall"/"autumn")
84 | season_info: Dictionary with start_month, start_day, end_month, end_day
85 | base_year: The year to use as reference for calculation
86 | current_month: Current month (1-12) for context-aware winter calculation (optional)
87 |
88 | Returns:
89 | Tuple of (start_datetime, end_datetime) for the season
90 | """
91 | if period == "winter":
92 | # Winter spans year boundary (Dec -> Mar)
93 | # Determine start year based on current month context
94 | if current_month is not None and current_month <= 3:
95 | # We're in Jan-Mar, so winter started the previous year
96 | start_year = base_year - 1
97 | end_year = base_year
98 | else:
99 | # We're in any other month, winter starts this year
100 | start_year = base_year
101 | end_year = base_year + 1
102 |
103 | start_dt = datetime(start_year, season_info["start_month"], season_info["start_day"])
104 | end_dt = datetime(end_year, season_info["end_month"], season_info["end_day"], 23, 59, 59)
105 | else:
106 | # All other seasons fall within a single calendar year
107 | start_dt = datetime(base_year, season_info["start_month"], season_info["start_day"])
108 | end_dt = datetime(base_year, season_info["end_month"], season_info["end_day"], 23, 59, 59)
109 |
110 | return start_dt, end_dt
111 |
112 | def parse_time_expression(query: str) -> Tuple[Optional[float], Optional[float]]:
113 | """
114 | Parse a natural language time expression and return timestamp range.
115 |
116 | Args:
117 | query: A natural language query with time expressions
118 |
119 | Returns:
120 | Tuple of (start_timestamp, end_timestamp), either may be None
121 | """
122 | query = query.lower().strip()
123 |
124 | # Check for multiple patterns in a single query
125 | try:
126 | # First check for date ranges like "between X and Y"
127 | date_range_match = PATTERNS["date_range"].search(query)
128 | if date_range_match:
129 | start_expr = date_range_match.group(1)
130 | end_expr = date_range_match.group(2)
131 | start_ts, _ = parse_time_expression(start_expr)
132 | _, end_ts = parse_time_expression(end_expr)
133 | return start_ts, end_ts
134 |
135 | # Check for full ISO dates (YYYY-MM-DD) FIRST
136 | full_date_match = PATTERNS["full_date"].search(query)
137 | if full_date_match:
138 | year, month, day = full_date_match.groups()
139 | try:
140 | specific_date = date(int(year), int(month), int(day))
141 | start_dt = datetime.combine(specific_date, time.min)
142 | end_dt = datetime.combine(specific_date, time.max)
143 | return start_dt.timestamp(), end_dt.timestamp()
144 | except ValueError as e:
145 | logger.warning(f"Invalid date: {e}")
146 | return None, None
147 |
148 | # Check for specific dates (MM/DD/YYYY)
149 | specific_date_match = PATTERNS["specific_date"].search(query)
150 | if specific_date_match:
151 | month, day, year = specific_date_match.groups()
152 | month = int(month)
153 | day = int(day)
154 | current_year = datetime.now().year
155 | year = int(year) if year else current_year
156 | # Handle 2-digit years
157 | if year and year < 100:
158 | year = 2000 + year if year < 50 else 1900 + year
159 |
160 | try:
161 | specific_date = date(year, month, day)
162 | start_dt = datetime.combine(specific_date, time.min)
163 | end_dt = datetime.combine(specific_date, time.max)
164 | return start_dt.timestamp(), end_dt.timestamp()
165 | except ValueError as e:
166 | logger.warning(f"Invalid date: {e}")
167 | return None, None
168 |
169 | # Relative days: "X days ago", "yesterday", "today"
170 | days_ago_match = PATTERNS["relative_days"].search(query)
171 | if days_ago_match:
172 | if "yesterday" in query:
173 | days = 1
174 | elif "today" in query:
175 | days = 0
176 | else:
177 | days = int(days_ago_match.group(1))
178 |
179 | target_date = date.today() - timedelta(days=days)
180 |
181 | # Check for time of day modifiers
182 | time_of_day_match = PATTERNS["time_of_day"].search(query)
183 | if time_of_day_match:
184 | # Narrow the range based on time of day
185 | return get_time_of_day_range(target_date, time_of_day_match.group(1))
186 | else:
187 | # Return the full day
188 | start_dt = datetime.combine(target_date, time.min)
189 | end_dt = datetime.combine(target_date, time.max)
190 | return start_dt.timestamp(), end_dt.timestamp()
191 |
192 | # Relative weeks: "X weeks ago"
193 | weeks_ago_match = PATTERNS["relative_weeks"].search(query)
194 | if weeks_ago_match:
195 | weeks = int(weeks_ago_match.group(1))
196 | target_date = date.today() - timedelta(weeks=weeks)
197 | # Get the start of the week (Monday)
198 | start_date = target_date - timedelta(days=target_date.weekday())
199 | end_date = start_date + timedelta(days=6)
200 | start_dt = datetime.combine(start_date, time.min)
201 | end_dt = datetime.combine(end_date, time.max)
202 | return start_dt.timestamp(), end_dt.timestamp()
203 |
204 | # Relative months: "X months ago"
205 | months_ago_match = PATTERNS["relative_months"].search(query)
206 | if months_ago_match:
207 | months = int(months_ago_match.group(1))
208 | current = datetime.now()
209 | # Calculate target month
210 | year = current.year
211 | month = current.month - months
212 |
213 | # Adjust year if month goes negative
214 | while month <= 0:
215 | year -= 1
216 | month += 12
217 |
218 | # Get first and last day of the month
219 | first_day = date(year, month, 1)
220 | if month == 12:
221 | last_day = date(year + 1, 1, 1) - timedelta(days=1)
222 | else:
223 | last_day = date(year, month + 1, 1) - timedelta(days=1)
224 |
225 | start_dt = datetime.combine(first_day, time.min)
226 | end_dt = datetime.combine(last_day, time.max)
227 | return start_dt.timestamp(), end_dt.timestamp()
228 |
229 | # Relative years: "X years ago"
230 | years_ago_match = PATTERNS["relative_years"].search(query)
231 | if years_ago_match:
232 | years = int(years_ago_match.group(1))
233 | current_year = datetime.now().year
234 | target_year = current_year - years
235 | start_dt = datetime(target_year, 1, 1, 0, 0, 0)
236 | end_dt = datetime(target_year, 12, 31, 23, 59, 59)
237 | return start_dt.timestamp(), end_dt.timestamp()
238 |
239 | # "Last X" expressions
240 | last_period_match = PATTERNS["last_period"].search(query)
241 | if last_period_match:
242 | period = last_period_match.group(1)
243 | return get_last_period_range(period)
244 |
245 | # "This X" expressions
246 | this_period_match = PATTERNS["this_period"].search(query)
247 | if this_period_match:
248 | period = this_period_match.group(1)
249 | return get_this_period_range(period)
250 |
251 | # Month names
252 | month_match = PATTERNS["month_name"].search(query)
253 | if month_match:
254 | month_name = month_match.group(1)
255 | return get_month_range(month_name)
256 |
257 | # Named periods (holidays, etc.)
258 | named_period_match = PATTERNS["named_period"].search(query)
259 | if named_period_match:
260 |             period_name = named_period_match.group(1)  # Use the matched text as-is; get_named_period_range() normalizes it
261 | return get_named_period_range(period_name)
262 |
263 | # Half year expressions
264 | half_year_match = PATTERNS["half_year"].search(query)
265 | if half_year_match:
266 | half = half_year_match.group(1)
267 | year_str = half_year_match.group(2)
268 | year = int(year_str) if year_str else datetime.now().year
269 |
270 | if half.lower() == "first":
271 | start_dt = datetime(year, 1, 1, 0, 0, 0)
272 | end_dt = datetime(year, 6, 30, 23, 59, 59)
273 | else: # "second"
274 | start_dt = datetime(year, 7, 1, 0, 0, 0)
275 | end_dt = datetime(year, 12, 31, 23, 59, 59)
276 |
277 | return start_dt.timestamp(), end_dt.timestamp()
278 |
279 | # Quarter expressions
280 | quarter_match = PATTERNS["quarter"].search(query)
281 | if quarter_match:
282 | quarter = quarter_match.group(1).lower()
283 | year_str = quarter_match.group(2)
284 | year = int(year_str) if year_str else datetime.now().year
285 |
286 | # Map textual quarter to number
287 | quarter_num = {"first": 1, "1st": 1, "second": 2, "2nd": 2,
288 | "third": 3, "3rd": 3, "fourth": 4, "4th": 4}[quarter]
289 |
290 | # Calculate quarter start and end dates
291 | quarter_month = (quarter_num - 1) * 3 + 1
292 | start_dt = datetime(year, quarter_month, 1, 0, 0, 0)
293 |
294 | if quarter_month + 3 > 12:
295 | end_dt = datetime(year + 1, 1, 1, 0, 0, 0) - timedelta(seconds=1)
296 | else:
297 | end_dt = datetime(year, quarter_month + 3, 1, 0, 0, 0) - timedelta(seconds=1)
298 |
299 | return start_dt.timestamp(), end_dt.timestamp()
300 |
301 | # Recent/fuzzy time expressions
302 | recent_match = PATTERNS["recent"].search(query)
303 | if recent_match:
304 | # Default to last 7 days for "recent"
305 | end_dt = datetime.now()
306 | start_dt = end_dt - timedelta(days=7)
307 | return start_dt.timestamp(), end_dt.timestamp()
308 |
309 | # If no time expression is found, return None for both timestamps
310 | return None, None
311 |
312 | except Exception as e:
313 | logger.error(f"Error parsing time expression: {e}")
314 | return None, None
315 |
316 | def get_time_of_day_range(target_date: date, time_period: str) -> Tuple[float, float]:
317 | """Get timestamp range for a specific time of day on a given date."""
318 | if time_period in TIME_OF_DAY:
319 | start_hour, end_hour = TIME_OF_DAY[time_period]
320 |
321 | # Handle periods that wrap around midnight
322 | if start_hour > end_hour: # e.g., "night" = (22, 4)
323 | # For periods that span midnight, we need to handle specially
324 | if time_period == "night":
325 | start_dt = datetime.combine(target_date, time(start_hour, 0))
326 | end_dt = datetime.combine(target_date + timedelta(days=1), time(end_hour, 59, 59))
327 | else:
328 | # Default handling for other wrapping periods
329 | start_dt = datetime.combine(target_date, time(start_hour, 0))
330 | end_dt = datetime.combine(target_date + timedelta(days=1), time(end_hour, 59, 59))
331 | else:
332 | # Normal periods within a single day
333 | start_dt = datetime.combine(target_date, time(start_hour, 0))
334 | if end_hour == start_hour: # For noon, midnight (specific hour)
335 | end_dt = datetime.combine(target_date, time(end_hour, 59, 59))
336 | else:
337 | end_dt = datetime.combine(target_date, time(end_hour, 59, 59))
338 |
339 | return start_dt.timestamp(), end_dt.timestamp()
340 | else:
341 | # Fallback to full day
342 | start_dt = datetime.combine(target_date, time.min)
343 | end_dt = datetime.combine(target_date, time.max)
344 | return start_dt.timestamp(), end_dt.timestamp()
345 |
346 | def get_last_period_range(period: str) -> Tuple[float, float]:
347 | """Get timestamp range for 'last X' expressions."""
348 | now = datetime.now()
349 | today = date.today()
350 |
351 | if period == "day":
352 | # Last day = yesterday
353 | yesterday = today - timedelta(days=1)
354 | start_dt = datetime.combine(yesterday, time.min)
355 | end_dt = datetime.combine(yesterday, time.max)
356 | elif period == "week":
357 | # Last week = previous calendar week (Mon-Sun)
358 | # Find last Monday
359 | last_monday = today - timedelta(days=today.weekday() + 7)
360 | # Find last Sunday
361 | last_sunday = last_monday + timedelta(days=6)
362 | start_dt = datetime.combine(last_monday, time.min)
363 | end_dt = datetime.combine(last_sunday, time.max)
364 | elif period == "month":
365 | # Last month = previous calendar month
366 | first_of_this_month = date(today.year, today.month, 1)
367 | if today.month == 1:
368 | last_month = 12
369 | last_month_year = today.year - 1
370 | else:
371 | last_month = today.month - 1
372 | last_month_year = today.year
373 |
374 | first_of_last_month = date(last_month_year, last_month, 1)
375 | last_of_last_month = first_of_this_month - timedelta(days=1)
376 |
377 | start_dt = datetime.combine(first_of_last_month, time.min)
378 | end_dt = datetime.combine(last_of_last_month, time.max)
379 | elif period == "year":
380 | # Last year = previous calendar year
381 | last_year = today.year - 1
382 | start_dt = datetime(last_year, 1, 1, 0, 0, 0)
383 | end_dt = datetime(last_year, 12, 31, 23, 59, 59)
384 | elif period in ["summer", "spring", "winter", "fall", "autumn"]:
385 | # Last season
386 | season_info = NAMED_PERIODS[period]
387 | current_year = today.year
388 |
389 | # Determine if we're currently in this season
390 | current_month = today.month
391 | current_day = today.day
392 | is_current_season = False
393 |
394 | # Check if today falls within the season's date range
395 | if period in ["winter"]: # Winter spans year boundary
396 | if (current_month >= season_info["start_month"] or
397 | (current_month <= season_info["end_month"] and
398 | current_day <= season_info["end_day"])):
399 | is_current_season = True
400 | else:
401 | if (current_month >= season_info["start_month"] and current_month <= season_info["end_month"] and
402 | current_day >= season_info["start_day"] if current_month == season_info["start_month"] else True and
403 | current_day <= season_info["end_day"] if current_month == season_info["end_month"] else True):
404 | is_current_season = True
405 |
406 | # If we're currently in the season, get last year's season
407 | if is_current_season:
408 | year = current_year - 1
409 | else:
410 | year = current_year
411 |
412 | # Calculate season date range (handles winter's year boundary)
413 | context_month = current_month if is_current_season else None
414 | start_dt, end_dt = _calculate_season_date_range(period, season_info, year, context_month)
415 | else:
416 | # Fallback - last 24 hours
417 | end_dt = now
418 | start_dt = end_dt - timedelta(days=1)
419 |
420 | return start_dt.timestamp(), end_dt.timestamp()
421 |
422 | def get_this_period_range(period: str) -> Tuple[float, float]:
423 | """Get timestamp range for 'this X' expressions."""
424 | now = datetime.now()
425 | today = date.today()
426 |
427 | if period == "day":
428 | # This day = today
429 | start_dt = datetime.combine(today, time.min)
430 | end_dt = datetime.combine(today, time.max)
431 | elif period == "week":
432 | # This week = current calendar week (Mon-Sun)
433 | # Find this Monday
434 | monday = today - timedelta(days=today.weekday())
435 | sunday = monday + timedelta(days=6)
436 | start_dt = datetime.combine(monday, time.min)
437 | end_dt = datetime.combine(sunday, time.max)
438 | elif period == "month":
439 | # This month = current calendar month
440 | first_of_month = date(today.year, today.month, 1)
441 | if today.month == 12:
442 | first_of_next_month = date(today.year + 1, 1, 1)
443 | else:
444 | first_of_next_month = date(today.year, today.month + 1, 1)
445 |
446 | last_of_month = first_of_next_month - timedelta(days=1)
447 |
448 | start_dt = datetime.combine(first_of_month, time.min)
449 | end_dt = datetime.combine(last_of_month, time.max)
450 | elif period == "year":
451 | # This year = current calendar year
452 | start_dt = datetime(today.year, 1, 1, 0, 0, 0)
453 | end_dt = datetime(today.year, 12, 31, 23, 59, 59)
454 | elif period in ["summer", "spring", "winter", "fall", "autumn"]:
455 | # This season
456 | season_info = NAMED_PERIODS[period]
457 | current_year = today.year
458 |
459 | # Calculate season date range (handles winter's year boundary)
460 | start_dt, end_dt = _calculate_season_date_range(period, season_info, current_year, today.month)
461 | else:
462 | # Fallback - current 24 hours
463 | end_dt = now
464 | start_dt = datetime.combine(today, time.min)
465 |
466 | return start_dt.timestamp(), end_dt.timestamp()
467 |
468 | def get_month_range(month_name: str) -> Tuple[float, float]:
469 | """Get timestamp range for a named month."""
470 | # Map month name to number
471 | month_map = {
472 | "january": 1, "february": 2, "march": 3, "april": 4,
473 | "may": 5, "june": 6, "july": 7, "august": 8,
474 | "september": 9, "october": 10, "november": 11, "december": 12
475 | }
476 |
477 | if month_name in month_map:
478 | month_num = month_map[month_name]
479 | current_year = datetime.now().year
480 |
481 | # If the month is in the future for this year, use last year
482 | current_month = datetime.now().month
483 | year = current_year if month_num <= current_month else current_year - 1
484 |
485 | # Get first and last day of the month
486 | first_day = date(year, month_num, 1)
487 | if month_num == 12:
488 | last_day = date(year + 1, 1, 1) - timedelta(days=1)
489 | else:
490 | last_day = date(year, month_num + 1, 1) - timedelta(days=1)
491 |
492 | start_dt = datetime.combine(first_day, time.min)
493 | end_dt = datetime.combine(last_day, time.max)
494 | return start_dt.timestamp(), end_dt.timestamp()
495 | else:
496 | return None, None
497 |
498 | def get_named_period_range(period_name: str) -> Tuple[Optional[float], Optional[float]]:
499 | """Get timestamp range for named periods like holidays."""
500 | period_name = period_name.lower().replace("_", " ")
501 | current_year = datetime.now().year
502 | current_month = datetime.now().month
503 | current_day = datetime.now().day
504 |
505 | if period_name in NAMED_PERIODS:
506 | info = NAMED_PERIODS[period_name]
507 | # Found matching period
508 | # Determine if the period is in the past or future for this year
509 | if "month" in info and "day" in info:
510 | # Simple fixed-date holiday
511 | month = info["month"]
512 | day = info["day"]
513 | window = info.get("window", 1) # Default 1-day window
514 |
515 | # Special case for Thanksgiving (fourth Thursday in November)
516 | if day == -1 and month == 11: # Thanksgiving
517 | # Find the fourth Thursday in November
518 | first_day = date(current_year, 11, 1)
519 | # Find first Thursday
520 | first_thursday = first_day + timedelta(days=((3 - first_day.weekday()) % 7))
521 | # Fourth Thursday is 3 weeks later
522 | thanksgiving = first_thursday + timedelta(weeks=3)
523 | day = thanksgiving.day
524 |
525 | # Check if the holiday has passed this year
526 | is_past = (current_month > month or
527 | (current_month == month and current_day > day + window))
528 |
529 | year = current_year if not is_past else current_year - 1
530 | target_date = date(year, month, day)
531 |
532 | # Create date range with window
533 | start_date = target_date - timedelta(days=window)
534 | end_date = target_date + timedelta(days=window)
535 |
536 | start_dt = datetime.combine(start_date, time.min)
537 | end_dt = datetime.combine(end_date, time.max)
538 | return start_dt.timestamp(), end_dt.timestamp()
539 |
540 | elif "start_month" in info and "end_month" in info:
541 | # Season or date range
542 | start_month = info["start_month"]
543 | start_day = info["start_day"]
544 | end_month = info["end_month"]
545 | end_day = info["end_day"]
546 |
547 | # Determine year based on current date
548 | if start_month > end_month: # Period crosses year boundary
549 | if current_month < end_month or (current_month == end_month and current_day <= end_day):
550 | # We're in the end part of the period that started last year
551 | start_dt = datetime(current_year - 1, start_month, start_day)
552 | end_dt = datetime(current_year, end_month, end_day, 23, 59, 59)
553 | else:
554 | # The period is either coming up this year or happened earlier this year
555 | if current_month > start_month or (current_month == start_month and current_day >= start_day):
556 | # Period already started this year
557 | start_dt = datetime(current_year, start_month, start_day)
558 | end_dt = datetime(current_year + 1, end_month, end_day, 23, 59, 59)
559 | else:
560 | # Period from last year
561 | start_dt = datetime(current_year - 1, start_month, start_day)
562 | end_dt = datetime(current_year, end_month, end_day, 23, 59, 59)
563 | else:
564 | # Period within a single year
565 | # Check if period has already occurred this year
566 | if (current_month > end_month or
567 | (current_month == end_month and current_day > end_day)):
568 | # Period already passed this year
569 | start_dt = datetime(current_year, start_month, start_day)
570 | end_dt = datetime(current_year, end_month, end_day, 23, 59, 59)
571 | else:
572 | # Check if current date is within the period
573 | is_within_period = (
574 | (current_month > start_month or
575 | (current_month == start_month and current_day >= start_day))
576 | and
577 | (current_month < end_month or
578 | (current_month == end_month and current_day <= end_day))
579 | )
580 |
581 | if is_within_period:
582 | # We're in the period this year
583 | start_dt = datetime(current_year, start_month, start_day)
584 | end_dt = datetime(current_year, end_month, end_day, 23, 59, 59)
585 | else:
586 | # Period from last year
587 | start_dt = datetime(current_year - 1, start_month, start_day)
588 | end_dt = datetime(current_year - 1, end_month, end_day, 23, 59, 59)
589 |
590 | return start_dt.timestamp(), end_dt.timestamp()
591 |
592 | # If no match found
593 | return None, None
594 |
595 | # Helper function to detect time expressions in a general query
596 | def extract_time_expression(query: str) -> Tuple[str, Tuple[Optional[float], Optional[float]]]:
597 | """
598 | Extract time-related expressions from a query and return the timestamps.
599 |
600 | Args:
601 | query: A natural language query that may contain time expressions
602 |
603 | Returns:
604 | Tuple of (cleaned_query, (start_timestamp, end_timestamp))
605 | The cleaned_query has time expressions removed
606 | """
607 | # Check for time expressions
608 | time_expressions = [
609 | r'\b\d+\s+days?\s+ago\b',
610 | r'\byesterday\b',
611 | r'\btoday\b',
612 | r'\b\d+\s+weeks?\s+ago\b',
613 | r'\b\d+\s+months?\s+ago\b',
614 | r'\b\d+\s+years?\s+ago\b',
615 | r'\blast\s+(day|week|month|year|summer|spring|winter|fall|autumn)\b',
616 | r'\bthis\s+(day|week|month|year|summer|spring|winter|fall|autumn)\b',
617 | r'\b(january|february|march|april|may|june|july|august|september|october|november|december)\b',
618 | r'\bbetween\s+.+?\s+and\s+.+?(?:\s|$)',
619 | r'\bin\s+the\s+(morning|afternoon|evening|night|noon|midnight)\b',
620 | r'\brecent|lately|recently\b',
621 | r'\b\d{1,2}[/-]\d{1,2}(?:[/-]\d{2,4})?\b',
622 | r'\b\d{4}-\d{1,2}-\d{1,2}\b',
623 | r'\b(spring|summer|winter|fall|autumn|christmas|new\s*year|valentine|halloween|thanksgiving|spring\s*break|summer\s*break|winter\s*break)\b',
624 | r'\b(first|second)\s+half\s+of\s+\d{4}\b',
625 | r'\b(first|second|third|fourth|1st|2nd|3rd|4th)\s+quarter(?:\s+of\s+\d{4})?\b',
626 | r'\bfrom\s+.+\s+to\s+.+\b'
627 | ]
628 |
629 | # Combine all patterns
630 | combined_pattern = '|'.join(f'({expr})' for expr in time_expressions)
631 | combined_regex = re.compile(combined_pattern, re.IGNORECASE)
632 |
633 | # Find all matches
634 | matches = list(combined_regex.finditer(query))
635 | if not matches:
636 | return query, (None, None)
637 |
638 | # Extract the time expressions
639 | time_expressions = []
640 | for match in matches:
641 | span = match.span()
642 | expression = query[span[0]:span[1]]
643 | time_expressions.append(expression)
644 |
645 | # Parse time expressions to get timestamps
646 | full_time_expression = ' '.join(time_expressions)
647 | start_ts, end_ts = parse_time_expression(full_time_expression)
648 |
649 | # Remove time expressions from the query
650 | cleaned_query = query
651 | for expr in time_expressions:
652 | cleaned_query = cleaned_query.replace(expr, '')
653 |
654 | # Clean up multiple spaces
655 | cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip()
656 |
657 | return cleaned_query, (start_ts, end_ts)
```
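A short usage sketch of the parser above. The import path assumes the package is installed from the `src/` layout shown in this listing, and the query strings are illustrative only:

```python
# Illustrative use of parse_time_expression / extract_time_expression.
# Import path assumes the mcp_memory_service package is installed (src/ layout).
from datetime import datetime

from mcp_memory_service.utils.time_parser import (
    extract_time_expression,
    parse_time_expression,
)

# A pure time expression maps to a (start, end) pair of Unix timestamps.
start_ts, end_ts = parse_time_expression("2 days ago")
print(datetime.fromtimestamp(start_ts), "->", datetime.fromtimestamp(end_ts))

# A mixed query is split: the time phrase is stripped from the text and
# its timestamp range is returned alongside (previous Monday..Sunday here).
cleaned, (start_ts, end_ts) = extract_time_expression("docker setup notes from last week")
print(cleaned)           # -> "docker setup notes from"
print(start_ts, end_ts)  # -> bounds of the previous calendar week
```

If no recognizable time expression is present, both helpers return `None` timestamps, so callers can treat the time filter as optional.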