This is page 16 of 47. Use http://codebase.md/doobidoo/mcp-memory-service?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .claude
│ ├── agents
│ │ ├── amp-bridge.md
│ │ ├── amp-pr-automator.md
│ │ ├── code-quality-guard.md
│ │ ├── gemini-pr-automator.md
│ │ └── github-release-manager.md
│ ├── settings.local.json.backup
│ └── settings.local.json.local
├── .commit-message
├── .dockerignore
├── .env.example
├── .env.sqlite.backup
├── .envnn#
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── ISSUE_TEMPLATE
│ │ ├── bug_report.yml
│ │ ├── config.yml
│ │ ├── feature_request.yml
│ │ └── performance_issue.yml
│ ├── pull_request_template.md
│ └── workflows
│ ├── bridge-tests.yml
│ ├── CACHE_FIX.md
│ ├── claude-code-review.yml
│ ├── claude.yml
│ ├── cleanup-images.yml.disabled
│ ├── dev-setup-validation.yml
│ ├── docker-publish.yml
│ ├── LATEST_FIXES.md
│ ├── main-optimized.yml.disabled
│ ├── main.yml
│ ├── publish-and-test.yml
│ ├── README_OPTIMIZATION.md
│ ├── release-tag.yml.disabled
│ ├── release.yml
│ ├── roadmap-review-reminder.yml
│ ├── SECRET_CONDITIONAL_FIX.md
│ └── WORKFLOW_FIXES.md
├── .gitignore
├── .mcp.json.backup
├── .mcp.json.template
├── .pyscn
│ ├── .gitignore
│ └── reports
│ └── analyze_20251123_214224.html
├── AGENTS.md
├── archive
│ ├── deployment
│ │ ├── deploy_fastmcp_fixed.sh
│ │ ├── deploy_http_with_mcp.sh
│ │ └── deploy_mcp_v4.sh
│ ├── deployment-configs
│ │ ├── empty_config.yml
│ │ └── smithery.yaml
│ ├── development
│ │ └── test_fastmcp.py
│ ├── docs-removed-2025-08-23
│ │ ├── authentication.md
│ │ ├── claude_integration.md
│ │ ├── claude-code-compatibility.md
│ │ ├── claude-code-integration.md
│ │ ├── claude-code-quickstart.md
│ │ ├── claude-desktop-setup.md
│ │ ├── complete-setup-guide.md
│ │ ├── database-synchronization.md
│ │ ├── development
│ │ │ ├── autonomous-memory-consolidation.md
│ │ │ ├── CLEANUP_PLAN.md
│ │ │ ├── CLEANUP_README.md
│ │ │ ├── CLEANUP_SUMMARY.md
│ │ │ ├── dream-inspired-memory-consolidation.md
│ │ │ ├── hybrid-slm-memory-consolidation.md
│ │ │ ├── mcp-milestone.md
│ │ │ ├── multi-client-architecture.md
│ │ │ ├── test-results.md
│ │ │ └── TIMESTAMP_FIX_SUMMARY.md
│ │ ├── distributed-sync.md
│ │ ├── invocation_guide.md
│ │ ├── macos-intel.md
│ │ ├── master-guide.md
│ │ ├── mcp-client-configuration.md
│ │ ├── multi-client-server.md
│ │ ├── service-installation.md
│ │ ├── sessions
│ │ │ └── MCP_ENHANCEMENT_SESSION_MEMORY_v4.1.0.md
│ │ ├── UBUNTU_SETUP.md
│ │ ├── ubuntu.md
│ │ ├── windows-setup.md
│ │ └── windows.md
│ ├── docs-root-cleanup-2025-08-23
│ │ ├── AWESOME_LIST_SUBMISSION.md
│ │ ├── CLOUDFLARE_IMPLEMENTATION.md
│ │ ├── DOCUMENTATION_ANALYSIS.md
│ │ ├── DOCUMENTATION_CLEANUP_PLAN.md
│ │ ├── DOCUMENTATION_CONSOLIDATION_COMPLETE.md
│ │ ├── LITESTREAM_SETUP_GUIDE.md
│ │ ├── lm_studio_system_prompt.md
│ │ ├── PYTORCH_DOWNLOAD_FIX.md
│ │ └── README-ORIGINAL-BACKUP.md
│ ├── investigations
│ │ └── MACOS_HOOKS_INVESTIGATION.md
│ ├── litestream-configs-v6.3.0
│ │ ├── install_service.sh
│ │ ├── litestream_master_config_fixed.yml
│ │ ├── litestream_master_config.yml
│ │ ├── litestream_replica_config_fixed.yml
│ │ ├── litestream_replica_config.yml
│ │ ├── litestream_replica_simple.yml
│ │ ├── litestream-http.service
│ │ ├── litestream.service
│ │ └── requirements-cloudflare.txt
│ ├── release-notes
│ │ └── release-notes-v7.1.4.md
│ └── setup-development
│ ├── README.md
│ ├── setup_consolidation_mdns.sh
│ ├── STARTUP_SETUP_GUIDE.md
│ └── test_service.sh
├── CHANGELOG-HISTORIC.md
├── CHANGELOG.md
├── claude_commands
│ ├── memory-context.md
│ ├── memory-health.md
│ ├── memory-ingest-dir.md
│ ├── memory-ingest.md
│ ├── memory-recall.md
│ ├── memory-search.md
│ ├── memory-store.md
│ ├── README.md
│ └── session-start.md
├── claude-hooks
│ ├── config.json
│ ├── config.template.json
│ ├── CONFIGURATION.md
│ ├── core
│ │ ├── memory-retrieval.js
│ │ ├── mid-conversation.js
│ │ ├── session-end.js
│ │ ├── session-start.js
│ │ └── topic-change.js
│ ├── debug-pattern-test.js
│ ├── install_claude_hooks_windows.ps1
│ ├── install_hooks.py
│ ├── memory-mode-controller.js
│ ├── MIGRATION.md
│ ├── README-NATURAL-TRIGGERS.md
│ ├── README-phase2.md
│ ├── README.md
│ ├── simple-test.js
│ ├── statusline.sh
│ ├── test-adaptive-weights.js
│ ├── test-dual-protocol-hook.js
│ ├── test-mcp-hook.js
│ ├── test-natural-triggers.js
│ ├── test-recency-scoring.js
│ ├── tests
│ │ ├── integration-test.js
│ │ ├── phase2-integration-test.js
│ │ ├── test-code-execution.js
│ │ ├── test-cross-session.json
│ │ ├── test-session-tracking.json
│ │ └── test-threading.json
│ ├── utilities
│ │ ├── adaptive-pattern-detector.js
│ │ ├── context-formatter.js
│ │ ├── context-shift-detector.js
│ │ ├── conversation-analyzer.js
│ │ ├── dynamic-context-updater.js
│ │ ├── git-analyzer.js
│ │ ├── mcp-client.js
│ │ ├── memory-client.js
│ │ ├── memory-scorer.js
│ │ ├── performance-manager.js
│ │ ├── project-detector.js
│ │ ├── session-tracker.js
│ │ ├── tiered-conversation-monitor.js
│ │ └── version-checker.js
│ └── WINDOWS-SESSIONSTART-BUG.md
├── CLAUDE.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Development-Sprint-November-2025.md
├── docs
│ ├── amp-cli-bridge.md
│ ├── api
│ │ ├── code-execution-interface.md
│ │ ├── memory-metadata-api.md
│ │ ├── PHASE1_IMPLEMENTATION_SUMMARY.md
│ │ ├── PHASE2_IMPLEMENTATION_SUMMARY.md
│ │ ├── PHASE2_REPORT.md
│ │ └── tag-standardization.md
│ ├── architecture
│ │ ├── search-enhancement-spec.md
│ │ └── search-examples.md
│ ├── architecture.md
│ ├── archive
│ │ └── obsolete-workflows
│ │ ├── load_memory_context.md
│ │ └── README.md
│ ├── assets
│ │ └── images
│ │ ├── dashboard-v3.3.0-preview.png
│ │ ├── memory-awareness-hooks-example.png
│ │ ├── project-infographic.svg
│ │ └── README.md
│ ├── CLAUDE_CODE_QUICK_REFERENCE.md
│ ├── cloudflare-setup.md
│ ├── deployment
│ │ ├── docker.md
│ │ ├── dual-service.md
│ │ ├── production-guide.md
│ │ └── systemd-service.md
│ ├── development
│ │ ├── ai-agent-instructions.md
│ │ ├── code-quality
│ │ │ ├── phase-2a-completion.md
│ │ │ ├── phase-2a-handle-get-prompt.md
│ │ │ ├── phase-2a-index.md
│ │ │ ├── phase-2a-install-package.md
│ │ │ └── phase-2b-session-summary.md
│ │ ├── code-quality-workflow.md
│ │ ├── dashboard-workflow.md
│ │ ├── issue-management.md
│ │ ├── pr-review-guide.md
│ │ ├── refactoring-notes.md
│ │ ├── release-checklist.md
│ │ └── todo-tracker.md
│ ├── docker-optimized-build.md
│ ├── document-ingestion.md
│ ├── DOCUMENTATION_AUDIT.md
│ ├── enhancement-roadmap-issue-14.md
│ ├── examples
│ │ ├── analysis-scripts.js
│ │ ├── maintenance-session-example.md
│ │ ├── memory-distribution-chart.jsx
│ │ └── tag-schema.json
│ ├── first-time-setup.md
│ ├── glama-deployment.md
│ ├── guides
│ │ ├── advanced-command-examples.md
│ │ ├── chromadb-migration.md
│ │ ├── commands-vs-mcp-server.md
│ │ ├── mcp-enhancements.md
│ │ ├── mdns-service-discovery.md
│ │ ├── memory-consolidation-guide.md
│ │ ├── migration.md
│ │ ├── scripts.md
│ │ └── STORAGE_BACKENDS.md
│ ├── HOOK_IMPROVEMENTS.md
│ ├── hooks
│ │ └── phase2-code-execution-migration.md
│ ├── http-server-management.md
│ ├── ide-compatability.md
│ ├── IMAGE_RETENTION_POLICY.md
│ ├── images
│ │ └── dashboard-placeholder.md
│ ├── implementation
│ │ ├── health_checks.md
│ │ └── performance.md
│ ├── IMPLEMENTATION_PLAN_HTTP_SSE.md
│ ├── integration
│ │ ├── homebrew.md
│ │ └── multi-client.md
│ ├── integrations
│ │ ├── gemini.md
│ │ ├── groq-bridge.md
│ │ ├── groq-integration-summary.md
│ │ └── groq-model-comparison.md
│ ├── integrations.md
│ ├── legacy
│ │ └── dual-protocol-hooks.md
│ ├── LM_STUDIO_COMPATIBILITY.md
│ ├── maintenance
│ │ └── memory-maintenance.md
│ ├── mastery
│ │ ├── api-reference.md
│ │ ├── architecture-overview.md
│ │ ├── configuration-guide.md
│ │ ├── local-setup-and-run.md
│ │ ├── testing-guide.md
│ │ └── troubleshooting.md
│ ├── migration
│ │ └── code-execution-api-quick-start.md
│ ├── natural-memory-triggers
│ │ ├── cli-reference.md
│ │ ├── installation-guide.md
│ │ └── performance-optimization.md
│ ├── oauth-setup.md
│ ├── pr-graphql-integration.md
│ ├── quick-setup-cloudflare-dual-environment.md
│ ├── README.md
│ ├── remote-configuration-wiki-section.md
│ ├── research
│ │ ├── code-execution-interface-implementation.md
│ │ └── code-execution-interface-summary.md
│ ├── ROADMAP.md
│ ├── sqlite-vec-backend.md
│ ├── statistics
│ │ ├── charts
│ │ │ ├── activity_patterns.png
│ │ │ ├── contributors.png
│ │ │ ├── growth_trajectory.png
│ │ │ ├── monthly_activity.png
│ │ │ └── october_sprint.png
│ │ ├── data
│ │ │ ├── activity_by_day.csv
│ │ │ ├── activity_by_hour.csv
│ │ │ ├── contributors.csv
│ │ │ └── monthly_activity.csv
│ │ ├── generate_charts.py
│ │ └── REPOSITORY_STATISTICS.md
│ ├── technical
│ │ ├── development.md
│ │ ├── memory-migration.md
│ │ ├── migration-log.md
│ │ ├── sqlite-vec-embedding-fixes.md
│ │ └── tag-storage.md
│ ├── testing
│ │ └── regression-tests.md
│ ├── testing-cloudflare-backend.md
│ ├── troubleshooting
│ │ ├── cloudflare-api-token-setup.md
│ │ ├── cloudflare-authentication.md
│ │ ├── general.md
│ │ ├── hooks-quick-reference.md
│ │ ├── pr162-schema-caching-issue.md
│ │ ├── session-end-hooks.md
│ │ └── sync-issues.md
│ └── tutorials
│ ├── advanced-techniques.md
│ ├── data-analysis.md
│ └── demo-session-walkthrough.md
├── examples
│ ├── claude_desktop_config_template.json
│ ├── claude_desktop_config_windows.json
│ ├── claude-desktop-http-config.json
│ ├── config
│ │ └── claude_desktop_config.json
│ ├── http-mcp-bridge.js
│ ├── memory_export_template.json
│ ├── README.md
│ ├── setup
│ │ └── setup_multi_client_complete.py
│ └── start_https_example.sh
├── install_service.py
├── install.py
├── LICENSE
├── NOTICE
├── pyproject.toml
├── pytest.ini
├── README.md
├── run_server.py
├── scripts
│ ├── .claude
│ │ └── settings.local.json
│ ├── archive
│ │ └── check_missing_timestamps.py
│ ├── backup
│ │ ├── backup_memories.py
│ │ ├── backup_sqlite_vec.sh
│ │ ├── export_distributable_memories.sh
│ │ └── restore_memories.py
│ ├── benchmarks
│ │ ├── benchmark_code_execution_api.py
│ │ ├── benchmark_hybrid_sync.py
│ │ └── benchmark_server_caching.py
│ ├── database
│ │ ├── analyze_sqlite_vec_db.py
│ │ ├── check_sqlite_vec_status.py
│ │ ├── db_health_check.py
│ │ └── simple_timestamp_check.py
│ ├── development
│ │ ├── debug_server_initialization.py
│ │ ├── find_orphaned_files.py
│ │ ├── fix_mdns.sh
│ │ ├── fix_sitecustomize.py
│ │ ├── remote_ingest.sh
│ │ ├── setup-git-merge-drivers.sh
│ │ ├── uv-lock-merge.sh
│ │ └── verify_hybrid_sync.py
│ ├── hooks
│ │ └── pre-commit
│ ├── installation
│ │ ├── install_linux_service.py
│ │ ├── install_macos_service.py
│ │ ├── install_uv.py
│ │ ├── install_windows_service.py
│ │ ├── install.py
│ │ ├── setup_backup_cron.sh
│ │ ├── setup_claude_mcp.sh
│ │ └── setup_cloudflare_resources.py
│ ├── linux
│ │ ├── service_status.sh
│ │ ├── start_service.sh
│ │ ├── stop_service.sh
│ │ ├── uninstall_service.sh
│ │ └── view_logs.sh
│ ├── maintenance
│ │ ├── assign_memory_types.py
│ │ ├── check_memory_types.py
│ │ ├── cleanup_corrupted_encoding.py
│ │ ├── cleanup_memories.py
│ │ ├── cleanup_organize.py
│ │ ├── consolidate_memory_types.py
│ │ ├── consolidation_mappings.json
│ │ ├── delete_orphaned_vectors_fixed.py
│ │ ├── fast_cleanup_duplicates_with_tracking.sh
│ │ ├── find_all_duplicates.py
│ │ ├── find_cloudflare_duplicates.py
│ │ ├── find_duplicates.py
│ │ ├── memory-types.md
│ │ ├── README.md
│ │ ├── recover_timestamps_from_cloudflare.py
│ │ ├── regenerate_embeddings.py
│ │ ├── repair_malformed_tags.py
│ │ ├── repair_memories.py
│ │ ├── repair_sqlite_vec_embeddings.py
│ │ ├── repair_zero_embeddings.py
│ │ ├── restore_from_json_export.py
│ │ └── scan_todos.sh
│ ├── migration
│ │ ├── cleanup_mcp_timestamps.py
│ │ ├── legacy
│ │ │ └── migrate_chroma_to_sqlite.py
│ │ ├── mcp-migration.py
│ │ ├── migrate_sqlite_vec_embeddings.py
│ │ ├── migrate_storage.py
│ │ ├── migrate_tags.py
│ │ ├── migrate_timestamps.py
│ │ ├── migrate_to_cloudflare.py
│ │ ├── migrate_to_sqlite_vec.py
│ │ ├── migrate_v5_enhanced.py
│ │ ├── TIMESTAMP_CLEANUP_README.md
│ │ └── verify_mcp_timestamps.py
│ ├── pr
│ │ ├── amp_collect_results.sh
│ │ ├── amp_detect_breaking_changes.sh
│ │ ├── amp_generate_tests.sh
│ │ ├── amp_pr_review.sh
│ │ ├── amp_quality_gate.sh
│ │ ├── amp_suggest_fixes.sh
│ │ ├── auto_review.sh
│ │ ├── detect_breaking_changes.sh
│ │ ├── generate_tests.sh
│ │ ├── lib
│ │ │ └── graphql_helpers.sh
│ │ ├── quality_gate.sh
│ │ ├── resolve_threads.sh
│ │ ├── run_pyscn_analysis.sh
│ │ ├── run_quality_checks.sh
│ │ ├── thread_status.sh
│ │ └── watch_reviews.sh
│ ├── quality
│ │ ├── fix_dead_code_install.sh
│ │ ├── phase1_dead_code_analysis.md
│ │ ├── phase2_complexity_analysis.md
│ │ ├── README_PHASE1.md
│ │ ├── README_PHASE2.md
│ │ ├── track_pyscn_metrics.sh
│ │ └── weekly_quality_review.sh
│ ├── README.md
│ ├── run
│ │ ├── run_mcp_memory.sh
│ │ ├── run-with-uv.sh
│ │ └── start_sqlite_vec.sh
│ ├── run_memory_server.py
│ ├── server
│ │ ├── check_http_server.py
│ │ ├── check_server_health.py
│ │ ├── memory_offline.py
│ │ ├── preload_models.py
│ │ ├── run_http_server.py
│ │ ├── run_memory_server.py
│ │ ├── start_http_server.bat
│ │ └── start_http_server.sh
│ ├── service
│ │ ├── deploy_dual_services.sh
│ │ ├── install_http_service.sh
│ │ ├── mcp-memory-http.service
│ │ ├── mcp-memory.service
│ │ ├── memory_service_manager.sh
│ │ ├── service_control.sh
│ │ ├── service_utils.py
│ │ └── update_service.sh
│ ├── sync
│ │ ├── check_drift.py
│ │ ├── claude_sync_commands.py
│ │ ├── export_memories.py
│ │ ├── import_memories.py
│ │ ├── litestream
│ │ │ ├── apply_local_changes.sh
│ │ │ ├── enhanced_memory_store.sh
│ │ │ ├── init_staging_db.sh
│ │ │ ├── io.litestream.replication.plist
│ │ │ ├── manual_sync.sh
│ │ │ ├── memory_sync.sh
│ │ │ ├── pull_remote_changes.sh
│ │ │ ├── push_to_remote.sh
│ │ │ ├── README.md
│ │ │ ├── resolve_conflicts.sh
│ │ │ ├── setup_local_litestream.sh
│ │ │ ├── setup_remote_litestream.sh
│ │ │ ├── staging_db_init.sql
│ │ │ ├── stash_local_changes.sh
│ │ │ ├── sync_from_remote_noconfig.sh
│ │ │ └── sync_from_remote.sh
│ │ ├── README.md
│ │ ├── safe_cloudflare_update.sh
│ │ ├── sync_memory_backends.py
│ │ └── sync_now.py
│ ├── testing
│ │ ├── run_complete_test.py
│ │ ├── run_memory_test.sh
│ │ ├── simple_test.py
│ │ ├── test_cleanup_logic.py
│ │ ├── test_cloudflare_backend.py
│ │ ├── test_docker_functionality.py
│ │ ├── test_installation.py
│ │ ├── test_mdns.py
│ │ ├── test_memory_api.py
│ │ ├── test_memory_simple.py
│ │ ├── test_migration.py
│ │ ├── test_search_api.py
│ │ ├── test_sqlite_vec_embeddings.py
│ │ ├── test_sse_events.py
│ │ ├── test-connection.py
│ │ └── test-hook.js
│ ├── utils
│ │ ├── claude_commands_utils.py
│ │ ├── generate_personalized_claude_md.sh
│ │ ├── groq
│ │ ├── groq_agent_bridge.py
│ │ ├── list-collections.py
│ │ ├── memory_wrapper_uv.py
│ │ ├── query_memories.py
│ │ ├── smithery_wrapper.py
│ │ ├── test_groq_bridge.sh
│ │ └── uv_wrapper.py
│ └── validation
│ ├── check_dev_setup.py
│ ├── check_documentation_links.py
│ ├── diagnose_backend_config.py
│ ├── validate_configuration_complete.py
│ ├── validate_memories.py
│ ├── validate_migration.py
│ ├── validate_timestamp_integrity.py
│ ├── verify_environment.py
│ ├── verify_pytorch_windows.py
│ └── verify_torch.py
├── SECURITY.md
├── selective_timestamp_recovery.py
├── SPONSORS.md
├── src
│ └── mcp_memory_service
│ ├── __init__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── client.py
│ │ ├── operations.py
│ │ ├── sync_wrapper.py
│ │ └── types.py
│ ├── backup
│ │ ├── __init__.py
│ │ └── scheduler.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── ingestion.py
│ │ ├── main.py
│ │ └── utils.py
│ ├── config.py
│ ├── consolidation
│ │ ├── __init__.py
│ │ ├── associations.py
│ │ ├── base.py
│ │ ├── clustering.py
│ │ ├── compression.py
│ │ ├── consolidator.py
│ │ ├── decay.py
│ │ ├── forgetting.py
│ │ ├── health.py
│ │ └── scheduler.py
│ ├── dependency_check.py
│ ├── discovery
│ │ ├── __init__.py
│ │ ├── client.py
│ │ └── mdns_service.py
│ ├── embeddings
│ │ ├── __init__.py
│ │ └── onnx_embeddings.py
│ ├── ingestion
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── chunker.py
│ │ ├── csv_loader.py
│ │ ├── json_loader.py
│ │ ├── pdf_loader.py
│ │ ├── registry.py
│ │ ├── semtools_loader.py
│ │ └── text_loader.py
│ ├── lm_studio_compat.py
│ ├── mcp_server.py
│ ├── models
│ │ ├── __init__.py
│ │ └── memory.py
│ ├── server.py
│ ├── services
│ │ ├── __init__.py
│ │ └── memory_service.py
│ ├── storage
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── cloudflare.py
│ │ ├── factory.py
│ │ ├── http_client.py
│ │ ├── hybrid.py
│ │ └── sqlite_vec.py
│ ├── sync
│ │ ├── __init__.py
│ │ ├── exporter.py
│ │ ├── importer.py
│ │ └── litestream_config.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── cache_manager.py
│ │ ├── content_splitter.py
│ │ ├── db_utils.py
│ │ ├── debug.py
│ │ ├── document_processing.py
│ │ ├── gpu_detection.py
│ │ ├── hashing.py
│ │ ├── http_server_manager.py
│ │ ├── port_detection.py
│ │ ├── system_detection.py
│ │ └── time_parser.py
│ └── web
│ ├── __init__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── analytics.py
│ │ ├── backup.py
│ │ ├── consolidation.py
│ │ ├── documents.py
│ │ ├── events.py
│ │ ├── health.py
│ │ ├── manage.py
│ │ ├── mcp.py
│ │ ├── memories.py
│ │ ├── search.py
│ │ └── sync.py
│ ├── app.py
│ ├── dependencies.py
│ ├── oauth
│ │ ├── __init__.py
│ │ ├── authorization.py
│ │ ├── discovery.py
│ │ ├── middleware.py
│ │ ├── models.py
│ │ ├── registration.py
│ │ └── storage.py
│ ├── sse.py
│ └── static
│ ├── app.js
│ ├── index.html
│ ├── README.md
│ ├── sse_test.html
│ └── style.css
├── start_http_debug.bat
├── start_http_server.sh
├── test_document.txt
├── test_version_checker.js
├── tests
│ ├── __init__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── test_compact_types.py
│ │ └── test_operations.py
│ ├── bridge
│ │ ├── mock_responses.js
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ └── test_http_mcp_bridge.js
│ ├── conftest.py
│ ├── consolidation
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── test_associations.py
│ │ ├── test_clustering.py
│ │ ├── test_compression.py
│ │ ├── test_consolidator.py
│ │ ├── test_decay.py
│ │ └── test_forgetting.py
│ ├── contracts
│ │ └── api-specification.yml
│ ├── integration
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── test_api_key_fallback.py
│ │ ├── test_api_memories_chronological.py
│ │ ├── test_api_tag_time_search.py
│ │ ├── test_api_with_memory_service.py
│ │ ├── test_bridge_integration.js
│ │ ├── test_cli_interfaces.py
│ │ ├── test_cloudflare_connection.py
│ │ ├── test_concurrent_clients.py
│ │ ├── test_data_serialization_consistency.py
│ │ ├── test_http_server_startup.py
│ │ ├── test_mcp_memory.py
│ │ ├── test_mdns_integration.py
│ │ ├── test_oauth_basic_auth.py
│ │ ├── test_oauth_flow.py
│ │ ├── test_server_handlers.py
│ │ └── test_store_memory.py
│ ├── performance
│ │ ├── test_background_sync.py
│ │ └── test_hybrid_live.py
│ ├── README.md
│ ├── smithery
│ │ └── test_smithery.py
│ ├── sqlite
│ │ └── simple_sqlite_vec_test.py
│ ├── test_client.py
│ ├── test_content_splitting.py
│ ├── test_database.py
│ ├── test_hybrid_cloudflare_limits.py
│ ├── test_hybrid_storage.py
│ ├── test_memory_ops.py
│ ├── test_semantic_search.py
│ ├── test_sqlite_vec_storage.py
│ ├── test_time_parser.py
│ ├── test_timestamp_preservation.py
│ ├── timestamp
│ │ ├── test_hook_vs_manual_storage.py
│ │ ├── test_issue99_final_validation.py
│ │ ├── test_search_retrieval_inconsistency.py
│ │ ├── test_timestamp_issue.py
│ │ └── test_timestamp_simple.py
│ └── unit
│ ├── conftest.py
│ ├── test_cloudflare_storage.py
│ ├── test_csv_loader.py
│ ├── test_fastapi_dependencies.py
│ ├── test_import.py
│ ├── test_json_loader.py
│ ├── test_mdns_simple.py
│ ├── test_mdns.py
│ ├── test_memory_service.py
│ ├── test_memory.py
│ ├── test_semtools_loader.py
│ ├── test_storage_interface_compatibility.py
│ └── test_tag_time_filtering.py
├── tools
│ ├── docker
│ │ ├── DEPRECATED.md
│ │ ├── docker-compose.http.yml
│ │ ├── docker-compose.pythonpath.yml
│ │ ├── docker-compose.standalone.yml
│ │ ├── docker-compose.uv.yml
│ │ ├── docker-compose.yml
│ │ ├── docker-entrypoint-persistent.sh
│ │ ├── docker-entrypoint-unified.sh
│ │ ├── docker-entrypoint.sh
│ │ ├── Dockerfile
│ │ ├── Dockerfile.glama
│ │ ├── Dockerfile.slim
│ │ ├── README.md
│ │ └── test-docker-modes.sh
│ └── README.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/archive/docs-removed-2025-08-23/claude-code-integration.md:
--------------------------------------------------------------------------------
```markdown
1 | # Using MCP Memory Service with Claude Code
2 |
3 | This guide explains how to integrate the MCP Memory Service with Claude Code, providing two powerful approaches for using persistent memory capabilities in the Claude CLI environment.
4 |
5 | ## Prerequisites
6 |
7 | Before you begin, ensure you have:
8 |
9 | 1. Installed the [Claude Code](https://www.anthropic.com/news/introducing-claude-code) CLI tool
10 | 2. Set up the MCP Memory Service on your machine
11 | 3. Basic familiarity with command-line interfaces
12 |
13 | ## Integration Approaches
14 |
15 | The MCP Memory Service offers **two integration methods** with Claude Code:
16 |
17 | ### 🎯 Method 1: Conversational Commands (Recommended)
18 | **New in v2.2.0** - Direct command syntax following the CCPlugins pattern
19 |
20 | ### 🔧 Method 2: MCP Server Registration
21 | Traditional MCP server approach for deep integration
22 |
23 | ---
24 |
25 | ## Method 1: Conversational Commands (v2.2.0)
26 |
27 | The **conversational commands approach** provides direct memory operations through Claude Code commands that follow the CCPlugins pattern. This is the **recommended approach** for most users as it provides immediate access to memory operations without MCP server configuration.
28 |
29 | ### Installation
30 |
31 | The commands can be installed during the main MCP Memory Service installation:
32 |
33 | ```bash
34 | # Install with commands (will prompt if Claude Code CLI is detected)
35 | python install.py
36 |
37 | # Force install commands without prompting
38 | python install.py --install-claude-commands
39 |
40 | # Skip command installation prompt
41 | python install.py --skip-claude-commands-prompt
42 | ```
43 |
44 | Or install them manually:
45 |
46 | ```bash
47 | # Install commands directly
48 | python scripts/claude_commands_utils.py
49 |
50 | # Test installation prerequisites
51 | python scripts/claude_commands_utils.py --test
52 |
53 | # Uninstall commands
54 | python scripts/claude_commands_utils.py --uninstall
55 | ```
56 |
57 | ### Available Commands
58 |
59 | #### `/memory-store` - Store Information with Context
60 | Store information in your memory service with automatic context detection and smart tagging.
61 |
62 | ```bash
63 | claude /memory-store "Important architectural decision about database backend"
64 | claude /memory-store --tags "decision,architecture" "We chose SQLite-vec for performance"
65 | claude /memory-store --type "note" "Remember to update Docker configuration"
66 | ```
67 |
68 | **Features:**
69 | - Automatic project context detection from current directory
70 | - Smart tag generation based on file types and git repository
71 | - Session context integration
72 | - Metadata enrichment with timestamps and paths
73 |
74 | #### `/memory-recall` - Time-based Memory Retrieval
75 | Retrieve memories using natural language time expressions.
76 |
77 | ```bash
78 | claude /memory-recall "what did we decide about the database last week?"
79 | claude /memory-recall "yesterday's architectural discussions"
80 | claude /memory-recall --project "mcp-memory-service" "last month's progress"
81 | ```
82 |
83 | **Features:**
84 | - Natural language time parsing ("yesterday", "last Tuesday", "two months ago")
85 | - Context-aware filtering based on current project
86 | - Temporal relevance scoring
87 | - Seasonal and event-based queries
88 |
89 | #### `/memory-search` - Tag and Content Search
90 | Search through stored memories using tags, content keywords, and semantic similarity.
91 |
92 | ```bash
93 | claude /memory-search --tags "architecture,database"
94 | claude /memory-search "SQLite performance optimization"
95 | claude /memory-search --project "current" --type "decision"
96 | ```
97 |
98 | **Features:**
99 | - Tag-based filtering with partial matching
100 | - Semantic content search using embeddings
101 | - Combined query support (tags + content)
102 | - Relevance scoring and ranking
103 |
104 | #### `/memory-context` - Session Context Integration
105 | Capture the current conversation and project context as a memory.
106 |
107 | ```bash
108 | claude /memory-context
109 | claude /memory-context --summary "Architecture planning session"
110 | claude /memory-context --include-files --include-commits
111 | ```
112 |
113 | **Features:**
114 | - Automatic session analysis and summarization
115 | - Git repository state capture
116 | - File change detection and inclusion
117 | - Key decision and insight extraction
118 |
119 | #### `/memory-health` - Service Health and Diagnostics
120 | Check the health and status of your MCP Memory Service.
121 |
122 | ```bash
123 | claude /memory-health
124 | claude /memory-health --detailed
125 | claude /memory-health --test-operations
126 | ```
127 |
128 | **Features:**
129 | - Service connectivity verification
130 | - Database health and statistics
131 | - Performance metrics and diagnostics
132 | - Auto-discovery testing and troubleshooting
133 |
134 | ### Command Features
135 |
136 | - **Auto-Discovery**: Commands automatically locate your MCP Memory Service via mDNS
137 | - **Context Awareness**: Understand current project, git repository, and session state
138 | - **Error Recovery**: Graceful handling when memory service is unavailable
139 | - **Cross-Platform**: Full support for Windows, macOS, and Linux
140 | - **Backend Agnostic**: Works with both ChromaDB and SQLite-vec storage backends
141 |
142 | ### Example Workflow
143 |
144 | ```bash
145 | # Start a development session
146 | claude /memory-context --summary "Starting work on mDNS integration"
147 |
148 | # Store important decisions during development
149 | claude /memory-store --tags "mDNS,architecture" "Decided to use zeroconf library for service discovery"
150 |
151 | # Continue development work...
152 |
153 | # Later, recall what was decided
154 | claude /memory-recall "what did we decide about mDNS last week?"
155 |
156 | # Search for related technical information
157 | claude /memory-search --tags "mDNS,zeroconf"
158 |
159 | # Check if everything is working correctly
160 | claude /memory-health
161 | ```
162 |
163 | ---
164 |
165 | ## Method 2: MCP Server Registration
166 |
167 | For users who prefer the traditional MCP server approach or need deeper integration, you can register the MCP Memory Service directly with Claude Code.
168 |
169 | ### Registering the Memory Service with Claude Code
170 |
171 | You can register the MCP Memory Service to work with Claude Code using the `claude mcp add` command.
172 |
173 | ### Check Existing MCP Servers
174 |
175 | To see which MCP servers are already registered with Claude:
176 |
177 | ```bash
178 | claude mcp list
179 | ```
180 |
181 | ### Add the Memory Service
182 |
183 | To add the memory service that's running on your local machine:
184 |
185 | ```bash
186 | claude mcp add memory-service spawn -- /path/to/your/command
187 | ```
188 |
189 | For example, if you've installed the memory service using UV (recommended):
190 |
191 | ```bash
192 | claude mcp add memory-service spawn -- /opt/homebrew/bin/uv --directory /Users/yourusername/path/to/mcp-memory-service run memory
193 | ```
194 |
195 | Replace the path elements with the actual paths on your system.
196 |
197 | ## Example Configuration
198 |
199 | Here's a real-world example of adding the memory service to Claude Code:
200 |
201 | ```bash
202 | claude mcp add memory-service spawn -- /opt/homebrew/bin/uv --directory /Users/yourusername/Documents/GitHub/mcp-memory-service run memory
203 | ```
204 |
205 | This command:
206 | 1. Registers a new MCP server named "memory-service"
207 | 2. Uses the "spawn" transport method, which runs the command when needed
208 | 3. Specifies the full path to the UV command
209 | 4. Sets the working directory to your mcp-memory-service location
210 | 5. Runs the "memory" module
211 |
212 | After running this command, you should see a confirmation message like:
213 |
214 | ```
215 | Added stdio MCP server memory-service with command: spawn /opt/homebrew/bin/uv --directory /Users/yourusername/Documents/GitHub/mcp-memory-service run memory to local config
216 | ```
217 |
218 | ## Using Memory Functions in Claude Code
219 |
220 | Once registered, you can use the memory service directly in your conversations with Claude Code. The memory functions available include:
221 |
222 | - Storing memories
223 | - Retrieving memories based on semantic search
224 | - Recalling information from specific time periods
225 | - Searching by tags
226 | - And many more
227 |
228 | ---
229 |
230 | ## Troubleshooting
231 |
232 | ### For Conversational Commands (Method 1)
233 |
234 | If you encounter issues with the commands:
235 |
236 | 1. **Commands Not Available**:
237 | - Verify Claude Code CLI is installed: `claude --version`
238 | - Check commands are installed: `ls ~/.claude/commands/memory-*.md`
239 | - Reinstall commands: `python scripts/claude_commands_utils.py`
240 |
241 | 2. **MCP Service Connection Issues**:
242 | - Verify MCP Memory Service is running: `memory --help`
243 | - Check service health: `claude /memory-health`
244 | - Ensure service is discoverable via mDNS
245 |
246 | 3. **Permission Issues**:
247 | - Check commands directory permissions: `ls -la ~/.claude/commands/`
248 | - Ensure write access to the commands directory
249 |
250 | ### For MCP Server Registration (Method 2)
251 |
252 | If you encounter issues with MCP server registration:
253 |
254 | 1. Verify the memory service is running properly as a standalone application
255 | 2. Check that the paths in your `claude mcp add` command are correct
256 | 3. Ensure you have the necessary permissions to execute the specified commands
257 | 4. Try running `claude mcp list` to verify the server was added correctly
258 |
259 | ## Which Method Should You Use?
260 |
261 | ### Choose **Conversational Commands (Method 1)** if:
262 | - ✅ You want quick setup with minimal configuration
263 | - ✅ You prefer direct command syntax (`claude /memory-store`)
264 | - ✅ You want automatic service discovery and context awareness
265 | - ✅ You're new to MCP and want the simplest approach
266 | - ✅ You want CCPlugins-compatible command integration
267 |
268 | ### Choose **MCP Server Registration (Method 2)** if:
269 | - ✅ You need deep integration with Claude Code's MCP system
270 | - ✅ You want to use the service alongside other MCP servers
271 | - ✅ You prefer traditional MCP tool-based interactions
272 | - ✅ You need maximum control over the server configuration
273 | - ✅ You're building complex multi-server MCP workflows
274 |
275 | ## Benefits of Claude Code Integration
276 |
277 | Both integration methods provide powerful advantages:
278 |
279 | ### Core Benefits
280 | 1. **Persistent Memory**: Your conversations and stored information persist across sessions
281 | 2. **Semantic Search**: Claude can retrieve relevant information even when your query is phrased differently from how it was stored
282 | 3. **Temporal Recall**: Ask about information from specific time periods (e.g., "last week", "yesterday")
283 | 4. **Organized Knowledge**: Use tags to categorize and later retrieve information by category
284 | 5. **Project Context**: Commands understand your current project and development context
285 |
286 | ### Method 1 (Commands) Additional Benefits
287 | - **Immediate Access**: Direct command syntax without MCP server complexity
288 | - **Context Integration**: Automatic project and git repository detection
289 | - **Error Recovery**: Graceful fallback when service is unavailable
290 | - **User-Friendly**: Conversational interface following established patterns
291 |
292 | ### Method 2 (MCP Server) Additional Benefits
293 | - **Deep Integration**: Full MCP protocol support with rich tool interactions
294 | - **Flexible Configuration**: Advanced server configuration options
295 | - **Multi-Server Support**: Seamless integration with other MCP servers
296 | - **Protocol Compliance**: Standard MCP tool-based interactions
297 |
298 | This integration transforms Claude Code into a powerful knowledge management system with long-term memory capabilities, making your development workflow more efficient and context-aware.
299 |
```
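A quick way to validate the troubleshooting steps from the guide above (check `claude --version`, look for `~/.claude/commands/memory-*.md`, reinstall via `python scripts/claude_commands_utils.py`) is to script them. The helper below is a hypothetical sketch, not part of the repository; it only automates those documented checks and assumes the default commands directory.

```python
# Hypothetical helper (not in the repo): scripts the documented checks for the
# conversational-commands installation described in the guide above.
import shutil
import subprocess
from pathlib import Path


def check_claude_commands() -> bool:
    """Return True if the Claude Code CLI and the memory commands look installed."""
    claude = shutil.which("claude")
    if claude is None:
        print("Claude Code CLI not found on PATH (install it, then retry `claude --version`)")
        return False
    subprocess.run([claude, "--version"], check=False)  # mirrors the guide's first check

    commands_dir = Path.home() / ".claude" / "commands"  # default location per the guide
    memory_cmds = sorted(commands_dir.glob("memory-*.md"))
    if not memory_cmds:
        print("No memory-*.md commands found; reinstall with: python scripts/claude_commands_utils.py")
        return False
    print(f"Found {len(memory_cmds)} memory commands in {commands_dir}")
    return True


if __name__ == "__main__":
    check_claude_commands()
```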
--------------------------------------------------------------------------------
/tests/test_content_splitting.py:
--------------------------------------------------------------------------------
```python
1 | # Copyright 2024 Heinrich Krupp
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Tests for content splitting and backend-specific length limits.
17 |
18 | Tests cover:
19 | - Content splitting utility functions
20 | - Backend limit enforcement
21 | - Automatic chunking with metadata
22 | - Boundary preservation (sentences, paragraphs, code blocks)
23 | - Overlap between chunks for context preservation
24 | """
25 |
26 | import pytest
27 | from src.mcp_memory_service.utils.content_splitter import (
28 | split_content,
29 | estimate_chunks_needed,
30 | validate_chunk_lengths,
31 | _find_best_split_point
32 | )
33 |
34 |
35 | class TestContentSplitter:
36 | """Test the content_splitter utility module."""
37 |
38 | def test_split_short_content(self):
39 | """Content shorter than max_length should not be split."""
40 | content = "This is a short sentence."
41 | chunks = split_content(content, max_length=100)
42 |
43 | assert len(chunks) == 1
44 | assert chunks[0] == content
45 |
46 | def test_split_long_content_character_mode(self):
47 | """Test character-based splitting without boundary preservation."""
48 | content = "a" * 500
49 | chunks = split_content(content, max_length=100, preserve_boundaries=False, overlap=10)
50 |
51 | # Should create multiple chunks
52 | assert len(chunks) > 1
53 | # All chunks should be <= max_length
54 | assert all(len(chunk) <= 100 for chunk in chunks)
55 | # Should have overlap
56 | assert chunks[1].startswith(chunks[0][-10:])
57 |
58 | def test_split_preserves_paragraphs(self):
59 | """Test that paragraph boundaries are preferred for splitting."""
60 | content = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
61 | chunks = split_content(content, max_length=30, preserve_boundaries=True)
62 |
63 | # Should split at paragraph boundaries
64 | assert len(chunks) >= 2
65 | # Each chunk should end cleanly (no mid-paragraph cuts)
66 | for chunk in chunks[:-1]: # Check all but last chunk
67 | assert chunk.strip().endswith('.') or '\n\n' in chunk
68 |
69 | def test_split_preserves_sentences(self):
70 | """Test that sentence boundaries are preferred when paragraphs don't fit."""
71 | content = "First sentence. Second sentence. Third sentence. Fourth sentence."
72 | chunks = split_content(content, max_length=40, preserve_boundaries=True)
73 |
74 | # Should split at sentence boundaries
75 | assert len(chunks) >= 2
76 | # Most chunks should end with period
77 | period_endings = sum(1 for chunk in chunks if chunk.strip().endswith('.'))
78 | assert period_endings >= len(chunks) - 1
79 |
80 | def test_split_preserves_words(self):
81 | """Test that word boundaries are preferred when sentences don't fit."""
82 | content = "word1 word2 word3 word4 word5 word6 word7 word8"
83 | chunks = split_content(content, max_length=25, preserve_boundaries=True)
84 |
85 | # Should split at word boundaries
86 | assert len(chunks) >= 2
87 | # No chunk should end mid-word (except possibly last)
88 | for chunk in chunks[:-1]:
89 | # Should not end with partial word (will end with space or be complete)
90 | assert chunk.endswith(' ') or chunk == chunks[-1]
91 |
92 | def test_split_overlap(self):
93 | """Test that chunks have proper overlap for context."""
94 | content = "The quick brown fox jumps over the lazy dog. " * 10
95 | chunks = split_content(content, max_length=100, preserve_boundaries=True, overlap=20)
96 |
97 | assert len(chunks) > 1
98 | # Check that consecutive chunks have overlap
99 | for i in range(len(chunks) - 1):
100 | # The next chunk should contain some content from the end of current chunk
101 | current_end = chunks[i][-20:]
102 | assert any(word in chunks[i+1] for word in current_end.split()[:3])
103 |
104 | def test_estimate_chunks_needed(self):
105 | """Test chunk estimation function."""
106 | # Basic cases without overlap
107 | assert estimate_chunks_needed(0, 100) == 0
108 | assert estimate_chunks_needed(100, 100) == 1
109 | assert estimate_chunks_needed(200, 100) == 2
110 | assert estimate_chunks_needed(250, 100) == 3
111 |
112 | # Cases with overlap
113 | assert estimate_chunks_needed(100, 100, overlap=10) == 1 # Fits in one chunk
114 | assert estimate_chunks_needed(150, 100, overlap=10) == 2 # First chunk 100, second chunk covers remaining 50
115 | assert estimate_chunks_needed(200, 100, overlap=50) == 3 # Effective chunk size is 50
116 |
117 | # Edge cases
118 | assert estimate_chunks_needed(100, 100, overlap=100) == 1 # Invalid overlap, fallback to simple division
119 | assert estimate_chunks_needed(100, 100, overlap=150) == 1 # Invalid overlap larger than max_length
120 |
121 | def test_validate_chunk_lengths(self):
122 | """Test chunk length validation."""
123 | valid_chunks = ["short", "also short", "still short"]
124 | invalid_chunks = ["short", "this is way too long for the limit", "short"]
125 |
126 | assert validate_chunk_lengths(valid_chunks, max_length=50) is True
127 | assert validate_chunk_lengths(invalid_chunks, max_length=20) is False
128 |
129 | def test_find_best_split_point_paragraph(self):
130 | """Test that paragraph breaks are prioritized."""
131 | text = "First para.\n\nSecond para.\n\nThird para."
132 | split_point = _find_best_split_point(text, max_length=25)
133 |
134 | # Should split at first paragraph break
135 | assert text[split_point-2:split_point] == '\n\n'
136 |
137 | def test_find_best_split_point_sentence(self):
138 | """Test that sentence boundaries are used when no paragraph breaks."""
139 | text = "First sentence. Second sentence. Third sentence."
140 | split_point = _find_best_split_point(text, max_length=30)
141 |
142 | # Should split at sentence boundary
143 | assert '. ' in text[:split_point]
144 |
145 | def test_split_empty_content(self):
146 | """Test handling of empty content."""
147 | chunks = split_content("", max_length=100)
148 | assert chunks == []
149 |
150 | def test_split_exact_length(self):
151 | """Test content exactly at max_length."""
152 | content = "a" * 100
153 | chunks = split_content(content, max_length=100)
154 |
155 | assert len(chunks) == 1
156 | assert chunks[0] == content
157 |
158 | def test_split_code_blocks(self):
159 | """Test that code blocks are handled reasonably."""
160 | content = """def function_one():
161 | return True
162 |
163 | def function_two():
164 | return False
165 |
166 | def function_three():
167 | return None"""
168 |
169 | chunks = split_content(content, max_length=60, preserve_boundaries=True)
170 |
171 | # Should split at paragraph/function boundaries
172 | assert len(chunks) >= 2
173 | # Each chunk should contain complete functions ideally
174 | for chunk in chunks:
175 | # Count function definitions
176 | if 'def ' in chunk:
177 | # If it has a def, it should have a return (complete function)
178 | assert 'return' in chunk or chunk == chunks[-1]
179 |
180 |
181 | class TestBackendLimits:
182 | """Test backend-specific content length limits."""
183 |
184 | def test_cloudflare_limit(self):
185 | """Test that Cloudflare backend uses config constant."""
186 | from src.mcp_memory_service.storage.cloudflare import CloudflareStorage
187 | from src.mcp_memory_service.config import CLOUDFLARE_MAX_CONTENT_LENGTH
188 |
189 | # Verify the class constant matches config
190 | assert CloudflareStorage._MAX_CONTENT_LENGTH == CLOUDFLARE_MAX_CONTENT_LENGTH
191 |
192 | def test_sqlitevec_unlimited(self):
193 | """Test that SQLite-vec backend uses config constant."""
194 | from src.mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
195 | from src.mcp_memory_service.config import SQLITEVEC_MAX_CONTENT_LENGTH
196 |
197 | # Create a mock instance to check property
198 | import tempfile
199 | import os
200 |
201 | with tempfile.TemporaryDirectory() as tmpdir:
202 | db_path = os.path.join(tmpdir, "test.db")
203 | storage = SqliteVecMemoryStorage(db_path=db_path)
204 |
205 | # Should return configured value (default: None/unlimited)
206 | assert storage.max_content_length == SQLITEVEC_MAX_CONTENT_LENGTH
207 | assert storage.supports_chunking is True
208 |
209 | def test_hybrid_follows_config(self):
210 | """Test that Hybrid backend uses config constant."""
211 | from src.mcp_memory_service.storage.hybrid import HybridMemoryStorage
212 | from src.mcp_memory_service.config import HYBRID_MAX_CONTENT_LENGTH
213 | import tempfile
214 | import os
215 |
216 | with tempfile.TemporaryDirectory() as tmpdir:
217 | db_path = os.path.join(tmpdir, "test.db")
218 | storage = HybridMemoryStorage(
219 | sqlite_db_path=db_path,
220 | cloudflare_config=None # No cloud sync for this test
221 | )
222 |
223 | # Should match configured hybrid limit
224 | assert storage.max_content_length == HYBRID_MAX_CONTENT_LENGTH
225 | assert storage.supports_chunking is True
226 |
227 |
228 | class TestConfigurationConstants:
229 | """Test configuration constants for content limits."""
230 |
231 | def test_config_constants_exist(self):
232 | """Test that all content limit constants are defined."""
233 | from src.mcp_memory_service.config import (
234 | CLOUDFLARE_MAX_CONTENT_LENGTH,
235 | SQLITEVEC_MAX_CONTENT_LENGTH,
236 | HYBRID_MAX_CONTENT_LENGTH,
237 | ENABLE_AUTO_SPLIT,
238 | CONTENT_SPLIT_OVERLAP,
239 | CONTENT_PRESERVE_BOUNDARIES
240 | )
241 |
242 | assert CLOUDFLARE_MAX_CONTENT_LENGTH == 800
243 | assert SQLITEVEC_MAX_CONTENT_LENGTH is None # Unlimited
244 | assert HYBRID_MAX_CONTENT_LENGTH == CLOUDFLARE_MAX_CONTENT_LENGTH
245 | assert isinstance(ENABLE_AUTO_SPLIT, bool)
246 | assert isinstance(CONTENT_SPLIT_OVERLAP, int)
247 | assert isinstance(CONTENT_PRESERVE_BOUNDARIES, bool)
248 |
249 | def test_config_validation(self):
250 | """Test that config values are sensible."""
251 | from src.mcp_memory_service.config import (
252 | CLOUDFLARE_MAX_CONTENT_LENGTH,
253 | CONTENT_SPLIT_OVERLAP
254 | )
255 |
256 | # Limits should be positive
257 | assert CLOUDFLARE_MAX_CONTENT_LENGTH > 0
258 |
259 | # Overlap should be reasonable
260 | assert 0 <= CONTENT_SPLIT_OVERLAP <= 500
261 |
262 |
263 | if __name__ == "__main__":
264 | pytest.main([__file__, "-v"])
265 |
```
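The tests above pin down the public surface of `content_splitter`: `split_content(content, max_length=..., preserve_boundaries=..., overlap=...)`, `estimate_chunks_needed(length, max_length, overlap=...)`, and `validate_chunk_lengths(chunks, max_length=...)`, plus the 800-character Cloudflare limit. The sketch below is a usage illustration assuming only those signatures and that the package is importable without the `src.` prefix (i.e. installed; the tests import via `src.mcp_memory_service...`).

```python
# Usage sketch based only on the signatures exercised in the tests above.
# Assumes the package is installed as `mcp_memory_service` (the tests import via `src.`).
from mcp_memory_service.utils.content_splitter import (
    split_content,
    estimate_chunks_needed,
    validate_chunk_lengths,
)

MAX_LEN = 800   # mirrors CLOUDFLARE_MAX_CONTENT_LENGTH asserted in TestConfigurationConstants
OVERLAP = 20    # small overlap so neighbouring chunks share context

note = "Architectural decision: we chose SQLite-vec for local storage. " * 40

print("estimated chunks:", estimate_chunks_needed(len(note), MAX_LEN, overlap=OVERLAP))

chunks = split_content(note, max_length=MAX_LEN, preserve_boundaries=True, overlap=OVERLAP)
assert validate_chunk_lengths(chunks, max_length=MAX_LEN)

for i, chunk in enumerate(chunks, 1):
    print(f"chunk {i}: {len(chunk)} chars, ends with {chunk.strip()[-20:]!r}")
```

As the `TestBackendLimits` cases show, each storage backend advertises its own `max_content_length`, so callers can take the limit from the backend instance instead of hard-coding it.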
--------------------------------------------------------------------------------
/src/mcp_memory_service/api/client.py:
--------------------------------------------------------------------------------
```python
1 | # Copyright 2024 Heinrich Krupp
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Storage client wrapper for code execution interface.
17 |
18 | Provides global storage instance management with lazy initialization
19 | and automatic connection reuse for optimal performance.
20 |
21 | Design Goals:
22 | - Single storage instance per process (avoid redundant connections)
23 | - Lazy initialization (create on first use)
24 | - Thread-safe access to global instance
25 | - Automatic cleanup on process exit
26 | - Graceful error handling with fallbacks
27 |
28 | Performance:
29 | - First call: ~50ms (initialization + connection)
30 | - Subsequent calls: ~0ms (reuses connection)
31 | - Memory overhead: ~10MB for embedding model cache
32 | """
33 |
34 | import asyncio
35 | import logging
36 | import os
37 | from typing import Optional
38 | from ..storage.base import MemoryStorage
39 | from ..storage.factory import create_storage_instance
40 | from ..config import DATABASE_PATH, get_base_directory
41 |
42 | logger = logging.getLogger(__name__)
43 |
44 | # Global storage instance (module-level singleton)
45 | _storage_instance: Optional[MemoryStorage] = None
46 | _initialization_lock = asyncio.Lock()
47 |
48 | # Global consolidation instances (set by HTTP server)
49 | _consolidator_instance: Optional["DreamInspiredConsolidator"] = None
50 | _scheduler_instance: Optional["ConsolidationScheduler"] = None
51 |
52 |
53 | async def _get_storage_async() -> MemoryStorage:
54 | """
55 | Get or create storage backend instance (async version).
56 |
57 | This function implements lazy initialization with connection reuse:
58 | 1. Returns existing instance if available
59 | 2. Creates new instance if none exists
60 | 3. Initializes storage backend on first call
61 | 4. Reuses connection for subsequent calls
62 |
63 | Thread Safety:
64 | Uses asyncio.Lock to prevent race conditions during initialization.
65 |
66 | Returns:
67 | Initialized MemoryStorage instance
68 |
69 | Raises:
70 | RuntimeError: If storage initialization fails
71 | """
72 | global _storage_instance
73 |
74 | # Fast path: return existing instance
75 | if _storage_instance is not None:
76 | return _storage_instance
77 |
78 | # Slow path: create new instance with lock
79 | async with _initialization_lock:
80 | # Double-check after acquiring lock (another coroutine may have initialized)
81 | if _storage_instance is not None:
82 | return _storage_instance
83 |
84 | try:
85 | logger.info("Initializing storage backend for code execution API...")
86 |
87 | # Determine SQLite database path
88 | db_path = DATABASE_PATH
89 | if not db_path:
90 | # Fallback to cross-platform default path
91 | base_dir = get_base_directory()
92 | db_path = os.path.join(base_dir, "sqlite_vec.db")
93 | logger.warning(f"DATABASE_PATH not configured, using default: {db_path}")
94 |
95 | # Ensure database directory exists
96 | db_dir = os.path.dirname(db_path)
97 | if db_dir and not os.path.exists(db_dir):
98 | os.makedirs(db_dir, exist_ok=True)
99 | logger.info(f"Created database directory: {db_dir}")
100 |
101 | # Create and initialize storage instance
102 | _storage_instance = await create_storage_instance(db_path)
103 |
104 | logger.info(f"Storage backend initialized: {type(_storage_instance).__name__}")
105 | return _storage_instance
106 |
107 | except Exception as e:
108 | logger.error(f"Failed to initialize storage backend: {e}")
109 | raise RuntimeError(f"Storage initialization failed: {e}") from e
110 |
111 |
112 | async def get_storage_async() -> MemoryStorage:
113 | """
114 | Get storage backend instance (async version).
115 |
116 | This is the internal async version that should be used within
117 | async contexts. For synchronous contexts, the sync_wrapper
118 | will handle the event loop management.
119 |
120 | Returns:
121 | Initialized MemoryStorage instance
122 |
123 | Raises:
124 | RuntimeError: If storage initialization fails
125 | """
126 | return await _get_storage_async()
127 |
128 |
129 | def get_storage() -> MemoryStorage:
130 | """
131 | Get storage backend instance (synchronous wrapper).
132 |
133 | This is the primary entry point for code execution API operations.
134 | It wraps the async initialization in a synchronous interface for
135 | ease of use in non-async contexts.
136 |
137 | Connection Reuse:
138 | - First call: ~50ms (initialization)
139 | - Subsequent calls: ~0ms (returns cached instance)
140 |
141 | Returns:
142 | Initialized MemoryStorage instance
143 |
144 | Raises:
145 | RuntimeError: If storage initialization fails
146 |
147 | Example:
148 | >>> storage = get_storage()
149 | >>> # Use storage for operations
150 | >>> results = await storage.retrieve("query", n_results=5)
151 | """
152 | global _storage_instance
153 |
154 | # Fast path: if already initialized, return immediately
155 | if _storage_instance is not None:
156 | return _storage_instance
157 |
158 | # Need to initialize - this requires an event loop
159 | try:
160 | # Check if we're already in an async context
161 |         try:
162 |             asyncio.get_running_loop()
163 |         except RuntimeError:
164 |             # No running loop; we can proceed with synchronous initialization
165 |             pass
166 |         else:
167 |             # We're in an async context; run_until_complete would fail here
168 |             logger.error("get_storage() called from async context - use get_storage_async() instead")
169 |             raise RuntimeError("get_storage() cannot be called from async context")
170 |
171 | # Get or create event loop
172 | try:
173 | loop = asyncio.get_event_loop()
174 | if loop.is_closed():
175 | loop = asyncio.new_event_loop()
176 | asyncio.set_event_loop(loop)
177 | except RuntimeError:
178 | loop = asyncio.new_event_loop()
179 | asyncio.set_event_loop(loop)
180 |
181 | # Run async initialization
182 | storage = loop.run_until_complete(_get_storage_async())
183 | return storage
184 |
185 | except Exception as e:
186 | logger.error(f"Error getting storage instance: {e}")
187 | raise
188 |
189 |
190 | def close() -> None:
191 | """
192 | Close and clean up storage resources.
193 |
194 | Explicitly closes the storage backend connection and clears
195 | the global instance. This ensures proper cleanup when the
196 | process is terminating or when you want to force a reconnection.
197 |
198 | After calling close(), the next call to get_storage() will
199 | create a fresh connection.
200 |
201 | Note:
202 | If the storage backend has an async close() method, it will
203 | be scheduled but not awaited. For proper async cleanup, use
204 | close_async() instead.
205 |
206 | Example:
207 | >>> from mcp_memory_service.api import close
208 | >>> close() # Cleanup resources
209 | >>> # Next get_storage() call will create new connection
210 | """
211 | global _storage_instance
212 |
213 | if _storage_instance is not None:
214 | try:
215 | logger.info("Closing storage instance")
216 | # Simply clear the instance reference
217 | # Async cleanup will happen via atexit or explicit close_async()
218 | except Exception as e:
219 | logger.warning(f"Error closing storage instance: {e}")
220 | finally:
221 | _storage_instance = None
222 |
223 |
224 | async def close_async() -> None:
225 | """
226 | Close and clean up storage resources (async version).
227 |
228 | This is the proper way to close storage backends that have
229 | async cleanup methods. Use this in async contexts.
230 |
231 | Example:
232 | >>> from mcp_memory_service.api import close_async
233 | >>> await close_async() # Proper async cleanup
234 | """
235 | global _storage_instance
236 |
237 | if _storage_instance is not None:
238 | try:
239 | logger.info("Closing storage instance (async)")
240 | # If storage has an async close method, await it
241 | if hasattr(_storage_instance, 'close') and callable(_storage_instance.close):
242 | close_method = _storage_instance.close()
243 | # Check if it's a coroutine
244 | if hasattr(close_method, '__await__'):
245 | await close_method
246 | except Exception as e:
247 | logger.warning(f"Error closing storage instance: {e}")
248 | finally:
249 | _storage_instance = None
250 |
251 |
252 | def reset_storage() -> None:
253 | """
254 | Reset global storage instance.
255 |
256 | Useful for testing or when configuration changes require
257 | reinitializing the storage backend.
258 |
259 | Warning:
260 | This closes the existing connection. Subsequent calls to
261 | get_storage() will create a new instance.
262 |
263 | Example:
264 | >>> reset_storage() # Close current connection
265 | >>> storage = get_storage() # Creates new connection
266 | """
267 | close() # Reuse the close() method for consistency
268 |
269 |
270 | # Cleanup on module exit
271 | import atexit
272 |
273 |
274 | def _cleanup_storage():
275 | """Cleanup storage instance on process exit."""
276 | global _storage_instance
277 | if _storage_instance is not None:
278 | logger.info("Cleaning up storage instance on exit")
279 | _storage_instance = None
280 |
281 |
282 | atexit.register(_cleanup_storage)
283 |
284 |
285 | def set_consolidator(consolidator: "DreamInspiredConsolidator") -> None:
286 | """
287 | Set global consolidator instance (called by HTTP server).
288 |
289 | This allows the API to access the consolidator instance
290 | that's managed by the HTTP server lifecycle.
291 |
292 | Args:
293 | consolidator: DreamInspiredConsolidator instance
294 | """
295 | global _consolidator_instance
296 | _consolidator_instance = consolidator
297 | logger.info("Global consolidator instance set")
298 |
299 |
300 | def set_scheduler(scheduler: "ConsolidationScheduler") -> None:
301 | """
302 | Set global scheduler instance (called by HTTP server).
303 |
304 | This allows the API to access the scheduler instance
305 | that's managed by the HTTP server lifecycle.
306 |
307 | Args:
308 | scheduler: ConsolidationScheduler instance
309 | """
310 | global _scheduler_instance
311 | _scheduler_instance = scheduler
312 | logger.info("Global scheduler instance set")
313 |
314 |
315 | def get_consolidator() -> Optional["DreamInspiredConsolidator"]:
316 | """
317 | Get global consolidator instance.
318 |
319 | Returns:
320 | DreamInspiredConsolidator instance or None if not set
321 | """
322 | return _consolidator_instance
323 |
324 |
325 | def get_scheduler() -> Optional["ConsolidationScheduler"]:
326 | """
327 | Get global scheduler instance.
328 |
329 | Returns:
330 | ConsolidationScheduler instance or None if not set
331 | """
332 | return _scheduler_instance
333 |
```
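The module above manages a process-wide storage singleton with lazy initialization. Below is a minimal async usage sketch, assuming only what the module's own docstrings show (`get_storage_async()`, `close_async()`, and an awaitable `storage.retrieve(query, n_results=...)`); the query string is illustrative, and the exact shape of the returned results is not defined in this file.

```python
# Usage sketch for the code execution API client above (async context only;
# get_storage() is the synchronous entry point and must not be called from async code).
import asyncio

from mcp_memory_service.api.client import get_storage_async, close_async


async def main() -> None:
    storage = await get_storage_async()   # first call initializes, later calls reuse the connection
    results = await storage.retrieve("database backend decision", n_results=5)
    for memory in results:
        print(memory)
    await close_async()                   # release the connection on shutdown


if __name__ == "__main__":
    asyncio.run(main())
```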
--------------------------------------------------------------------------------
/claude-hooks/memory-mode-controller.js:
--------------------------------------------------------------------------------
```javascript
1 | #!/usr/bin/env node
2 |
3 | /**
4 | * Memory Mode Controller
5 | * Command-line utility for managing memory hook performance profiles
6 | */
7 |
8 | const fs = require('fs').promises;
9 | const path = require('path');
10 |
11 | class MemoryModeController {
12 | constructor(configPath = null) {
13 | this.configPath = configPath || path.join(__dirname, 'config.json');
14 | }
15 |
16 | /**
17 | * Load current configuration
18 | */
19 | async loadConfig() {
20 | try {
21 | const configData = await fs.readFile(this.configPath, 'utf8');
22 | return JSON.parse(configData);
23 | } catch (error) {
24 | throw new Error(`Failed to load config: ${error.message}`);
25 | }
26 | }
27 |
28 | /**
29 | * Save configuration
30 | */
31 | async saveConfig(config) {
32 | try {
33 | await fs.writeFile(this.configPath, JSON.stringify(config, null, 2));
34 | } catch (error) {
35 | throw new Error(`Failed to save config: ${error.message}`);
36 | }
37 | }
38 |
39 | /**
40 | * Switch to a performance profile
41 | */
42 | async switchProfile(profileName) {
43 | const config = await this.loadConfig();
44 |
45 |         if (!config.performance?.profiles?.[profileName]) {
46 | throw new Error(`Unknown profile: ${profileName}. Available profiles: ${Object.keys(config.performance?.profiles || {}).join(', ')}`);
47 | }
48 |
49 | config.performance.defaultProfile = profileName;
50 | await this.saveConfig(config);
51 |
52 | const profile = config.performance.profiles[profileName];
53 | console.log(`✅ Switched to profile: ${profileName}`);
54 | console.log(`📊 Description: ${profile.description}`);
55 | console.log(`⚡ Max Latency: ${profile.maxLatency || 'adaptive'}ms`);
56 | console.log(`🎯 Enabled Tiers: ${profile.enabledTiers?.join(', ') || 'adaptive'}`);
57 | console.log(`🔄 Background Processing: ${profile.backgroundProcessing ? 'enabled' : 'disabled'}`);
58 |
59 | return profile;
60 | }
61 |
62 | /**
63 | * Get current status
64 | */
65 | async getStatus() {
66 | const config = await this.loadConfig();
67 | const currentProfile = config.performance?.defaultProfile || 'balanced';
68 | const profile = config.performance?.profiles[currentProfile];
69 |
70 | console.log('📊 Memory Hook Status');
71 | console.log('═'.repeat(50));
72 | console.log(`Current Profile: ${currentProfile}`);
73 | console.log(`Description: ${profile?.description || 'No description'}`);
74 | console.log(`Natural Triggers: ${config.naturalTriggers?.enabled ? 'enabled' : 'disabled'}`);
75 | console.log(`Sensitivity: ${config.patternDetector?.sensitivity || 0.7}`);
76 | console.log(`Trigger Threshold: ${config.naturalTriggers?.triggerThreshold || 0.6}`);
77 | console.log(`Cooldown Period: ${(config.naturalTriggers?.cooldownPeriod || 30000) / 1000}s`);
78 |
79 | if (profile) {
80 | console.log('\n🎯 Performance Settings');
81 | console.log('─'.repeat(30));
82 | console.log(`Max Latency: ${profile.maxLatency || 'adaptive'}ms`);
83 | console.log(`Enabled Tiers: ${profile.enabledTiers?.join(', ') || 'adaptive'}`);
84 | console.log(`Background Processing: ${profile.backgroundProcessing ? 'enabled' : 'disabled'}`);
85 | console.log(`Degrade Threshold: ${profile.degradeThreshold || 'adaptive'}ms`);
86 | }
87 |
88 | console.log('\n🔧 Available Profiles');
89 | console.log('─'.repeat(30));
90 | for (const [name, prof] of Object.entries(config.performance?.profiles || {})) {
91 | const current = name === currentProfile ? ' (current)' : '';
92 | console.log(`${name}${current}: ${prof.description}`);
93 | }
94 |
95 | return {
96 | currentProfile,
97 | config: config.performance,
98 | naturalTriggers: config.naturalTriggers
99 | };
100 | }
101 |
102 | /**
103 | * Update sensitivity
104 | */
105 | async updateSensitivity(sensitivity) {
106 | const config = await this.loadConfig();
107 |
108 | if (sensitivity < 0 || sensitivity > 1) {
109 | throw new Error('Sensitivity must be between 0 and 1');
110 | }
111 |
112 | if (!config.patternDetector) {
113 | config.patternDetector = {};
114 | }
115 |
116 | config.patternDetector.sensitivity = sensitivity;
117 | await this.saveConfig(config);
118 |
119 | console.log(`✅ Updated sensitivity to ${sensitivity}`);
120 | return sensitivity;
121 | }
122 |
123 | /**
124 | * Update trigger threshold
125 | */
126 | async updateThreshold(threshold) {
127 | const config = await this.loadConfig();
128 |
129 | if (threshold < 0 || threshold > 1) {
130 | throw new Error('Threshold must be between 0 and 1');
131 | }
132 |
133 | if (!config.naturalTriggers) {
134 | config.naturalTriggers = {};
135 | }
136 |
137 | config.naturalTriggers.triggerThreshold = threshold;
138 | await this.saveConfig(config);
139 |
140 | console.log(`✅ Updated trigger threshold to ${threshold}`);
141 | return threshold;
142 | }
143 |
144 | /**
145 | * Enable or disable natural triggers
146 | */
147 | async toggleNaturalTriggers(enabled = null) {
148 | const config = await this.loadConfig();
149 |
150 | if (!config.naturalTriggers) {
151 | config.naturalTriggers = {};
152 | }
153 |
154 | if (enabled === null) {
155 | enabled = !config.naturalTriggers.enabled;
156 | }
157 |
158 | config.naturalTriggers.enabled = enabled;
159 | await this.saveConfig(config);
160 |
161 | console.log(`✅ Natural triggers ${enabled ? 'enabled' : 'disabled'}`);
162 | return enabled;
163 | }
164 |
165 | /**
166 | * Reset to default configuration
167 | */
168 | async resetToDefaults() {
169 | const config = await this.loadConfig();
170 |
171 | config.performance.defaultProfile = 'balanced';
172 | config.naturalTriggers = {
173 | enabled: true,
174 | triggerThreshold: 0.6,
175 | cooldownPeriod: 30000,
176 | maxMemoriesPerTrigger: 5
177 | };
178 |
179 | // Pattern detector defaults
180 | if (!config.patternDetector) {
181 | config.patternDetector = {};
182 | }
183 | config.patternDetector.sensitivity = 0.7;
184 | config.patternDetector.adaptiveLearning = true;
185 |
186 | await this.saveConfig(config);
187 | console.log('✅ Reset to default configuration');
188 | return config;
189 | }
190 |
191 | /**
192 | * Get performance profiles information
193 | */
194 | async listProfiles() {
195 | const config = await this.loadConfig();
196 | const profiles = config.performance?.profiles || {};
197 |
198 | console.log('📋 Available Performance Profiles');
199 | console.log('═'.repeat(60));
200 |
201 | for (const [name, profile] of Object.entries(profiles)) {
202 | const current = name === config.performance?.defaultProfile ? ' ⭐' : '';
203 | console.log(`\n${name}${current}`);
204 | console.log(` Description: ${profile.description}`);
205 | console.log(` Max Latency: ${profile.maxLatency || 'adaptive'}ms`);
206 | console.log(` Enabled Tiers: ${profile.enabledTiers?.join(', ') || 'adaptive'}`);
207 | console.log(` Background Processing: ${profile.backgroundProcessing ? 'yes' : 'no'}`);
208 | }
209 |
210 | return profiles;
211 | }
212 | }
213 |
214 | /**
215 | * Command-line interface
216 | */
217 | async function main() {
218 | const args = process.argv.slice(2);
219 | const controller = new MemoryModeController();
220 |
221 | try {
222 | if (args.length === 0 || args[0] === 'status') {
223 | await controller.getStatus();
224 | return;
225 | }
226 |
227 | const command = args[0];
228 |
229 | switch (command) {
230 | case 'switch':
231 | case 'profile':
232 | if (!args[1]) {
233 | console.error('❌ Please specify a profile name');
234 | console.log('Available profiles: speed_focused, balanced, memory_aware, adaptive');
235 | process.exit(1);
236 | }
237 | await controller.switchProfile(args[1]);
238 | break;
239 |
240 | case 'sensitivity':
241 | if (!args[1]) {
242 | console.error('❌ Please specify sensitivity value (0-1)');
243 | process.exit(1);
244 | }
245 | const sensitivity = parseFloat(args[1]);
246 | await controller.updateSensitivity(sensitivity);
247 | break;
248 |
249 | case 'threshold':
250 | if (!args[1]) {
251 | console.error('❌ Please specify threshold value (0-1)');
252 | process.exit(1);
253 | }
254 | const threshold = parseFloat(args[1]);
255 | await controller.updateThreshold(threshold);
256 | break;
257 |
258 | case 'enable':
259 | await controller.toggleNaturalTriggers(true);
260 | break;
261 |
262 | case 'disable':
263 | await controller.toggleNaturalTriggers(false);
264 | break;
265 |
266 | case 'toggle':
267 | await controller.toggleNaturalTriggers();
268 | break;
269 |
270 | case 'reset':
271 | await controller.resetToDefaults();
272 | break;
273 |
274 | case 'list':
275 | case 'profiles':
276 | await controller.listProfiles();
277 | break;
278 |
279 | case 'help':
280 | case '-h':
281 | case '--help':
282 | showHelp();
283 | break;
284 |
285 | default:
286 | console.error(`❌ Unknown command: ${command}`);
287 | showHelp();
288 | process.exit(1);
289 | }
290 |
291 | } catch (error) {
292 | console.error(`❌ Error: ${error.message}`);
293 | process.exit(1);
294 | }
295 | }
296 |
297 | function showHelp() {
298 | console.log(`
299 | 🧠 Memory Mode Controller
300 |
301 | Usage: node memory-mode-controller.js <command> [options]
302 |
303 | Commands:
304 | status Show current configuration and status
305 | profile <name> Switch to performance profile
306 | sensitivity <0-1> Set pattern detection sensitivity
307 | threshold <0-1> Set trigger threshold
308 | enable Enable natural triggers
309 | disable Disable natural triggers
310 | toggle Toggle natural triggers on/off
311 | reset Reset to default configuration
312 | list List available profiles
313 | help Show this help message
314 |
315 | Performance Profiles:
316 | speed_focused Fastest response, minimal memory (< 100ms)
317 | balanced Moderate latency, smart triggers (< 200ms)
318 | memory_aware Full awareness, accept latency (< 500ms)
319 | adaptive Auto-adjust based on usage patterns
320 |
321 | Examples:
322 | node memory-mode-controller.js status
323 | node memory-mode-controller.js profile balanced
324 | node memory-mode-controller.js sensitivity 0.8
325 | node memory-mode-controller.js disable
326 | `);
327 | }
328 |
329 | // Run if called directly
330 | if (require.main === module) {
331 | main().catch(error => {
332 | console.error('❌ Fatal error:', error.message);
333 | process.exit(1);
334 | });
335 | }
336 |
337 | module.exports = { MemoryModeController };
```
--------------------------------------------------------------------------------
/tests/unit/test_semtools_loader.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Unit tests for Semtools document loader.
4 | """
5 |
6 | import pytest
7 | import asyncio
8 | from pathlib import Path
9 | from unittest.mock import Mock, patch, AsyncMock, MagicMock
10 | import shutil
11 |
12 | from mcp_memory_service.ingestion.semtools_loader import SemtoolsLoader
13 | from mcp_memory_service.ingestion.base import DocumentChunk
14 |
15 |
16 | class TestSemtoolsLoader:
17 | """Test suite for SemtoolsLoader class."""
18 |
19 | def test_initialization(self):
20 | """Test basic initialization of SemtoolsLoader."""
21 | loader = SemtoolsLoader(chunk_size=500, chunk_overlap=50)
22 |
23 | assert loader.chunk_size == 500
24 | assert loader.chunk_overlap == 50
25 | assert 'pdf' in loader.supported_extensions
26 | assert 'docx' in loader.supported_extensions
27 | assert 'pptx' in loader.supported_extensions
28 |
29 | @patch('shutil.which')
30 | def test_semtools_availability_check(self, mock_which):
31 | """Test detection of semtools availability."""
32 | # Test when semtools is available
33 | mock_which.return_value = '/usr/local/bin/semtools'
34 | loader = SemtoolsLoader()
35 | assert loader._semtools_available is True
36 |
37 | # Test when semtools is not available
38 | mock_which.return_value = None
39 | loader = SemtoolsLoader()
40 | assert loader._semtools_available is False
41 |
42 | @patch('shutil.which')
43 | def test_can_handle_file(self, mock_which):
44 | """Test file format detection."""
45 | mock_which.return_value = '/usr/local/bin/semtools'
46 | loader = SemtoolsLoader()
47 |
48 | # Create temporary test files
49 | import tempfile
50 | with tempfile.TemporaryDirectory() as tmpdir:
51 | pdf_file = Path(tmpdir) / "test.pdf"
52 | pdf_file.touch()
53 |
54 | docx_file = Path(tmpdir) / "test.docx"
55 | docx_file.touch()
56 |
57 | txt_file = Path(tmpdir) / "test.txt"
58 | txt_file.touch()
59 |
60 | # Test supported formats
61 | assert loader.can_handle(pdf_file) is True
62 | assert loader.can_handle(docx_file) is True
63 |
64 | # Test unsupported formats
65 | assert loader.can_handle(txt_file) is False
66 |
67 | @patch('shutil.which')
68 | def test_can_handle_returns_false_when_semtools_unavailable(self, mock_which):
69 | """Test that can_handle returns False when semtools is not installed."""
70 | mock_which.return_value = None
71 | loader = SemtoolsLoader()
72 |
73 | import tempfile
74 | with tempfile.TemporaryDirectory() as tmpdir:
75 | pdf_file = Path(tmpdir) / "test.pdf"
76 | pdf_file.touch()
77 |
78 | # Should return False even for supported format when semtools unavailable
79 | assert loader.can_handle(pdf_file) is False
80 |
81 | @pytest.mark.asyncio
82 | @patch('shutil.which')
83 | async def test_extract_chunks_semtools_unavailable(self, mock_which):
84 | """Test that extract_chunks raises error when semtools is unavailable."""
85 | mock_which.return_value = None
86 | loader = SemtoolsLoader()
87 |
88 | import tempfile
89 | with tempfile.TemporaryDirectory() as tmpdir:
90 | pdf_file = Path(tmpdir) / "test.pdf"
91 | pdf_file.write_text("dummy content")
92 |
93 | # When semtools is unavailable, validate_file will fail first
94 | with pytest.raises(ValueError, match="File format not supported"):
95 | async for chunk in loader.extract_chunks(pdf_file):
96 | pass
97 |
98 | @pytest.mark.asyncio
99 | @patch('mcp_memory_service.ingestion.semtools_loader.SemtoolsLoader._check_semtools_availability')
100 | @patch('asyncio.create_subprocess_exec')
101 | async def test_extract_chunks_success(self, mock_subprocess, mock_check_semtools):
102 | """Test successful document extraction with semtools."""
103 | # Force semtools to be "available"
104 | mock_check_semtools.return_value = True
105 |
106 | # Mock successful semtools execution with sufficient content to create chunks
107 | mock_content = b"# Document Title\n\n" + b"This is a test document with enough content to create chunks. " * 10
108 | mock_process = AsyncMock()
109 | mock_process.returncode = 0
110 | mock_process.communicate = AsyncMock(
111 | return_value=(mock_content, b"")
112 | )
113 | mock_subprocess.return_value = mock_process
114 |
115 | loader = SemtoolsLoader(chunk_size=200, chunk_overlap=50)
116 | loader._semtools_available = True # Override
117 |
118 | import tempfile
119 | with tempfile.TemporaryDirectory() as tmpdir:
120 | pdf_file = Path(tmpdir) / "test.pdf"
121 | pdf_file.write_text("dummy content")
122 |
123 | chunks = []
124 | async for chunk in loader.extract_chunks(pdf_file):
125 | chunks.append(chunk)
126 |
127 | # Verify chunks were created
128 | assert len(chunks) > 0
129 |
130 | # Verify chunk structure
131 | first_chunk = chunks[0]
132 | assert isinstance(first_chunk, DocumentChunk)
133 | assert isinstance(first_chunk.content, str)
134 | assert first_chunk.metadata['extraction_method'] == 'semtools'
135 | assert first_chunk.metadata['parser_backend'] == 'llamaparse'
136 | assert first_chunk.source_file == pdf_file
137 |
138 | @pytest.mark.asyncio
139 | @patch('mcp_memory_service.ingestion.semtools_loader.SemtoolsLoader._check_semtools_availability')
140 | @patch('asyncio.create_subprocess_exec')
141 | @patch.dict('os.environ', {'LLAMAPARSE_API_KEY': 'test-api-key'})
142 | async def test_extract_chunks_with_api_key(self, mock_subprocess, mock_check_semtools):
143 | """Test that API key is passed to semtools when available."""
144 | mock_check_semtools.return_value = True
145 |
146 | # Mock with sufficient content to create chunks
147 | mock_content = b"# Content\n\n" + b"This document has enough content to create chunks. " * 10
148 | mock_process = AsyncMock()
149 | mock_process.returncode = 0
150 | mock_process.communicate = AsyncMock(
151 | return_value=(mock_content, b"")
152 | )
153 | mock_subprocess.return_value = mock_process
154 |
155 | # Create loader with API key
156 | loader = SemtoolsLoader()
157 | loader._semtools_available = True # Override
158 |
159 | import tempfile
160 | with tempfile.TemporaryDirectory() as tmpdir:
161 | pdf_file = Path(tmpdir) / "test.pdf"
162 | pdf_file.write_text("dummy content")
163 |
164 | chunks = []
165 | async for chunk in loader.extract_chunks(pdf_file):
166 | chunks.append(chunk)
167 |
168 | # Verify chunks were created and API key was recognized
169 | assert len(chunks) > 0
170 | assert chunks[0].metadata['has_api_key'] is True
171 |
172 | @pytest.mark.asyncio
173 | @patch('shutil.which')
174 | @patch('asyncio.create_subprocess_exec')
175 | async def test_extract_chunks_semtools_error(self, mock_subprocess, mock_which):
176 | """Test handling of semtools execution errors."""
177 | mock_which.return_value = '/usr/local/bin/semtools'
178 |
179 | # Mock failed semtools execution
180 | mock_process = AsyncMock()
181 | mock_process.returncode = 1
182 | mock_process.communicate = AsyncMock(
183 | return_value=(b"", b"Error: Failed to parse document")
184 | )
185 | mock_subprocess.return_value = mock_process
186 |
187 | loader = SemtoolsLoader()
188 |
189 | import tempfile
190 | with tempfile.TemporaryDirectory() as tmpdir:
191 | pdf_file = Path(tmpdir) / "test.pdf"
192 | pdf_file.write_text("dummy content")
193 |
194 | with pytest.raises(ValueError, match="Failed to parse document"):
195 | async for chunk in loader.extract_chunks(pdf_file):
196 | pass
197 |
198 | @pytest.mark.asyncio
199 | @patch('shutil.which')
200 | @patch('asyncio.create_subprocess_exec')
201 | @patch('asyncio.wait_for')
202 | async def test_extract_chunks_timeout(self, mock_wait_for, mock_subprocess, mock_which):
203 | """Test handling of semtools timeout."""
204 | mock_which.return_value = '/usr/local/bin/semtools'
205 |
206 | # Mock timeout scenario
207 | mock_wait_for.side_effect = asyncio.TimeoutError()
208 |
209 | loader = SemtoolsLoader()
210 |
211 | import tempfile
212 | with tempfile.TemporaryDirectory() as tmpdir:
213 | pdf_file = Path(tmpdir) / "test.pdf"
214 | pdf_file.write_text("dummy content")
215 |
216 | with pytest.raises(ValueError, match="timed out|Failed to parse"):
217 | async for chunk in loader.extract_chunks(pdf_file):
218 | pass
219 |
220 | @pytest.mark.asyncio
221 | @patch('shutil.which')
222 | @patch('asyncio.create_subprocess_exec')
223 | async def test_extract_chunks_empty_content(self, mock_subprocess, mock_which):
224 | """Test handling of empty content from semtools."""
225 | mock_which.return_value = '/usr/local/bin/semtools'
226 |
227 | # Mock empty output
228 | mock_process = AsyncMock()
229 | mock_process.returncode = 0
230 | mock_process.communicate = AsyncMock(
231 | return_value=(b"", b"") # Empty stdout
232 | )
233 | mock_subprocess.return_value = mock_process
234 |
235 | loader = SemtoolsLoader()
236 |
237 | import tempfile
238 | with tempfile.TemporaryDirectory() as tmpdir:
239 | pdf_file = Path(tmpdir) / "test.pdf"
240 | pdf_file.write_text("dummy content")
241 |
242 | with pytest.raises(ValueError, match="empty content|Failed to parse"):
243 | async for chunk in loader.extract_chunks(pdf_file):
244 | pass
245 |
246 |
247 | class TestSemtoolsLoaderRegistry:
248 | """Test semtools loader registration."""
249 |
250 | @patch('shutil.which')
251 | def test_loader_registration_with_semtools(self, mock_which):
252 | """Test that semtools loader is registered when available."""
253 | mock_which.return_value = '/usr/local/bin/semtools'
254 |
255 | from mcp_memory_service.ingestion.registry import get_loader_for_file
256 |
257 | import tempfile
258 | with tempfile.TemporaryDirectory() as tmpdir:
259 | # Test DOCX file (semtools-only format)
260 | docx_file = Path(tmpdir) / "test.docx"
261 | docx_file.touch()
262 |
263 | loader = get_loader_for_file(docx_file)
264 |
265 | # Should get SemtoolsLoader when semtools is available
266 | assert loader is not None
267 | assert isinstance(loader, SemtoolsLoader)
268 |
269 | @patch('shutil.which')
270 | def test_loader_registration_without_semtools(self, mock_which):
271 | """Test that docx files return None when semtools unavailable."""
272 | mock_which.return_value = None
273 |
274 | from mcp_memory_service.ingestion.registry import get_loader_for_file
275 |
276 | import tempfile
277 | with tempfile.TemporaryDirectory() as tmpdir:
278 | # Test DOCX file (semtools-only format)
279 | docx_file = Path(tmpdir) / "test.docx"
280 | docx_file.touch()
281 |
282 | loader = get_loader_for_file(docx_file)
283 |
284 | # Should return None when semtools is not available
285 | assert loader is None
286 |
287 |
288 | if __name__ == '__main__':
289 | pytest.main([__file__, '-v'])
290 |
```
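Outside the test suite, the same loader API the tests exercise can be used directly. A minimal sketch, assuming the `semtools` binary is installed and on PATH (otherwise `can_handle()` returns False, as tested above); the file path and chunk sizes are illustrative:

```python
import asyncio
from pathlib import Path

from mcp_memory_service.ingestion.semtools_loader import SemtoolsLoader

async def ingest_document(path: str) -> None:
    loader = SemtoolsLoader(chunk_size=500, chunk_overlap=50)
    file_path = Path(path)
    if not loader.can_handle(file_path):
        raise ValueError(f"Unsupported format or semtools unavailable: {file_path}")
    async for chunk in loader.extract_chunks(file_path):
        # Each DocumentChunk carries .content, .metadata and .source_file,
        # with metadata fields such as 'extraction_method' (see tests above).
        print(chunk.metadata.get("extraction_method"), len(chunk.content))

if __name__ == "__main__":
    asyncio.run(ingest_document("example.pdf"))  # illustrative path
```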
--------------------------------------------------------------------------------
/scripts/database/db_health_check.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Comprehensive Database Health Check for MCP Memory Service SQLite-vec Backend
4 | """
5 |
6 | import asyncio
7 | import os
8 | import sys
9 | import tempfile
10 | import time
11 | from pathlib import Path
12 |
13 | # Add src to path
14 | sys.path.insert(0, 'src')
15 |
16 | # Set environment for sqlite-vec
17 | os.environ['MCP_MEMORY_STORAGE_BACKEND'] = 'sqlite_vec'
18 |
19 | class HealthChecker:
20 | def __init__(self):
21 | self.results = []
22 | self.errors = []
23 |
24 | async def test(self, name: str, func):
25 | """Run a test and record results."""
26 | print(f"🔍 {name}...")
27 | try:
28 | start_time = time.time()
29 |
30 | if asyncio.iscoroutinefunction(func):
31 | result = await func()
32 | else:
33 | result = func()
34 |
35 | duration = time.time() - start_time
36 |
37 | if result:
38 | print(f" ✅ PASS ({duration:.2f}s)")
39 | self.results.append((name, "PASS", duration))
40 | else:
41 | print(f" ❌ FAIL ({duration:.2f}s)")
42 | self.results.append((name, "FAIL", duration))
43 |
44 | except Exception as e:
45 | duration = time.time() - start_time
46 | print(f" ❌ ERROR ({duration:.2f}s): {str(e)}")
47 | self.results.append((name, "ERROR", duration))
48 | self.errors.append((name, str(e)))
49 |
50 | def test_imports(self):
51 | """Test all necessary imports."""
52 | try:
53 | import sqlite_vec
54 | from src.mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
55 | from src.mcp_memory_service.models.memory import Memory
56 | from src.mcp_memory_service.utils.hashing import generate_content_hash
57 | return True
58 | except ImportError as e:
59 | print(f" Import error: {e}")
60 | return False
61 |
62 | async def test_database_creation(self):
63 | """Test database creation and initialization."""
64 | temp_dir = tempfile.mkdtemp()
65 | db_path = os.path.join(temp_dir, "health_check.db")
66 |
67 | try:
68 | from src.mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
69 | storage = SqliteVecMemoryStorage(db_path)
70 | await storage.initialize()
71 |
72 | # Check tables exist
73 | cursor = storage.conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
74 | tables = [row[0] for row in cursor.fetchall()]
75 |
76 | expected_tables = ['memories', 'memory_embeddings']
77 | for table in expected_tables:
78 | if table not in tables:
79 | print(f" Missing table: {table}")
80 | return False
81 |
82 | storage.close()
83 | os.remove(db_path)
84 | os.rmdir(temp_dir)
85 | return True
86 |
87 | except Exception as e:
88 | print(f" Database creation error: {e}")
89 | return False
90 |
91 | async def test_memory_operations(self):
92 | """Test core memory operations."""
93 | temp_dir = tempfile.mkdtemp()
94 | db_path = os.path.join(temp_dir, "operations_test.db")
95 |
96 | try:
97 | from src.mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
98 | from src.mcp_memory_service.models.memory import Memory
99 | from src.mcp_memory_service.utils.hashing import generate_content_hash
100 |
101 | storage = SqliteVecMemoryStorage(db_path)
102 | await storage.initialize()
103 |
104 | # Test store
105 | content = "Health check test memory"
106 | memory = Memory(
107 | content=content,
108 | content_hash=generate_content_hash(content),
109 | tags=["health", "check"],
110 | memory_type="test"
111 | )
112 |
113 | success, message = await storage.store(memory)
114 | if not success:
115 | print(f" Store failed: {message}")
116 | return False
117 |
118 | # Test retrieve
119 | results = await storage.retrieve("health check", n_results=1)
120 | if not results:
121 | print(" Retrieve failed: No results")
122 | return False
123 |
124 | # Test tag search
125 | tag_results = await storage.search_by_tag(["health"])
126 | if not tag_results:
127 | print(" Tag search failed: No results")
128 | return False
129 |
130 | # Test delete
131 | success, message = await storage.delete(memory.content_hash)
132 | if not success:
133 | print(f" Delete failed: {message}")
134 | return False
135 |
136 | storage.close()
137 | os.remove(db_path)
138 | os.rmdir(temp_dir)
139 | return True
140 |
141 | except Exception as e:
142 | print(f" Memory operations error: {e}")
143 | return False
144 |
145 | async def test_vector_search(self):
146 | """Test vector similarity search functionality."""
147 | temp_dir = tempfile.mkdtemp()
148 | db_path = os.path.join(temp_dir, "vector_test.db")
149 |
150 | try:
151 | from src.mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
152 | from src.mcp_memory_service.models.memory import Memory
153 | from src.mcp_memory_service.utils.hashing import generate_content_hash
154 |
155 | storage = SqliteVecMemoryStorage(db_path)
156 | await storage.initialize()
157 |
158 | # Store multiple memories
159 | test_memories = [
160 | "Python programming is fun",
161 | "JavaScript development tools",
162 | "Database design patterns"
163 | ]
164 |
165 | for content in test_memories:
166 | memory = Memory(
167 | content=content,
168 | content_hash=generate_content_hash(content),
169 | tags=["programming"],
170 | memory_type="test"
171 | )
172 | await storage.store(memory)
173 |
174 | # Test vector search
175 | results = await storage.retrieve("programming languages", n_results=3)
176 | if len(results) != 3:
177 | print(f" Expected 3 results, got {len(results)}")
178 | return False
179 |
180 | # Check relevance scores
181 | for result in results:
182 | if result.relevance_score < 0:
183 | print(f" Invalid relevance score: {result.relevance_score}")
184 | return False
185 |
186 | storage.close()
187 | os.remove(db_path)
188 | os.rmdir(temp_dir)
189 | return True
190 |
191 | except Exception as e:
192 | print(f" Vector search error: {e}")
193 | return False
194 |
195 | def test_environment(self):
196 | """Test environment configuration."""
197 | required_vars = {
198 | 'MCP_MEMORY_STORAGE_BACKEND': 'sqlite_vec'
199 | }
200 |
201 | for var, expected in required_vars.items():
202 | actual = os.getenv(var)
203 | if actual != expected:
204 | print(f" Environment variable {var}: expected '{expected}', got '{actual}'")
205 | return False
206 |
207 | return True
208 |
209 | def test_dependencies(self):
210 | """Test that all dependencies are available."""
211 | try:
212 | import sqlite_vec
213 | version = getattr(sqlite_vec, '__version__', 'unknown')
214 | print(f" sqlite-vec version: {version}")
215 |
216 | import sentence_transformers
217 | print(f" sentence-transformers available")
218 |
219 | import torch
220 | print(f" torch available")
221 |
222 | return True
223 | except ImportError as e:
224 | print(f" Dependency error: {e}")
225 | return False
226 |
227 | def test_database_path(self):
228 | """Test database path accessibility."""
229 | home = str(Path.home())
230 | db_path = os.path.join(home, '.local', 'share', 'mcp-memory', 'sqlite_vec.db')
231 | db_dir = os.path.dirname(db_path)
232 |
233 | try:
234 | # Ensure directory exists
235 | os.makedirs(db_dir, exist_ok=True)
236 |
237 | # Test write permission
238 | test_file = os.path.join(db_dir, '.write_test')
239 | with open(test_file, 'w') as f:
240 | f.write('test')
241 | os.remove(test_file)
242 |
243 | print(f" Database path: {db_path}")
244 | print(f" Directory writable: ✅")
245 | return True
246 |
247 | except Exception as e:
248 | print(f" Database path error: {e}")
249 | return False
250 |
251 | async def main():
252 | """Main health check function."""
253 | print("=" * 60)
254 | print("🏥 MCP Memory Service - SQLite-vec Database Health Check")
255 | print("=" * 60)
256 |
257 | checker = HealthChecker()
258 |
259 | # Run all tests
260 | await checker.test("Environment Configuration", checker.test_environment)
261 | await checker.test("Dependencies Check", checker.test_dependencies)
262 | await checker.test("Import Tests", checker.test_imports)
263 | await checker.test("Database Path", checker.test_database_path)
264 | await checker.test("Database Creation", checker.test_database_creation)
265 | await checker.test("Memory Operations", checker.test_memory_operations)
266 | await checker.test("Vector Search", checker.test_vector_search)
267 |
268 | # Summary
269 | print("\n" + "=" * 60)
270 | print("📊 Health Check Summary")
271 | print("=" * 60)
272 |
273 | passed = sum(1 for _, status, _ in checker.results if status == "PASS")
274 | failed = sum(1 for _, status, _ in checker.results if status in ["FAIL", "ERROR"])
275 | total = len(checker.results)
276 |
277 | for name, status, duration in checker.results:
278 | status_icon = "✅" if status == "PASS" else "❌"
279 | print(f"{status_icon} {name}: {status} ({duration:.2f}s)")
280 |
281 | print(f"\nResults: {passed}/{total} tests passed")
282 |
283 | if checker.errors:
284 | print("\n❌ Errors Found:")
285 | for name, error in checker.errors:
286 | print(f" • {name}: {error}")
287 |
288 | if passed == total:
289 | print("\n🎉 Database Health Check: PASSED")
290 | print(" SQLite-vec backend is fully functional and ready for production use!")
291 | print("\n🚀 Ready for Claude Code integration:")
292 | print(" - Start server: python -m src.mcp_memory_service.server")
293 | print(" - Database: ~/.local/share/mcp-memory/sqlite_vec.db")
294 | print(" - 75% memory reduction vs ChromaDB")
295 | return 0
296 | else:
297 | print("\n💥 Database Health Check: FAILED")
298 | print(" Please resolve the issues above before using the service.")
299 | return 1
300 |
301 | if __name__ == "__main__":
302 | sys.exit(asyncio.run(main()))
```
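Because `HealthChecker.test()` checks `asyncio.iscoroutinefunction()` and falls back to a plain call, extra probes can be added without touching the runner, whether they are sync or async. A small sketch, assuming `HealthChecker` is importable from (or pasted alongside) the script; `check_disk_space` is a hypothetical extra check, not part of the tool:

```python
import shutil

def check_disk_space() -> bool:
    # Plain sync function: HealthChecker.test() calls it directly.
    usage = shutil.disk_usage(".")
    return usage.free > 100 * 1024 * 1024  # require roughly 100 MB free

async def run_extended_checks(checker: "HealthChecker"):
    await checker.test("Disk Space", check_disk_space)               # sync probe
    await checker.test("Vector Search", checker.test_vector_search)  # async probe
    return checker.results
```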
--------------------------------------------------------------------------------
/tests/integration/test_concurrent_clients.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Integration tests for concurrent MCP client access to SQLite-vec backend.
3 | """
4 |
5 | import pytest
6 | import pytest_asyncio
7 | import asyncio
8 | import tempfile
9 | import os
10 | import time
11 | from typing import List
12 |
13 | from mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
14 | from mcp_memory_service.models.memory import Memory
15 | from mcp_memory_service.utils.hashing import generate_content_hash
16 |
17 |
18 | class TestConcurrentClients:
19 | """Test suite for concurrent client access scenarios."""
20 |
21 | @pytest_asyncio.fixture
22 | async def db_path(self):
23 | """Create a temporary database path."""
24 | with tempfile.NamedTemporaryFile(delete=False, suffix=".db") as tmp:
25 | path = tmp.name
26 | yield path
27 | # Cleanup
28 | for ext in ["", "-wal", "-shm"]:
29 | try:
30 | os.unlink(path + ext)
31 | except:
32 | pass
33 |
34 | async def create_client(self, db_path: str, client_id: str) -> SqliteVecMemoryStorage:
35 | """Create a storage client instance."""
36 | storage = SqliteVecMemoryStorage(db_path)
37 | await storage.initialize()
38 | return storage
39 |
40 | async def client_writer(self, db_path: str, client_id: str, num_memories: int) -> List[tuple]:
41 | """Simulate a client writing memories."""
42 | storage = await self.create_client(db_path, client_id)
43 | results = []
44 |
45 | try:
46 | for i in range(num_memories):
47 | content = f"Memory from {client_id} - {i}"
48 | memory = Memory(
49 | content=content,
50 | content_hash=generate_content_hash(content),
51 | tags=[client_id, "concurrent"],
52 | memory_type="test",
53 | metadata={"client_id": client_id, "index": i}
54 | )
55 |
56 | success, msg = await storage.store(memory)
57 | results.append((success, msg, memory.content_hash))
58 |
59 | # Small delay to simulate real-world timing
60 | await asyncio.sleep(0.01)
61 | finally:
62 | storage.close()
63 |
64 | return results
65 |
66 | async def client_reader(self, db_path: str, client_id: str, num_reads: int) -> List[int]:
67 | """Simulate a client reading memories."""
68 | storage = await self.create_client(db_path, client_id)
69 | counts = []
70 |
71 | try:
72 | for i in range(num_reads):
73 | # Get all memories with tag "concurrent"
74 | memories = await storage.search_by_tag(["concurrent"])
75 | counts.append(len(memories))
76 |
77 | # Small delay between reads
78 | await asyncio.sleep(0.02)
79 | finally:
80 | storage.close()
81 |
82 | return counts
83 |
84 | @pytest.mark.asyncio
85 | async def test_two_clients_concurrent_write(self, db_path):
86 | """Test two clients writing memories concurrently."""
87 | # Run two clients concurrently
88 | results = await asyncio.gather(
89 | self.client_writer(db_path, "client1", 10),
90 | self.client_writer(db_path, "client2", 10),
91 | return_exceptions=True
92 | )
93 |
94 | # Check results
95 | assert len(results) == 2
96 | assert not isinstance(results[0], Exception), f"Client 1 failed: {results[0]}"
97 | assert not isinstance(results[1], Exception), f"Client 2 failed: {results[1]}"
98 |
99 | client1_results, client2_results = results
100 |
101 | # Count successful writes
102 | client1_success = sum(1 for success, _, _ in client1_results if success)
103 | client2_success = sum(1 for success, _, _ in client2_results if success)
104 |
105 | # Both clients should have written their memories
106 | assert client1_success == 10, f"Client 1 only wrote {client1_success}/10 memories"
107 | assert client2_success == 10, f"Client 2 only wrote {client2_success}/10 memories"
108 |
109 | # Verify total memories in database
110 | storage = await self.create_client(db_path, "verifier")
111 | try:
112 | all_memories = await storage.search_by_tag(["concurrent"])
113 | assert len(all_memories) == 20, f"Expected 20 memories, found {len(all_memories)}"
114 | finally:
115 | storage.close()
116 |
117 | @pytest.mark.asyncio
118 | async def test_reader_writer_concurrent(self, db_path):
119 | """Test one client reading while another writes."""
120 | # Start with some initial data
121 | initial_storage = await self.create_client(db_path, "initial")
122 | for i in range(5):
123 | content = f"Initial memory {i}"
124 | memory = Memory(
125 | content=content,
126 | content_hash=generate_content_hash(content),
127 | tags=["concurrent", "initial"],
128 | memory_type="test"
129 | )
130 | await initial_storage.store(memory)
131 | initial_storage.close()
132 |
133 | # Run reader and writer concurrently
134 | results = await asyncio.gather(
135 | self.client_reader(db_path, "reader", 10),
136 | self.client_writer(db_path, "writer", 5),
137 | return_exceptions=True
138 | )
139 |
140 | assert not isinstance(results[0], Exception), f"Reader failed: {results[0]}"
141 | assert not isinstance(results[1], Exception), f"Writer failed: {results[1]}"
142 |
143 | read_counts, write_results = results
144 |
145 | # Reader should see increasing counts as writer adds memories
146 | assert read_counts[0] >= 5, "Reader should see initial memories"
147 | assert read_counts[-1] >= read_counts[0], "Read count should not decrease"
148 |
149 | # Writer should succeed
150 | write_success = sum(1 for success, _, _ in write_results if success)
151 | assert write_success == 5, f"Writer only wrote {write_success}/5 memories"
152 |
153 | @pytest.mark.asyncio
154 | async def test_multiple_readers_one_writer(self, db_path):
155 | """Test multiple readers accessing while one writer updates."""
156 | # Create writer and multiple readers
157 | async def run_scenario():
158 | tasks = [
159 | self.client_writer(db_path, "writer", 10),
160 | self.client_reader(db_path, "reader1", 5),
161 | self.client_reader(db_path, "reader2", 5),
162 | self.client_reader(db_path, "reader3", 5),
163 | ]
164 | return await asyncio.gather(*tasks, return_exceptions=True)
165 |
166 | results = await run_scenario()
167 |
168 | # Check all operations completed without exceptions
169 | for i, result in enumerate(results):
170 | assert not isinstance(result, Exception), f"Task {i} failed: {result}"
171 |
172 | write_results = results[0]
173 | write_success = sum(1 for success, _, _ in write_results if success)
174 | assert write_success == 10, f"Writer only wrote {write_success}/10 memories"
175 |
176 | # All readers should have successfully read
177 | for reader_counts in results[1:]:
178 | assert len(reader_counts) == 5, "Reader should complete all reads"
179 | assert all(isinstance(count, int) for count in reader_counts)
180 |
181 | @pytest.mark.asyncio
182 | async def test_rapid_concurrent_access(self, db_path):
183 | """Test rapid concurrent access with minimal delays."""
184 | async def rapid_writer(client_id: str):
185 | storage = await self.create_client(db_path, client_id)
186 | try:
187 | for i in range(20):
188 | content = f"Rapid {client_id}-{i}"
189 | memory = Memory(
190 | content=content,
191 | content_hash=generate_content_hash(content),
192 | tags=["rapid"],
193 | memory_type="test"
194 | )
195 | await storage.store(memory)
196 | # No delay - rapid fire
197 | finally:
198 | storage.close()
199 |
200 | async def rapid_reader(client_id: str):
201 | storage = await self.create_client(db_path, client_id)
202 | try:
203 | for i in range(20):
204 | await storage.search_by_tag(["rapid"])
205 | # No delay - rapid fire
206 | finally:
207 | storage.close()
208 |
209 | # Run multiple clients with rapid access
210 | tasks = [
211 | rapid_writer("w1"),
212 | rapid_writer("w2"),
213 | rapid_reader("r1"),
214 | rapid_reader("r2"),
215 | ]
216 |
217 | results = await asyncio.gather(*tasks, return_exceptions=True)
218 |
219 | # Check no exceptions occurred
220 | exceptions = [r for r in results if isinstance(r, Exception)]
221 | assert len(exceptions) == 0, f"Exceptions occurred: {exceptions}"
222 |
223 | @pytest.mark.asyncio
224 | async def test_database_consistency(self, db_path):
225 | """Test that database remains consistent under concurrent access."""
226 | # Write unique memories from multiple clients
227 | async def write_unique_memories(client_id: str, start_idx: int):
228 | storage = await self.create_client(db_path, client_id)
229 | hashes = []
230 | try:
231 | for i in range(10):
232 | content = f"Unique memory {start_idx + i}"
233 | memory = Memory(
234 | content=content,
235 | content_hash=generate_content_hash(content),
236 | tags=["consistency", client_id],
237 | memory_type="test"
238 | )
239 | success, msg = await storage.store(memory)
240 | if success:
241 | hashes.append(memory.content_hash)
242 | finally:
243 | storage.close()
244 | return hashes
245 |
246 | # Run concurrent writes
247 | results = await asyncio.gather(
248 | write_unique_memories("client1", 0),
249 | write_unique_memories("client2", 100),
250 | write_unique_memories("client3", 200),
251 | )
252 |
253 | all_hashes = []
254 | for client_hashes in results:
255 | all_hashes.extend(client_hashes)
256 |
257 | # Verify all memories are in database and no duplicates
258 | storage = await self.create_client(db_path, "verifier")
259 | try:
260 | memories = await storage.search_by_tag(["consistency"])
261 |
262 | # Check count matches
263 | assert len(memories) == len(all_hashes), f"Memory count mismatch: {len(memories)} vs {len(all_hashes)}"
264 |
265 | # Check no duplicates
266 | db_hashes = {m.content_hash for m in memories}
267 | assert len(db_hashes) == len(memories), "Duplicate memories found"
268 |
269 | # Check all written memories are present
270 | for hash_val in all_hashes:
271 | assert hash_val in db_hashes, f"Memory {hash_val} not found in database"
272 | finally:
273 | storage.close()
274 |
275 |
276 | if __name__ == "__main__":
277 | pytest.main([__file__, "-v"])
```
--------------------------------------------------------------------------------
/scripts/maintenance/repair_sqlite_vec_embeddings.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Repair script to fix existing SQLite-vec databases without full migration.
4 |
5 | This script attempts to repair the database in-place by:
6 | 1. Checking the current state
7 | 2. Regenerating missing embeddings
8 | 3. Fixing dimension mismatches if possible
9 | """
10 |
11 | import asyncio
12 | import os
13 | import sys
14 | import sqlite3
15 | import logging
16 | from typing import List, Tuple
17 |
18 | # Add parent directory to path
19 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20 |
21 | try:
22 | import sqlite_vec
23 | from sqlite_vec import serialize_float32
24 | SQLITE_VEC_AVAILABLE = True
25 | except ImportError:
26 | SQLITE_VEC_AVAILABLE = False
27 |
28 | try:
29 | from sentence_transformers import SentenceTransformer
30 | SENTENCE_TRANSFORMERS_AVAILABLE = True
31 | except ImportError:
32 | SENTENCE_TRANSFORMERS_AVAILABLE = False
33 |
34 | # Configure logging
35 | logging.basicConfig(
36 | level=logging.INFO,
37 | format='%(asctime)s - %(levelname)s - %(message)s'
38 | )
39 | logger = logging.getLogger(__name__)
40 |
41 |
42 | class SqliteVecRepair:
43 | """Repair SQLite-vec database embeddings."""
44 |
45 | def __init__(self, db_path: str, model_name: str = "all-MiniLM-L6-v2"):
46 | self.db_path = db_path
47 | self.model_name = model_name
48 | self.conn = None
49 | self.model = None
50 | self.embedding_dimension = 384 # Default for all-MiniLM-L6-v2
51 |
52 | def check_dependencies(self):
53 | """Check required dependencies."""
54 | print("Checking dependencies...")
55 |
56 | if not SQLITE_VEC_AVAILABLE:
57 | print("❌ sqlite-vec not installed. Run: pip install sqlite-vec")
58 | return False
59 |
60 | if not SENTENCE_TRANSFORMERS_AVAILABLE:
61 | print("❌ sentence-transformers not installed. Run: pip install sentence-transformers torch")
62 | return False
63 |
64 | print("✅ All dependencies available")
65 | return True
66 |
67 | def connect_database(self):
68 | """Connect to the database."""
69 | print(f"\nConnecting to database: {self.db_path}")
70 |
71 | if not os.path.exists(self.db_path):
72 | raise FileNotFoundError(f"Database not found: {self.db_path}")
73 |
74 | self.conn = sqlite3.connect(self.db_path)
75 | self.conn.enable_load_extension(True)
76 | sqlite_vec.load(self.conn)
77 | self.conn.enable_load_extension(False)
78 |
79 | print("✅ Connected to database")
80 |
81 | def analyze_database(self) -> dict:
82 | """Analyze the current state of the database."""
83 | print("\nAnalyzing database...")
84 |
85 | analysis = {
86 | "memory_count": 0,
87 | "embedding_count": 0,
88 | "missing_embeddings": 0,
89 | "embedding_dimension": None,
90 | "issues": []
91 | }
92 |
93 | # Count memories
94 | cursor = self.conn.execute("SELECT COUNT(*) FROM memories")
95 | analysis["memory_count"] = cursor.fetchone()[0]
96 |
97 | # Count embeddings
98 | cursor = self.conn.execute("SELECT COUNT(*) FROM memory_embeddings")
99 | analysis["embedding_count"] = cursor.fetchone()[0]
100 |
101 | # Check embedding dimension
102 | cursor = self.conn.execute("""
103 | SELECT sql FROM sqlite_master
104 | WHERE type='table' AND name='memory_embeddings'
105 | """)
106 | schema = cursor.fetchone()
107 | if schema:
108 | # Extract dimension from schema
109 | import re
110 | match = re.search(r'FLOAT\[(\d+)\]', schema[0])
111 | if match:
112 | analysis["embedding_dimension"] = int(match.group(1))
113 |
114 | # Find memories without embeddings
115 | cursor = self.conn.execute("""
116 | SELECT COUNT(*) FROM memories m
117 | WHERE NOT EXISTS (
118 | SELECT 1 FROM memory_embeddings e WHERE e.rowid = m.id
119 | )
120 | """)
121 | analysis["missing_embeddings"] = cursor.fetchone()[0]
122 |
123 | # Identify issues
124 | if analysis["memory_count"] != analysis["embedding_count"]:
125 | analysis["issues"].append(
126 | f"Mismatch: {analysis['memory_count']} memories vs {analysis['embedding_count']} embeddings"
127 | )
128 |
129 | if analysis["missing_embeddings"] > 0:
130 | analysis["issues"].append(
131 | f"Missing embeddings: {analysis['missing_embeddings']} memories have no embeddings"
132 | )
133 |
134 | print(f" Memories: {analysis['memory_count']}")
135 | print(f" Embeddings: {analysis['embedding_count']}")
136 | print(f" Missing embeddings: {analysis['missing_embeddings']}")
137 | print(f" Embedding dimension: {analysis['embedding_dimension']}")
138 |
139 | if analysis["issues"]:
140 | print("\n⚠️ Issues found:")
141 | for issue in analysis["issues"]:
142 | print(f" - {issue}")
143 | else:
144 | print("\n✅ No issues found")
145 |
146 | return analysis
147 |
148 | def load_model(self):
149 | """Load the embedding model."""
150 | print(f"\nLoading embedding model: {self.model_name}")
151 |
152 | self.model = SentenceTransformer(self.model_name)
153 |
154 | # Get actual dimension
155 | test_embedding = self.model.encode(["test"], convert_to_numpy=True)
156 | self.embedding_dimension = test_embedding.shape[1]
157 |
158 | print(f"✅ Model loaded (dimension: {self.embedding_dimension})")
159 |
160 | def generate_missing_embeddings(self, analysis: dict) -> int:
161 | """Generate embeddings for memories that don't have them."""
162 | if analysis["missing_embeddings"] == 0:
163 | return 0
164 |
165 | print(f"\nGenerating {analysis['missing_embeddings']} missing embeddings...")
166 |
167 | # Check if dimensions match
168 | if analysis["embedding_dimension"] and analysis["embedding_dimension"] != self.embedding_dimension:
169 | print(f"⚠️ WARNING: Database expects dimension {analysis['embedding_dimension']}, "
170 | f"but model produces dimension {self.embedding_dimension}")
171 | print(" This may cause errors. Consider full migration instead.")
172 | response = input("\nContinue anyway? (y/N): ").strip().lower()
173 | if response != 'y':
174 | return 0
175 |
176 | # Find memories without embeddings
177 | cursor = self.conn.execute("""
178 | SELECT m.id, m.content FROM memories m
179 | WHERE NOT EXISTS (
180 | SELECT 1 FROM memory_embeddings e WHERE e.rowid = m.id
181 | )
182 | """)
183 |
184 | memories_to_fix = cursor.fetchall()
185 | fixed_count = 0
186 |
187 | for memory_id, content in memories_to_fix:
188 | try:
189 | # Generate embedding
190 | embedding = self.model.encode([content], convert_to_numpy=True)[0]
191 |
192 | # Insert embedding
193 | self.conn.execute(
194 | "INSERT INTO memory_embeddings (rowid, content_embedding) VALUES (?, ?)",
195 | (memory_id, serialize_float32(embedding))
196 | )
197 |
198 | fixed_count += 1
199 |
200 | # Show progress
201 | if fixed_count % 10 == 0:
202 | print(f" ... {fixed_count}/{len(memories_to_fix)} embeddings generated")
203 |
204 | except Exception as e:
205 | logger.error(f"Failed to generate embedding for memory {memory_id}: {e}")
206 |
207 | self.conn.commit()
208 | print(f"✅ Generated {fixed_count} embeddings")
209 |
210 | return fixed_count
211 |
212 | def verify_search(self) -> bool:
213 | """Test if semantic search works."""
214 | print("\nTesting semantic search...")
215 |
216 | try:
217 | # Generate a test query embedding
218 | test_query = "test query"
219 | query_embedding = self.model.encode([test_query], convert_to_numpy=True)[0]
220 |
221 | # Try to search
222 | cursor = self.conn.execute("""
223 | SELECT COUNT(*) FROM memory_embeddings
224 | WHERE content_embedding MATCH ?
225 | LIMIT 1
226 | """, (serialize_float32(query_embedding),))
227 |
228 | cursor.fetchone()
229 | print("✅ Semantic search is working")
230 | return True
231 |
232 | except Exception as e:
233 | print(f"❌ Semantic search failed: {e}")
234 | return False
235 |
236 | def run_repair(self):
237 | """Run the repair process."""
238 | print("\n" + "="*60)
239 | print("SQLite-vec Embedding Repair Tool")
240 | print("="*60)
241 |
242 | try:
243 | # Check dependencies
244 | if not self.check_dependencies():
245 | return
246 |
247 | # Connect to database
248 | self.connect_database()
249 |
250 | # Analyze current state
251 | analysis = self.analyze_database()
252 |
253 | if not analysis["issues"]:
254 | print("\n✅ Database appears healthy, no repair needed")
255 | return
256 |
257 | # Load model
258 | self.load_model()
259 |
260 | # Fix missing embeddings
261 | fixed = self.generate_missing_embeddings(analysis)
262 |
263 | # Verify search works
264 | self.verify_search()
265 |
266 | # Re-analyze
267 | print("\nRe-analyzing database after repair...")
268 | new_analysis = self.analyze_database()
269 |
270 | print("\n" + "="*60)
271 | print("Repair Summary")
272 | print("="*60)
273 | print(f"Fixed {fixed} missing embeddings")
274 |
275 | if new_analysis["issues"]:
276 | print("\n⚠️ Some issues remain:")
277 | for issue in new_analysis["issues"]:
278 | print(f" - {issue}")
279 | print("\nConsider running the full migration script instead.")
280 | else:
281 | print("\n✅ All issues resolved!")
282 |
283 | except Exception as e:
284 | print(f"\n❌ Repair failed: {e}")
285 | logger.exception("Repair failed")
286 |
287 | finally:
288 | if self.conn:
289 | self.conn.close()
290 |
291 |
292 | def main():
293 | """Run the repair tool."""
294 | if len(sys.argv) < 2:
295 | print("Usage: python repair_sqlite_vec_embeddings.py <database_path>")
296 | print("\nExample:")
297 | print(" python repair_sqlite_vec_embeddings.py ~/.mcp_memory/sqlite_vec.db")
298 | print("\nThis tool will:")
299 | print(" - Check for missing embeddings")
300 | print(" - Generate embeddings for memories that don't have them")
301 | print(" - Verify semantic search functionality")
302 | print("\nFor more complex issues (dimension mismatches, schema problems),")
303 | print("use migrate_sqlite_vec_embeddings.py instead.")
304 | sys.exit(1)
305 |
306 | db_path = sys.argv[1]
307 |
308 | repair = SqliteVecRepair(db_path)
309 | repair.run_repair()
310 |
311 |
312 | if __name__ == "__main__":
313 | main()
```
--------------------------------------------------------------------------------
/scripts/maintenance/repair_malformed_tags.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | # Copyright 2024 Heinrich Krupp
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """
17 | Script to repair malformed tags in the memory database.
18 |
19 | Detects and fixes tags that contain JSON serialization artifacts like:
20 | - Tags with quotes: ["ai" or "bug-fix"
21 | - Tags with brackets: ["important"] or ["note"]
22 | - Double-serialized JSON arrays
23 |
24 | Usage:
25 | python scripts/maintenance/repair_malformed_tags.py --dry-run # Preview changes
26 | python scripts/maintenance/repair_malformed_tags.py # Apply fixes
27 | """
28 |
29 | import sys
30 | import os
31 | import json
32 | import re
33 | import sqlite3
34 | import argparse
35 | import logging
36 | from pathlib import Path
37 | from datetime import datetime
38 | from typing import List, Tuple, Set, Dict
39 |
40 | # Add parent directory to path
41 | sys.path.insert(0, str(Path(__file__).parent.parent.parent))
42 |
43 | from src.mcp_memory_service.config import SQLITE_VEC_PATH
44 |
45 | # Configure logging
46 | logging.basicConfig(
47 | level=logging.INFO,
48 | format='%(asctime)s - %(levelname)s - %(message)s',
49 | )
50 | logger = logging.getLogger(__name__)
51 |
52 |
53 | def parse_malformed_tag(tag: str) -> List[str]:
54 | """
55 | Parse a malformed tag that may contain JSON serialization artifacts.
56 |
57 | Examples:
58 | ["ai" -> ["ai"]
59 | "bug-fix" -> ["bug-fix"]
60 | ["important"] -> ["important"]
61 | [\"ai\",\"bug\"] -> ["ai", "bug"]
62 |
63 | Returns:
64 | List of clean tag strings
65 | """
66 | # Remove leading/trailing whitespace
67 | tag = tag.strip()
68 |
69 | # If tag doesn't contain quotes or brackets, it's clean
70 | if '"' not in tag and '[' not in tag and ']' not in tag:
71 | return [tag] if tag else []
72 |
73 | # Try to parse as JSON first
74 | try:
75 | # Handle cases where tag is a JSON string like ["tag1","tag2"]
76 | if tag.startswith('[') and tag.endswith(']'):
77 | parsed = json.loads(tag)
78 | if isinstance(parsed, list):
79 | # Recursively clean each element
80 | result = []
81 | for item in parsed:
82 | result.extend(parse_malformed_tag(str(item)))
83 | return result
84 | except json.JSONDecodeError:
85 | pass
86 |
87 | # Handle escaped quotes like [\"ai\" or \"bug-fix\"
88 | if '\\"' in tag or tag.startswith('["') or tag.startswith('"'):
89 | # Remove all quotes and brackets
90 | cleaned = tag.replace('\\"', '').replace('"', '').replace('[', '').replace(']', '').strip()
91 | if cleaned:
92 | return [cleaned]
93 |
94 | # Handle patterns like ["ai" (missing closing bracket/quote)
95 | if tag.startswith('["') or tag.startswith('"['):
96 | cleaned = tag.replace('[', '').replace('"', '').strip()
97 | if cleaned:
98 | return [cleaned]
99 |
100 | # If nothing worked, just remove quotes and brackets
101 | cleaned = tag.replace('"', '').replace('[', '').replace(']', '').strip()
102 | return [cleaned] if cleaned else []
103 |
104 |
105 | def is_malformed_tag(tag: str) -> bool:
106 | """Check if a tag contains malformed characters."""
107 | return any(char in tag for char in ['"', '[', ']', '\\'])
108 |
109 |
110 | def analyze_tags(db_path: str) -> Tuple[int, int, Set[str], Dict[str, int]]:
111 | """
112 | Analyze tags in the database to find malformed entries.
113 |
114 | Returns:
115 | (total_memories, malformed_count, malformed_tags_set, tag_frequency)
116 | """
117 | conn = sqlite3.connect(db_path)
118 | cursor = conn.cursor()
119 |
120 | try:
121 | # Get all memories with their tags
122 | cursor.execute("SELECT content_hash, tags FROM memories WHERE tags IS NOT NULL AND tags != ''")
123 | rows = cursor.fetchall()
124 |
125 | total_memories = len(rows)
126 | malformed_count = 0
127 | malformed_tags = set()
128 | tag_frequency = {}
129 |
130 | for content_hash, tags_str in rows:
131 | # Parse tags (comma-separated)
132 | tags = [tag.strip() for tag in tags_str.split(",") if tag.strip()]
133 |
134 | has_malformed = False
135 | for tag in tags:
136 | if is_malformed_tag(tag):
137 | has_malformed = True
138 | malformed_tags.add(tag)
139 | tag_frequency[tag] = tag_frequency.get(tag, 0) + 1
140 |
141 | if has_malformed:
142 | malformed_count += 1
143 |
144 | return total_memories, malformed_count, malformed_tags, tag_frequency
145 |
146 | finally:
147 | conn.close()
148 |
149 |
150 | def repair_tags(db_path: str, dry_run: bool = False) -> Tuple[int, int, Dict[str, List[str]]]:
151 | """
152 | Repair malformed tags in the database.
153 |
154 | Returns:
155 | (memories_updated, tags_fixed, replacements_dict)
156 | """
157 | conn = sqlite3.connect(db_path)
158 | cursor = conn.cursor()
159 |
160 | memories_updated = 0
161 | tags_fixed = 0
162 | replacements = {} # old_tag -> [new_tags]
163 |
164 | try:
165 | # Get all memories with tags
166 | cursor.execute("SELECT content_hash, tags FROM memories WHERE tags IS NOT NULL AND tags != ''")
167 | rows = cursor.fetchall()
168 |
169 | for content_hash, tags_str in rows:
170 | # Parse tags (comma-separated)
171 | original_tags = [tag.strip() for tag in tags_str.split(",") if tag.strip()]
172 |
173 | # Check if any tags are malformed
174 | needs_repair = any(is_malformed_tag(tag) for tag in original_tags)
175 |
176 | if needs_repair:
177 | # Parse and clean each tag
178 | new_tags = []
179 | for tag in original_tags:
180 | if is_malformed_tag(tag):
181 | parsed = parse_malformed_tag(tag)
182 | if parsed:
183 | new_tags.extend(parsed)
184 | replacements[tag] = parsed
185 | tags_fixed += 1
186 | else:
187 | new_tags.append(tag)
188 |
189 | # Remove duplicates while preserving order
190 | seen = set()
191 | unique_tags = []
192 | for tag in new_tags:
193 | if tag and tag not in seen:
194 | seen.add(tag)
195 | unique_tags.append(tag)
196 |
197 | # Update database
198 | new_tags_str = ",".join(unique_tags)
199 |
200 | if dry_run:
201 | logger.info(f"[DRY RUN] Would update {content_hash[:8]}...")
202 | logger.info(f" Old: {tags_str}")
203 | logger.info(f" New: {new_tags_str}")
204 | else:
205 | cursor.execute(
206 | "UPDATE memories SET tags = ? WHERE content_hash = ?",
207 | (new_tags_str, content_hash)
208 | )
209 |
210 | memories_updated += 1
211 |
212 | if not dry_run:
213 | conn.commit()
214 | logger.info(f"✅ Database updated successfully")
215 |
216 | return memories_updated, tags_fixed, replacements
217 |
218 | finally:
219 | conn.close()
220 |
221 |
222 | def create_backup(db_path: str) -> str:
223 | """Create a backup of the database before modifications."""
224 | backup_path = f"{db_path}.backup-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
225 |
226 | import shutil
227 | shutil.copy2(db_path, backup_path)
228 |
229 | logger.info(f"✅ Backup created: {backup_path}")
230 | return backup_path
231 |
232 |
233 | def main():
234 | parser = argparse.ArgumentParser(
235 | description="Repair malformed tags in the memory database",
236 | formatter_class=argparse.RawDescriptionHelpFormatter,
237 | epilog="""
238 | Examples:
239 | # Preview changes without modifying database
240 | python repair_malformed_tags.py --dry-run
241 |
242 | # Apply fixes (creates backup automatically)
243 | python repair_malformed_tags.py
244 |
245 | # Verbose output with detailed logging
246 | python repair_malformed_tags.py --verbose
247 | """
248 | )
249 | parser.add_argument(
250 | '--dry-run',
251 | action='store_true',
252 | help='Preview changes without modifying the database'
253 | )
254 | parser.add_argument(
255 | '--verbose', '-v',
256 | action='store_true',
257 | help='Enable verbose logging'
258 | )
259 | parser.add_argument(
260 | '--db-path',
261 | type=str,
262 | default=SQLITE_VEC_PATH,
263 | help=f'Path to SQLite database (default: {SQLITE_VEC_PATH})'
264 | )
265 |
266 | args = parser.parse_args()
267 |
268 | # Set logging level
269 | if args.verbose:
270 | logging.getLogger().setLevel(logging.DEBUG)
271 |
272 | # Check if database exists
273 | if not os.path.exists(args.db_path):
274 | logger.error(f"❌ Database not found: {args.db_path}")
275 | sys.exit(1)
276 |
277 | logger.info("=" * 60)
278 | logger.info("🔧 Malformed Tag Repair Tool")
279 | logger.info("=" * 60)
280 | logger.info(f"Database: {args.db_path}")
281 | logger.info(f"Mode: {'DRY RUN (no changes)' if args.dry_run else 'REPAIR (will modify database)'}")
282 | logger.info("")
283 |
284 | # Analyze tags
285 | logger.info("📊 Analyzing tags...")
286 | total_memories, malformed_count, malformed_tags, tag_frequency = analyze_tags(args.db_path)
287 |
288 | logger.info(f"Total memories: {total_memories}")
289 | logger.info(f"Memories with malformed tags: {malformed_count}")
290 | logger.info(f"Unique malformed tags: {len(malformed_tags)}")
291 | logger.info("")
292 |
293 | if malformed_count == 0:
294 | logger.info("✅ No malformed tags found! Database is clean.")
295 | return
296 |
297 | # Show most common malformed tags
298 | logger.info("🔍 Most common malformed tags:")
299 | sorted_tags = sorted(tag_frequency.items(), key=lambda x: x[1], reverse=True)
300 | for tag, count in sorted_tags[:10]:
301 | logger.info(f" {tag!r} -> appears {count} times")
302 | parsed = parse_malformed_tag(tag)
303 | logger.info(f" Will become: {parsed}")
304 | logger.info("")
305 |
306 | # Create backup if not dry-run
307 | if not args.dry_run:
308 | logger.info("💾 Creating backup...")
309 | backup_path = create_backup(args.db_path)
310 | logger.info("")
311 |
312 | # Repair tags
313 | logger.info("🔧 Repairing tags...")
314 | memories_updated, tags_fixed, replacements = repair_tags(args.db_path, dry_run=args.dry_run)
315 |
316 | logger.info("")
317 | logger.info("=" * 60)
318 | logger.info("📈 Summary")
319 | logger.info("=" * 60)
320 | logger.info(f"Memories updated: {memories_updated}")
321 | logger.info(f"Tags fixed: {tags_fixed}")
322 | logger.info("")
323 |
324 | if replacements:
325 | logger.info("🔄 Tag replacements:")
326 | for old_tag, new_tags in list(replacements.items())[:10]:
327 | logger.info(f" {old_tag!r} -> {new_tags}")
328 | if len(replacements) > 10:
329 | logger.info(f" ... and {len(replacements) - 10} more")
330 |
331 | logger.info("")
332 | if args.dry_run:
333 | logger.info("⚠️ This was a DRY RUN - no changes were made")
334 | logger.info(" Run without --dry-run to apply fixes")
335 | else:
336 | logger.info("✅ Repair completed successfully!")
337 | logger.info(f" Backup saved to: {backup_path}")
338 |
339 | logger.info("=" * 60)
340 |
341 |
342 | if __name__ == "__main__":
343 | main()
344 |
```
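A minimal sketch of driving the repair helpers above programmatically instead of through `argparse`, assuming the script is importable as a module; both the import path and `DB_PATH` below are placeholders to adjust for your checkout:

```python
# Sketch only: call analyze_tags/repair_tags directly from a maintenance job.
# The import path and DB_PATH are illustrative, not fixed by the repository.
from repair_malformed_tags import analyze_tags, repair_tags, create_backup

DB_PATH = "/path/to/sqlite_vec.db"  # placeholder

total, malformed, bad_tags, freq = analyze_tags(DB_PATH)
print(f"{malformed}/{total} memories carry malformed tags")

if malformed:
    create_backup(DB_PATH)                       # snapshot before any writes
    updated, fixed, mapping = repair_tags(DB_PATH, dry_run=False)
    print(f"updated {updated} memories, fixed {fixed} tags")
```

The `dry_run=True` path mirrors the `--dry-run` CLI flag, so the same call previews the replacements without touching the database.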
--------------------------------------------------------------------------------
/tests/api/test_compact_types.py:
--------------------------------------------------------------------------------
```python
1 | # Copyright 2024 Heinrich Krupp
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Tests for compact data types.
17 |
18 | Validates token efficiency, immutability, and type safety of compact types.
19 | """
20 |
21 | import pytest
22 | import time
23 | from mcp_memory_service.api.types import (
24 | CompactMemory,
25 | CompactSearchResult,
26 | CompactHealthInfo
27 | )
28 |
29 |
30 | class TestCompactMemory:
31 | """Tests for CompactMemory type."""
32 |
33 | def test_compact_memory_creation(self):
34 | """Test basic CompactMemory creation."""
35 | memory = CompactMemory(
36 | hash='abc12345',
37 | preview='Test content preview',
38 | tags=('test', 'example'),
39 | created=time.time(),
40 | score=0.95
41 | )
42 |
43 | assert memory.hash == 'abc12345'
44 | assert memory.preview == 'Test content preview'
45 | assert memory.tags == ('test', 'example')
46 | assert memory.score == 0.95
47 | assert isinstance(memory.created, float)
48 |
49 | def test_compact_memory_immutability(self):
50 | """Test that CompactMemory is immutable."""
51 | memory = CompactMemory(
52 | hash='abc12345',
53 | preview='Test content',
54 | tags=('test',),
55 | created=time.time(),
56 | score=0.85
57 | )
58 |
59 | # NamedTuple should be immutable
60 | with pytest.raises(AttributeError):
61 | memory.hash = 'new_hash' # type: ignore
62 |
63 | def test_compact_memory_tuple_behavior(self):
64 | """Test that CompactMemory behaves like a tuple."""
65 | memory = CompactMemory(
66 | hash='abc12345',
67 | preview='Test content',
68 | tags=('test',),
69 | created=1730928000.0,
70 | score=0.85
71 | )
72 |
73 | # Should support tuple unpacking
74 | hash_val, preview, tags, created, score = memory
75 |
76 | assert hash_val == 'abc12345'
77 | assert preview == 'Test content'
78 | assert tags == ('test',)
79 | assert created == 1730928000.0
80 | assert score == 0.85
81 |
82 | def test_compact_memory_field_access(self):
83 | """Test named field access."""
84 | memory = CompactMemory(
85 | hash='test123',
86 | preview='Preview text',
87 | tags=('tag1', 'tag2'),
88 | created=1730928000.0,
89 | score=0.75
90 | )
91 |
92 | # Named access should work
93 | assert memory.hash == 'test123'
94 | assert memory.tags[0] == 'tag1'
95 | assert memory.tags[1] == 'tag2'
96 |
97 | def test_compact_memory_token_size(self):
98 | """Test that CompactMemory achieves target token size."""
99 | # Create memory with typical values
100 | memory = CompactMemory(
101 | hash='abc12345',
102 | preview='A' * 200, # 200 char preview
103 | tags=('tag1', 'tag2', 'tag3'),
104 | created=1730928000.0,
105 | score=0.85
106 | )
107 |
108 | # Convert to string representation (approximates token count)
109 | repr_str = str(memory)
110 |
111 | # Should be much smaller than full Memory object (~820 tokens)
112 | # Target: ~73 tokens, allow some margin
113 |         # Rough estimate: 1 token ≈ 4 characters
114 | estimated_tokens = len(repr_str) / 4
115 |
116 | assert estimated_tokens < 150, \
117 | f"CompactMemory too large: {estimated_tokens} tokens (target: <150)"
118 |
119 |
120 | class TestCompactSearchResult:
121 | """Tests for CompactSearchResult type."""
122 |
123 | def test_compact_search_result_creation(self):
124 | """Test basic CompactSearchResult creation."""
125 | memories = (
126 | CompactMemory('hash1', 'preview1', ('tag1',), time.time(), 0.95),
127 | CompactMemory('hash2', 'preview2', ('tag2',), time.time(), 0.85),
128 | )
129 |
130 | result = CompactSearchResult(
131 | memories=memories,
132 | total=2,
133 | query='test query'
134 | )
135 |
136 | assert len(result.memories) == 2
137 | assert result.total == 2
138 | assert result.query == 'test query'
139 |
140 | def test_compact_search_result_repr(self):
141 | """Test CompactSearchResult string representation."""
142 | memories = (
143 | CompactMemory('hash1', 'preview1', ('tag1',), time.time(), 0.95),
144 | CompactMemory('hash2', 'preview2', ('tag2',), time.time(), 0.85),
145 | CompactMemory('hash3', 'preview3', ('tag3',), time.time(), 0.75),
146 | )
147 |
148 | result = CompactSearchResult(
149 | memories=memories,
150 | total=3,
151 | query='architecture'
152 | )
153 |
154 | repr_str = repr(result)
155 | assert 'found=3' in repr_str
156 | assert 'shown=3' in repr_str
157 |
158 | def test_compact_search_result_empty(self):
159 | """Test CompactSearchResult with no results."""
160 | result = CompactSearchResult(
161 | memories=(),
162 | total=0,
163 | query='nonexistent query'
164 | )
165 |
166 | assert len(result.memories) == 0
167 | assert result.total == 0
168 | assert repr(result) == 'SearchResult(found=0, shown=0)'
169 |
170 | def test_compact_search_result_iteration(self):
171 | """Test iterating over search results."""
172 | memories = tuple(
173 | CompactMemory(f'hash{i}', f'preview{i}', ('tag',), time.time(), 0.9 - i*0.1)
174 | for i in range(5)
175 | )
176 |
177 | result = CompactSearchResult(
178 | memories=memories,
179 | total=5,
180 | query='test'
181 | )
182 |
183 | # Should be iterable
184 | for i, memory in enumerate(result.memories):
185 | assert memory.hash == f'hash{i}'
186 |
187 | def test_compact_search_result_token_size(self):
188 | """Test that CompactSearchResult achieves target token size."""
189 | # Create result with 5 memories (typical use case)
190 | memories = tuple(
191 | CompactMemory(
192 | f'hash{i:04d}',
193 | 'A' * 200, # 200 char preview
194 | ('tag1', 'tag2'),
195 | time.time(),
196 | 0.9 - i*0.05
197 | )
198 | for i in range(5)
199 | )
200 |
201 | result = CompactSearchResult(
202 | memories=memories,
203 | total=5,
204 | query='architecture decisions'
205 | )
206 |
207 | # Convert to string representation
208 | repr_str = str(result.memories)
209 |
210 | # Target: ~385 tokens for 5 results (vs ~2,625 for full Memory objects)
211 | # Allow some margin
212 | estimated_tokens = len(repr_str) / 4
213 |
214 | assert estimated_tokens < 800, \
215 | f"CompactSearchResult too large: {estimated_tokens} tokens (target: <800 for 5 results)"
216 |
217 |
218 | class TestCompactHealthInfo:
219 | """Tests for CompactHealthInfo type."""
220 |
221 | def test_compact_health_info_creation(self):
222 | """Test basic CompactHealthInfo creation."""
223 | info = CompactHealthInfo(
224 | status='healthy',
225 | count=1247,
226 | backend='sqlite_vec'
227 | )
228 |
229 | assert info.status == 'healthy'
230 | assert info.count == 1247
231 | assert info.backend == 'sqlite_vec'
232 |
233 | def test_compact_health_info_status_values(self):
234 | """Test different status values."""
235 | statuses = ['healthy', 'degraded', 'error']
236 |
237 | for status in statuses:
238 | info = CompactHealthInfo(
239 | status=status,
240 | count=100,
241 | backend='cloudflare'
242 | )
243 | assert info.status == status
244 |
245 | def test_compact_health_info_backends(self):
246 | """Test different backend types."""
247 | backends = ['sqlite_vec', 'cloudflare', 'hybrid']
248 |
249 | for backend in backends:
250 | info = CompactHealthInfo(
251 | status='healthy',
252 | count=500,
253 | backend=backend
254 | )
255 | assert info.backend == backend
256 |
257 | def test_compact_health_info_token_size(self):
258 | """Test that CompactHealthInfo achieves target token size."""
259 | info = CompactHealthInfo(
260 | status='healthy',
261 | count=1247,
262 | backend='sqlite_vec'
263 | )
264 |
265 | repr_str = str(info)
266 |
267 | # Target: ~20 tokens (vs ~125 for full health check)
268 | estimated_tokens = len(repr_str) / 4
269 |
270 | assert estimated_tokens < 50, \
271 | f"CompactHealthInfo too large: {estimated_tokens} tokens (target: <50)"
272 |
273 |
274 | class TestTokenEfficiency:
275 | """Integration tests for overall token efficiency."""
276 |
277 | def test_memory_size_comparison(self):
278 | """Compare CompactMemory size to full Memory object."""
279 | from mcp_memory_service.models.memory import Memory
280 |
281 | # Create full Memory object
282 | full_memory = Memory(
283 | content='A' * 1000, # Long content
284 | content_hash='abc123def456' * 5,
285 | tags=['tag1', 'tag2', 'tag3'],
286 | memory_type='note',
287 | metadata={'key': 'value'},
288 | embedding=[0.1] * 768, # Full embedding vector
289 | )
290 |
291 | # Create compact version
292 | compact = CompactMemory(
293 | hash='abc12345',
294 | preview='A' * 200, # First 200 chars only
295 | tags=('tag1', 'tag2', 'tag3'),
296 | created=time.time(),
297 | score=0.95
298 | )
299 |
300 | # Compare sizes
301 | full_repr = str(full_memory.to_dict())
302 | compact_repr = str(compact)
303 |
304 | # Compact should be significantly smaller
305 | # Note: String representation is not exact token count, allow some margin
306 | size_ratio = len(compact_repr) / len(full_repr)
307 |
308 | assert size_ratio < 0.30, \
309 | f"CompactMemory not small enough: {size_ratio:.2%} of full size (target: <30%)"
310 |
311 | def test_search_result_size_reduction(self):
312 | """Validate 85%+ token reduction for search results."""
313 | # Create 5 compact memories
314 | memories = tuple(
315 | CompactMemory(
316 | f'hash{i:04d}',
317 | 'A' * 200,
318 | ('tag1', 'tag2'),
319 | time.time(),
320 | 0.9 - i*0.05
321 | )
322 | for i in range(5)
323 | )
324 |
325 | result = CompactSearchResult(
326 | memories=memories,
327 | total=5,
328 | query='test'
329 | )
330 |
331 | # Estimate tokens
332 | repr_str = str(result)
333 | estimated_tokens = len(repr_str) / 4
334 |
335 |         # Target: 85% reduction from ~2,625 tokens → ~385 tokens
336 | # Allow some margin: should be under 600 tokens
337 | assert estimated_tokens < 600, \
338 | f"Search result not efficient enough: {estimated_tokens} tokens (target: <600)"
339 |
340 | # Verify we're achieving significant reduction
341 | # Original would be ~2,625 tokens, we should be well under 1000
342 | reduction_vs_original = 1 - (estimated_tokens / 2625)
343 | assert reduction_vs_original >= 0.75, \
344 |             f"Token reduction insufficient: {reduction_vs_original:.1%} (target: ≥75%)"
345 |
```
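The suite above leans on a rough "1 token ≈ 4 characters" heuristic rather than a real tokenizer. A short sketch of the same estimate applied to a hand-built result; the five 200-character previews are illustrative, not a fixture from the tests:

```python
import time

from mcp_memory_service.api.types import CompactMemory, CompactSearchResult

# Build five compact memories, mirroring the shape used in the tests above.
memories = tuple(
    CompactMemory(f"hash{i:04d}", "A" * 200, ("tag1", "tag2"), time.time(), 0.9 - i * 0.05)
    for i in range(5)
)
result = CompactSearchResult(memories=memories, total=5, query="architecture decisions")

# Same heuristic as the tests: character count / 4 approximates the token count.
estimated_tokens = len(str(result.memories)) / 4
print(repr(result), f"~{estimated_tokens:.0f} tokens")
```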
--------------------------------------------------------------------------------
/src/mcp_memory_service/ingestion/pdf_loader.py:
--------------------------------------------------------------------------------
```python
1 | # Copyright 2024 Heinrich Krupp
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | PDF document loader for extracting text content from PDF files.
17 | """
18 |
19 | import logging
20 | from pathlib import Path
21 | from typing import AsyncGenerator, Dict, Any, Optional
22 | import asyncio
23 |
24 | from .base import DocumentLoader, DocumentChunk
25 | from .chunker import TextChunker, ChunkingStrategy
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 | # Try to import PDF processing library
30 | try:
31 | import PyPDF2
32 | HAS_PYPDF2 = True
33 | except ImportError:
34 | HAS_PYPDF2 = False
35 | logger.warning("PyPDF2 not available. PDF support will be limited.")
36 |
37 | try:
38 | import pdfplumber
39 | HAS_PDFPLUMBER = True
40 | except ImportError:
41 | HAS_PDFPLUMBER = False
42 | logger.debug("pdfplumber not available, falling back to PyPDF2")
43 |
44 |
45 | class PDFLoader(DocumentLoader):
46 | """
47 | Document loader for PDF files.
48 |
49 | Supports multiple PDF processing backends:
50 | - pdfplumber (preferred): Better text extraction, table support
51 | - PyPDF2 (fallback): Basic text extraction
52 | """
53 |
54 | def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
55 | """
56 | Initialize PDF loader.
57 |
58 | Args:
59 | chunk_size: Target size for text chunks in characters
60 | chunk_overlap: Number of characters to overlap between chunks
61 | """
62 | super().__init__(chunk_size, chunk_overlap)
63 | self.supported_extensions = ['pdf']
64 | self.chunker = TextChunker(ChunkingStrategy(
65 | chunk_size=chunk_size,
66 | chunk_overlap=chunk_overlap,
67 | respect_paragraph_boundaries=True
68 | ))
69 |
70 | # Check which PDF backend is available
71 | if HAS_PDFPLUMBER:
72 | self.backend = 'pdfplumber'
73 | elif HAS_PYPDF2:
74 | self.backend = 'pypdf2'
75 | else:
76 | self.backend = None
 77 |             logger.error("No PDF processing library available. Install pdfplumber or PyPDF2.")
78 |
79 | def can_handle(self, file_path: Path) -> bool:
80 | """
81 | Check if this loader can handle the given PDF file.
82 |
83 | Args:
84 | file_path: Path to the file to check
85 |
86 | Returns:
87 | True if this loader can process the PDF file
88 | """
89 | if self.backend is None:
90 | return False
91 |
92 | return (file_path.suffix.lower() == '.pdf' and
93 | file_path.exists() and
94 | file_path.is_file())
95 |
96 | async def extract_chunks(self, file_path: Path, **kwargs) -> AsyncGenerator[DocumentChunk, None]:
97 | """
98 | Extract text chunks from a PDF file.
99 |
100 | Args:
101 | file_path: Path to the PDF file
102 | **kwargs: Additional options:
103 | - extract_images: Whether to extract image descriptions (default: False)
104 | - extract_tables: Whether to extract table content (default: False)
105 | - page_range: Tuple of (start, end) pages to extract (1-indexed)
106 |
107 | Yields:
108 | DocumentChunk objects containing extracted text and metadata
109 |
110 | Raises:
111 | FileNotFoundError: If the PDF file doesn't exist
112 | ValueError: If the PDF file is invalid or can't be processed
113 | """
114 | await self.validate_file(file_path)
115 |
116 | if self.backend is None:
117 | raise ValueError("No PDF processing backend available")
118 |
119 | extract_images = kwargs.get('extract_images', False)
120 | extract_tables = kwargs.get('extract_tables', False)
121 | page_range = kwargs.get('page_range', None)
122 |
123 | logger.info(f"Extracting chunks from PDF: {file_path} using {self.backend}")
124 |
125 | try:
126 | if self.backend == 'pdfplumber':
127 | async for chunk in self._extract_with_pdfplumber(
128 | file_path, extract_images, extract_tables, page_range
129 | ):
130 | yield chunk
131 | else:
132 | async for chunk in self._extract_with_pypdf2(
133 | file_path, page_range
134 | ):
135 | yield chunk
136 |
137 | except Exception as e:
138 | logger.error(f"Error extracting from PDF {file_path}: {str(e)}")
139 | raise ValueError(f"Failed to extract PDF content: {str(e)}") from e
140 |
141 | async def _extract_with_pdfplumber(
142 | self,
143 | file_path: Path,
144 | extract_images: bool,
145 | extract_tables: bool,
146 | page_range: Optional[tuple]
147 | ) -> AsyncGenerator[DocumentChunk, None]:
148 | """
149 | Extract text using pdfplumber backend.
150 |
151 | Args:
152 | file_path: Path to PDF file
153 | extract_images: Whether to extract image descriptions
154 | extract_tables: Whether to extract table content
155 | page_range: Optional page range to extract
156 |
157 | Yields:
158 | DocumentChunk objects
159 | """
160 | def _extract_sync():
161 | """Synchronous extraction function to run in thread pool."""
162 | with pdfplumber.open(file_path) as pdf:
163 | total_pages = len(pdf.pages)
164 | start_page = page_range[0] - 1 if page_range else 0
165 | end_page = min(page_range[1], total_pages) if page_range else total_pages
166 |
167 | for page_num in range(start_page, end_page):
168 | page = pdf.pages[page_num]
169 |
170 | # Extract main text
171 | text = page.extract_text() or ""
172 |
173 | # Extract table content if requested
174 | if extract_tables:
175 | tables = page.extract_tables()
176 | for table in tables or []:
177 | table_text = self._format_table(table)
178 | text += f"\n\n[TABLE]\n{table_text}\n[/TABLE]"
179 |
180 | # Note: Image extraction would require additional processing
181 | if extract_images:
182 | # Placeholder for image extraction
183 | images = page.images
184 | if images:
185 | text += f"\n\n[IMAGES: {len(images)} images found on this page]"
186 |
187 | if text.strip():
188 | yield (page_num + 1, text.strip())
189 |
190 | # Run extraction in thread pool to avoid blocking
191 | loop = asyncio.get_event_loop()
192 | page_generator = await loop.run_in_executor(None, lambda: list(_extract_sync()))
193 |
194 | base_metadata = self.get_base_metadata(file_path)
195 | chunk_index = 0
196 |
197 | for page_num, page_text in page_generator:
198 | page_metadata = base_metadata.copy()
199 | page_metadata.update({
200 | 'page_number': page_num,
201 | 'extraction_method': 'pdfplumber',
202 | 'content_type': 'pdf_page'
203 | })
204 |
205 | # Chunk the page text
206 | chunks = self.chunker.chunk_text(page_text, page_metadata)
207 |
208 | for chunk_text, chunk_metadata in chunks:
209 | yield DocumentChunk(
210 | content=chunk_text,
211 | metadata=chunk_metadata,
212 | chunk_index=chunk_index,
213 | source_file=file_path
214 | )
215 | chunk_index += 1
216 |
217 | async def _extract_with_pypdf2(
218 | self,
219 | file_path: Path,
220 | page_range: Optional[tuple]
221 | ) -> AsyncGenerator[DocumentChunk, None]:
222 | """
223 | Extract text using PyPDF2 backend.
224 |
225 | Args:
226 | file_path: Path to PDF file
227 | page_range: Optional page range to extract
228 |
229 | Yields:
230 | DocumentChunk objects
231 | """
232 | def _extract_sync():
233 | """Synchronous extraction function."""
234 | with open(file_path, 'rb') as file:
235 | pdf_reader = PyPDF2.PdfReader(file)
236 | total_pages = len(pdf_reader.pages)
237 | start_page = page_range[0] - 1 if page_range else 0
238 | end_page = min(page_range[1], total_pages) if page_range else total_pages
239 |
240 | for page_num in range(start_page, end_page):
241 | page = pdf_reader.pages[page_num]
242 | text = page.extract_text()
243 |
244 | if text.strip():
245 | yield (page_num + 1, text.strip())
246 |
247 | # Run extraction in thread pool
248 | loop = asyncio.get_event_loop()
249 | page_generator = await loop.run_in_executor(None, lambda: list(_extract_sync()))
250 |
251 | base_metadata = self.get_base_metadata(file_path)
252 | chunk_index = 0
253 |
254 | for page_num, page_text in page_generator:
255 | page_metadata = base_metadata.copy()
256 | page_metadata.update({
257 | 'page_number': page_num,
258 | 'extraction_method': 'pypdf2',
259 | 'content_type': 'pdf_page'
260 | })
261 |
262 | # Chunk the page text
263 | chunks = self.chunker.chunk_text(page_text, page_metadata)
264 |
265 | for chunk_text, chunk_metadata in chunks:
266 | yield DocumentChunk(
267 | content=chunk_text,
268 | metadata=chunk_metadata,
269 | chunk_index=chunk_index,
270 | source_file=file_path
271 | )
272 | chunk_index += 1
273 |
274 | def _format_table(self, table: list) -> str:
275 | """
276 | Format extracted table data as text.
277 |
278 | Args:
279 | table: Table data as list of rows
280 |
281 | Returns:
282 | Formatted table text
283 | """
284 | if not table:
285 | return ""
286 |
287 | # Simple table formatting
288 | formatted_rows = []
289 | for row in table:
290 | if row: # Skip empty rows
291 | cleaned_row = [str(cell).strip() if cell else "" for cell in row]
292 | formatted_rows.append(" | ".join(cleaned_row))
293 |
294 | return "\n".join(formatted_rows)
295 |
296 |
297 | # Register the PDF loader
298 | def _register_pdf_loader():
299 | """Register PDF loader with the registry."""
300 | try:
301 | from .registry import register_loader
302 | register_loader(PDFLoader, ['pdf'])
303 | logger.debug("PDF loader registered successfully")
304 | except ImportError:
305 | logger.debug("Registry not available during import")
306 |
307 |
308 | # Auto-register when module is imported
309 | _register_pdf_loader()
```
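Since `extract_chunks` is an async generator, callers consume it with `async for`. A minimal sketch, where `sample.pdf` is a hypothetical input file and the keyword options mirror those documented above:

```python
import asyncio
from pathlib import Path

from mcp_memory_service.ingestion.pdf_loader import PDFLoader

async def ingest(path: Path) -> None:
    loader = PDFLoader(chunk_size=1000, chunk_overlap=200)
    if not loader.can_handle(path):
        raise ValueError(f"Cannot process {path} (missing file or no PDF backend installed)")
    # Pages 1-3 only, with table text appended to each page before chunking.
    async for chunk in loader.extract_chunks(path, page_range=(1, 3), extract_tables=True):
        print(chunk.metadata.get("page_number"), chunk.chunk_index, len(chunk.content))

asyncio.run(ingest(Path("sample.pdf")))  # "sample.pdf" is a placeholder
```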
--------------------------------------------------------------------------------
/claude-hooks/utilities/mcp-client.js:
--------------------------------------------------------------------------------
```javascript
1 | /**
2 | * MCP Client for Memory Hook Integration
3 | * Provides MCP protocol communication for memory operations
4 | */
5 |
6 | const { spawn } = require('child_process');
7 | const { EventEmitter } = require('events');
8 | const fs = require('fs');
9 | const path = require('path');
10 |
11 | class MCPClient extends EventEmitter {
12 | constructor(serverCommand, options = {}) {
13 | super();
14 | this.serverCommand = serverCommand;
15 | this.serverWorkingDir = options.workingDir || process.cwd();
16 | this.connectionTimeout = options.connectionTimeout || 5000;
17 | this.toolCallTimeout = options.toolCallTimeout || 10000;
18 | this.serverProcess = null;
19 | this.messageId = 0;
20 | this.pendingRequests = new Map();
21 | this.connected = false;
22 | this.buffer = '';
23 |
24 | // Load environment variables from .env file
25 | this.loadEnvironment();
26 | }
27 |
28 | /**
29 | * Load environment variables from .env file
30 | */
31 | loadEnvironment() {
32 | const envPath = path.join(this.serverWorkingDir, '.env');
33 | try {
34 | if (fs.existsSync(envPath)) {
35 | const envContent = fs.readFileSync(envPath, 'utf8');
36 | const lines = envContent.split('\n');
37 |
38 | for (const line of lines) {
39 | const match = line.match(/^([^#\s][^=]*?)=(.*)$/);
40 | if (match) {
41 | const [, key, value] = match;
42 | if (!process.env[key]) {
43 | process.env[key] = value.replace(/^["']|["']$/g, '');
44 | }
45 | }
46 | }
47 | }
48 | } catch (error) {
49 | // Ignore .env loading errors
50 | }
51 | }
52 |
53 | /**
54 | * Start MCP server and establish connection
55 | */
56 | async connect() {
57 | return new Promise((resolve, reject) => {
58 | try {
59 | // Start MCP server process
60 | this.serverProcess = spawn(this.serverCommand[0], this.serverCommand.slice(1), {
61 | cwd: this.serverWorkingDir,
62 | stdio: ['pipe', 'pipe', 'pipe'],
63 | env: { ...process.env }
64 | });
65 |
66 | // Handle server output
67 | this.serverProcess.stdout.on('data', (data) => {
68 | this.buffer += data.toString();
69 | this.processMessages();
70 | });
71 |
72 | this.serverProcess.stderr.on('data', (data) => {
73 | const error = data.toString();
74 | // Only emit critical errors, ignore warnings and debug info
75 | if (error.includes('FATAL') || error.includes('ExceptionGroup')) {
76 | this.emit('error', new Error(`Server error: ${error.substring(0, 200)}...`));
77 | }
78 | });
79 |
80 | this.serverProcess.on('error', (error) => {
81 | if (!this.connected) {
82 | reject(new Error(`Server process error: ${error.message}`));
83 | }
84 | });
85 |
86 | this.serverProcess.on('exit', (code) => {
 87 |                 if (code !== 0 && !this.connected) {
 88 |                     reject(new Error(`Server failed to start (exit code ${code})`));
 89 |                 }
 90 |                 this.connected = false;
91 | });
92 |
93 | // Initialize MCP connection
94 | this.sendInitialize()
95 | .then(() => {
96 | this.connected = true;
97 | resolve();
98 | })
99 | .catch(reject);
100 |
101 | // Connection timeout
102 | setTimeout(() => {
103 | if (!this.connected) {
104 | reject(new Error('Connection timeout'));
105 | }
106 | }, this.connectionTimeout);
107 |
108 | } catch (error) {
109 | reject(error);
110 | }
111 | });
112 | }
113 |
114 | /**
115 | * Process incoming messages from server
116 | */
117 | processMessages() {
118 | const lines = this.buffer.split('\n');
119 | this.buffer = lines.pop() || ''; // Keep incomplete line in buffer
120 |
121 | for (const line of lines) {
122 | if (line.trim()) {
123 | try {
124 | const message = JSON.parse(line);
125 | this.handleMessage(message);
126 | } catch (error) {
127 | // Ignore malformed messages
128 | }
129 | }
130 | }
131 | }
132 |
133 | /**
134 | * Handle incoming MCP messages
135 | */
136 | handleMessage(message) {
137 | if (message.id && this.pendingRequests.has(message.id)) {
138 | const { resolve, reject } = this.pendingRequests.get(message.id);
139 | this.pendingRequests.delete(message.id);
140 |
141 | if (message.error) {
142 | reject(new Error(message.error.message || 'MCP Error'));
143 | } else {
144 | resolve(message.result);
145 | }
146 | }
147 | }
148 |
149 | /**
150 | * Send MCP initialize request
151 | */
152 | async sendInitialize() {
153 | const message = {
154 | jsonrpc: '2.0',
155 | id: ++this.messageId,
156 | method: 'initialize',
157 | params: {
158 | protocolVersion: '2024-11-05',
159 | capabilities: {
160 | tools: {}
161 | },
162 | clientInfo: {
163 | name: 'claude-hooks-mcp-client',
164 | version: '1.0.0'
165 | }
166 | }
167 | };
168 |
169 | return this.sendMessage(message);
170 | }
171 |
172 | /**
173 | * Send message to MCP server
174 | */
175 | async sendMessage(message) {
176 | return new Promise((resolve, reject) => {
177 | if (!this.serverProcess || this.serverProcess.exitCode !== null) {
178 | reject(new Error('Server not running'));
179 | return;
180 | }
181 |
182 | if (message.id) {
183 | this.pendingRequests.set(message.id, { resolve, reject });
184 |
185 | // Timeout for this specific request
186 | setTimeout(() => {
187 | if (this.pendingRequests.has(message.id)) {
188 | this.pendingRequests.delete(message.id);
189 | reject(new Error('Request timeout'));
190 | }
191 | }, this.toolCallTimeout);
192 | }
193 |
194 | try {
195 | const messageStr = JSON.stringify(message) + '\n';
196 | this.serverProcess.stdin.write(messageStr);
197 |
198 | if (!message.id) {
199 | resolve(); // No response expected
200 | }
201 | } catch (error) {
202 | if (message.id && this.pendingRequests.has(message.id)) {
203 | this.pendingRequests.delete(message.id);
204 | }
205 | reject(error);
206 | }
207 | });
208 | }
209 |
210 | /**
211 | * Call a tool via MCP
212 | */
213 | async callTool(toolName, args = {}) {
214 | const message = {
215 | jsonrpc: '2.0',
216 | id: ++this.messageId,
217 | method: 'tools/call',
218 | params: {
219 | name: toolName,
220 | arguments: args
221 | }
222 | };
223 |
224 | return this.sendMessage(message);
225 | }
226 |
227 | /**
228 | * Get memory service health status
229 | */
230 | async getHealthStatus() {
231 | try {
232 | const result = await this.callTool('check_database_health');
233 | return {
234 | success: true,
235 | data: result.content ? this.parseToolResponse(result.content) : result
236 | };
237 | } catch (error) {
238 | return {
239 | success: false,
240 | error: error.message,
241 | fallback: true
242 | };
243 | }
244 | }
245 |
246 | /**
247 | * Query memories using semantic search
248 | */
249 | async queryMemories(query, limit = 10) {
250 | try {
251 | const result = await this.callTool('retrieve_memory', {
252 | query: query,
253 | n_results: limit
254 | });
255 |
256 | return this.parseToolResponse(result.content);
257 | } catch (error) {
258 | console.warn('[MCP Client] Memory query error:', error.message);
259 | return [];
260 | }
261 | }
262 |
263 | /**
264 | * Query memories by time range
265 | */
266 | async queryMemoriesByTime(timeQuery, limit = 10) {
267 | try {
268 | const result = await this.callTool('recall_memory', {
269 | query: timeQuery,
270 | n_results: limit
271 | });
272 |
273 | return this.parseToolResponse(result.content);
274 | } catch (error) {
275 | console.warn('[MCP Client] Time-based memory query error:', error.message);
276 | return [];
277 | }
278 | }
279 |
280 | /**
281 | * Parse tool response content
282 | */
283 | parseToolResponse(content) {
284 | if (!content) return [];
285 |
286 | // Handle array of content objects
287 | if (Array.isArray(content)) {
288 | const textContent = content.find(c => c.type === 'text')?.text || '';
289 | return this.parseMemoryResults(textContent);
290 | }
291 |
292 | // Handle direct text content
293 | if (typeof content === 'string') {
294 | return this.parseMemoryResults(content);
295 | }
296 |
297 | // Handle object with text property
298 | if (content.text) {
299 | return this.parseMemoryResults(content.text);
300 | }
301 |
302 | return [];
303 | }
304 |
305 | /**
306 | * Parse memory results from text response
307 | */
308 | parseMemoryResults(textData) {
309 | try {
310 | // Handle Python dict format conversion to JSON
311 | let cleanText = textData
312 | .replace(/'/g, '"')
313 | .replace(/True/g, 'true')
314 | .replace(/False/g, 'false')
315 | .replace(/None/g, 'null');
316 |
317 | const parsed = JSON.parse(cleanText);
318 | return parsed.results || parsed.memories || parsed || [];
319 | } catch (error) {
320 | // Try to extract JSON from text
321 | const jsonMatch = textData.match(/\{[\s\S]*\}/);
322 | if (jsonMatch) {
323 | try {
324 | const extracted = JSON.parse(jsonMatch[0]);
325 | return extracted.results || extracted.memories || [extracted];
326 | } catch {}
327 | }
328 |
329 | console.warn('[MCP Client] Could not parse memory results:', error.message);
330 | return [];
331 | }
332 | }
333 |
334 | /**
335 | * Disconnect and cleanup
336 | */
337 | async disconnect() {
338 | this.connected = false;
339 |
340 | // Clear pending requests
341 | for (const [id, { reject }] of this.pendingRequests) {
342 | reject(new Error('Connection closed'));
343 | }
344 | this.pendingRequests.clear();
345 |
346 | // Terminate server process
347 | if (this.serverProcess && this.serverProcess.exitCode === null) {
348 | this.serverProcess.kill('SIGTERM');
349 |
350 | // Force kill if doesn't exit gracefully
351 | setTimeout(() => {
352 | if (this.serverProcess && this.serverProcess.exitCode === null) {
353 | this.serverProcess.kill('SIGKILL');
354 | }
355 | }, 2000);
356 | }
357 | }
358 | }
359 |
360 | module.exports = { MCPClient };
```
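On the wire, the client above exchanges newline-delimited JSON-RPC 2.0 messages with the spawned server's stdio. Purely for illustration, the two payload shapes it sends (`initialize`, then `tools/call`) rendered as Python dicts; field values mirror the JavaScript source:

```python
import json

# Handshake sent once after the server process starts (see sendInitialize above).
initialize = {
    "jsonrpc": "2.0", "id": 1, "method": "initialize",
    "params": {
        "protocolVersion": "2024-11-05",
        "capabilities": {"tools": {}},
        "clientInfo": {"name": "claude-hooks-mcp-client", "version": "1.0.0"},
    },
}

# Tool invocation used by queryMemories(): retrieve_memory with a query and limit.
call = {
    "jsonrpc": "2.0", "id": 2, "method": "tools/call",
    "params": {"name": "retrieve_memory", "arguments": {"query": "recent decisions", "n_results": 10}},
}

# Each message goes to the server's stdin as a single JSON line.
for message in (initialize, call):
    print(json.dumps(message))
```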
--------------------------------------------------------------------------------
/scripts/validation/validate_timestamp_integrity.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Timestamp Integrity Validation Script
4 |
5 | Detects timestamp anomalies that could indicate the regression bug where
6 | created_at timestamps were being reset during metadata sync operations.
7 |
8 | Checks for:
9 | 1. Suspicious clusters of recent created_at timestamps
10 | 2. Created_at timestamps that are newer than they should be
11 | 3. Memories with identical or very similar created_at timestamps (indicating bulk reset)
12 | 4. created_at > updated_at (logically impossible)
13 | """
14 |
15 | import asyncio
16 | import sys
17 | import os
18 | import time
19 | from datetime import datetime, timedelta
20 | from pathlib import Path
21 | from collections import Counter
22 | from typing import List, Dict, Tuple, Optional
23 |
24 | # Add project root to path
25 | project_root = Path(__file__).parent.parent.parent
26 | sys.path.insert(0, str(project_root / "src"))
27 |
28 | from mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
29 | import mcp_memory_service.config as config_module
30 |
31 |
32 | class TimestampIntegrityValidator:
33 | """Validator for detecting timestamp integrity issues."""
34 |
35 | def __init__(self, storage):
36 | self.storage = storage
37 | self.warnings = []
38 | self.errors = []
39 |
40 | async def validate_all(self) -> Tuple[bool, List[str], List[str]]:
41 | """
42 | Run all timestamp integrity checks.
43 |
44 | Returns:
45 | Tuple of (is_healthy, warnings, errors)
46 | """
47 | print("🔍 Running timestamp integrity validation...\n")
48 |
49 | await self.check_impossible_timestamps()
50 | await self.check_suspicious_clusters()
51 | await self.check_future_timestamps()
52 | await self.check_timestamp_distribution()
53 |
54 | # Print summary
55 | print("\n" + "="*60)
56 | print("📊 VALIDATION SUMMARY")
57 | print("="*60)
58 |
59 | if not self.errors and not self.warnings:
60 | print("✅ No timestamp integrity issues detected!")
61 | return True, [], []
62 |
63 | if self.errors:
64 | print(f"\n❌ ERRORS: {len(self.errors)}")
65 | for error in self.errors:
66 | print(f" - {error}")
67 |
68 | if self.warnings:
69 | print(f"\n⚠️ WARNINGS: {len(self.warnings)}")
70 | for warning in self.warnings:
71 | print(f" - {warning}")
72 |
73 | return len(self.errors) == 0, self.warnings, self.errors
74 |
75 | async def check_impossible_timestamps(self):
76 | """Check for logically impossible timestamps (created_at > updated_at)."""
77 | print("1️⃣ Checking for impossible timestamps (created_at > updated_at)...")
78 |
79 | if hasattr(self.storage, 'conn'): # SQLite-vec
80 | cursor = self.storage.conn.execute('''
81 | SELECT content_hash, created_at, updated_at,
82 | created_at_iso, updated_at_iso
83 | FROM memories
84 | WHERE created_at > updated_at
85 | ''')
86 |
87 | impossible = cursor.fetchall()
88 |
89 | if impossible:
90 | self.errors.append(
91 | f"Found {len(impossible)} memories with created_at > updated_at (impossible!)"
92 | )
93 | for row in impossible[:5]: # Show first 5
94 | content_hash, created_at, updated_at, created_iso, updated_iso = row
95 | print(f" ❌ {content_hash[:8]}: created={created_iso}, updated={updated_iso}")
96 | else:
97 | print(" ✅ No impossible timestamps found")
98 |
99 | else:
100 | print(" ⚠️ Skipped (not SQLite backend)")
101 |
102 | async def check_suspicious_clusters(self):
103 | """Check for suspicious clusters of recent created_at timestamps."""
104 | print("\n2️⃣ Checking for suspicious timestamp clusters...")
105 |
106 | if hasattr(self.storage, 'conn'): # SQLite-vec
107 | # Get all created_at timestamps
108 | cursor = self.storage.conn.execute('''
109 | SELECT created_at, created_at_iso, COUNT(*) as count
110 | FROM memories
111 | GROUP BY created_at
112 | HAVING COUNT(*) > 1
113 | ORDER BY count DESC
114 | LIMIT 10
115 | ''')
116 |
117 | clusters = cursor.fetchall()
118 |
119 | if clusters:
120 | # Check if there are large clusters (> 5 memories with same timestamp)
121 | large_clusters = [c for c in clusters if c[2] > 5]
122 |
123 | if large_clusters:
124 | self.warnings.append(
125 | f"Found {len(large_clusters)} suspicious timestamp clusters "
126 | f"(multiple memories with identical created_at)"
127 | )
128 | print(f" ⚠️ {len(large_clusters)} suspicious clusters found:")
129 | for created_at, created_iso, count in large_clusters[:5]:
130 | age_hours = (time.time() - created_at) / 3600
131 | print(f" - {count} memories at {created_iso} ({age_hours:.1f}h ago)")
132 | else:
133 | print(f" ✅ No suspicious clusters (some duplicates normal)")
134 |
135 | else:
136 | print(" ✅ No timestamp clusters found")
137 |
138 | else:
139 | print(" ⚠️ Skipped (not SQLite backend)")
140 |
141 | async def check_future_timestamps(self):
142 | """Check for timestamps in the future."""
143 | print("\n3️⃣ Checking for future timestamps...")
144 |
145 | now = time.time()
146 | future_threshold = now + 300 # 5 minutes tolerance
147 |
148 | if hasattr(self.storage, 'conn'): # SQLite-vec
149 | cursor = self.storage.conn.execute('''
150 | SELECT content_hash, created_at, updated_at,
151 | created_at_iso, updated_at_iso
152 | FROM memories
153 | WHERE created_at > ? OR updated_at > ?
154 | ''', (future_threshold, future_threshold))
155 |
156 | future_timestamps = cursor.fetchall()
157 |
158 | if future_timestamps:
159 | self.errors.append(
160 | f"Found {len(future_timestamps)} memories with timestamps in the future!"
161 | )
162 | for row in future_timestamps[:5]:
163 | content_hash, created_at, updated_at, created_iso, updated_iso = row
164 | if created_at > future_threshold:
165 | print(f" ❌ {content_hash[:8]}: created_at in future: {created_iso}")
166 | if updated_at > future_threshold:
167 | print(f" ❌ {content_hash[:8]}: updated_at in future: {updated_iso}")
168 | else:
169 | print(" ✅ No future timestamps found")
170 |
171 | else:
172 | print(" ⚠️ Skipped (not SQLite backend)")
173 |
174 | async def check_timestamp_distribution(self):
175 | """Check timestamp distribution for anomalies (e.g., all recent)."""
176 | print("\n4️⃣ Checking timestamp distribution...")
177 |
178 | if hasattr(self.storage, 'conn'): # SQLite-vec
179 | # Get timestamp statistics
180 | cursor = self.storage.conn.execute('''
181 | SELECT
182 | COUNT(*) as total,
183 | MIN(created_at) as oldest,
184 | MAX(created_at) as newest,
185 | AVG(created_at) as avg_timestamp
186 | FROM memories
187 | ''')
188 |
189 | row = cursor.fetchone()
190 | if not row or row[0] == 0:
191 | print(" ℹ️ No memories to analyze")
192 | return
193 |
194 | total, oldest, newest, avg_timestamp = row
195 |
196 | # Calculate time ranges
197 | now = time.time()
198 | oldest_age_days = (now - oldest) / 86400
199 | newest_age_hours = (now - newest) / 3600
200 |
201 | print(f" 📈 Total memories: {total}")
202 | print(f" 📅 Oldest: {datetime.utcfromtimestamp(oldest).isoformat()}Z ({oldest_age_days:.1f} days ago)")
203 | print(f" 📅 Newest: {datetime.utcfromtimestamp(newest).isoformat()}Z ({newest_age_hours:.1f} hours ago)")
204 |
205 | # Check for anomaly: if > 50% of memories created in last 24 hours
206 | # but oldest is > 7 days old (indicates bulk timestamp reset)
207 | cursor = self.storage.conn.execute('''
208 | SELECT COUNT(*) FROM memories
209 | WHERE created_at > ?
210 | ''', (now - 86400,))
211 |
212 | recent_count = cursor.fetchone()[0]
213 | recent_percentage = (recent_count / total) * 100
214 |
215 | if recent_percentage > 50 and oldest_age_days > 7:
216 | self.warnings.append(
217 | f"Suspicious: {recent_percentage:.1f}% of memories created in last 24h, "
218 | f"but oldest memory is {oldest_age_days:.1f} days old. "
219 | f"This could indicate timestamp reset bug!"
220 | )
221 | print(f" ⚠️ {recent_percentage:.1f}% created in last 24h (suspicious if many old memories)")
222 |
223 | # Check distribution by age buckets
224 | buckets = [
225 | ("Last hour", 3600),
226 | ("Last 24 hours", 86400),
227 | ("Last week", 604800),
228 | ("Last month", 2592000),
229 | ("Older", float('inf'))
230 | ]
231 |
232 | print(f"\n 📊 Distribution by age:")
233 | for label, seconds in buckets:
234 | if seconds == float('inf'):
235 | cursor = self.storage.conn.execute('''
236 | SELECT COUNT(*) FROM memories
237 | WHERE created_at < ?
238 | ''', (now - 2592000,))
239 | else:
240 | cursor = self.storage.conn.execute('''
241 | SELECT COUNT(*) FROM memories
242 | WHERE created_at > ?
243 | ''', (now - seconds,))
244 |
245 | count = cursor.fetchone()[0]
246 | percentage = (count / total) * 100
247 | print(f" {label:15}: {count:4} ({percentage:5.1f}%)")
248 |
249 | else:
250 | print(" ⚠️ Skipped (not SQLite backend)")
251 |
252 |
253 | async def main():
254 | """Main validation function."""
255 | print("="*60)
256 | print("⏰ TIMESTAMP INTEGRITY VALIDATION")
257 | print("="*60)
258 | print()
259 |
260 | try:
261 | # Get database path from config
262 | storage_backend = os.getenv('MCP_MEMORY_STORAGE_BACKEND', 'sqlite_vec')
263 | db_path = config_module.MEMORY_SQLITE_DB_PATH
264 |
265 | print(f"Backend: {storage_backend}")
266 | print(f"Database: {db_path}")
267 | print()
268 |
269 | # Initialize storage
270 | storage = SqliteVecMemoryStorage(
271 | db_path=db_path,
272 | embedding_model="all-MiniLM-L6-v2"
273 | )
274 | await storage.initialize()
275 |
276 | # Run validation
277 | validator = TimestampIntegrityValidator(storage)
278 | is_healthy, warnings, errors = await validator.validate_all()
279 |
280 | # Close storage
281 | if hasattr(storage, 'close'):
282 | storage.close()
283 |
284 | # Exit with appropriate code
285 | if errors:
286 | print("\n❌ Validation FAILED with errors")
287 | sys.exit(1)
288 | elif warnings:
289 | print("\n⚠️ Validation completed with warnings")
290 | sys.exit(0)
291 | else:
292 | print("\n✅ Validation PASSED - Timestamps are healthy!")
293 | sys.exit(0)
294 |
295 | except Exception as e:
296 | print(f"\n❌ Validation failed with exception: {e}")
297 | import traceback
298 | traceback.print_exc()
299 | sys.exit(1)
300 |
301 |
302 | if __name__ == "__main__":
303 | asyncio.run(main())
304 |
```
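The first check above reduces to a single SQL predicate, which makes it handy as a standalone spot check without initializing the full storage layer. A minimal sketch; `DB_PATH` is a placeholder for your sqlite-vec database file:

```python
import sqlite3

DB_PATH = "/path/to/sqlite_vec.db"  # placeholder

conn = sqlite3.connect(DB_PATH)
try:
    # created_at must never be newer than updated_at; any hit indicates the
    # timestamp-reset regression this validator is designed to catch.
    rows = conn.execute(
        "SELECT content_hash, created_at_iso, updated_at_iso "
        "FROM memories WHERE created_at > updated_at"
    ).fetchall()
    for content_hash, created_iso, updated_iso in rows[:5]:
        print(f"❌ {content_hash[:8]}: created={created_iso}, updated={updated_iso}")
    if not rows:
        print("✅ No impossible timestamps found")
finally:
    conn.close()
```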